Assignment 1
Assignment 1
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from warnings import simplefilter
# ignore warnings related to some packages.
simplefilter(action='ignore', category=FutureWarning)
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
df= pd.read_csv("breast-cancer-wisconsin.csv")
Pre-processing dataset
In [3]: # checking all the rows which have missing values "?"
df[df.values=='?']
Out[3]:
Clump Thickness Uniformity of Cell Size Uniformity of Cell Shape Marginal Adhesion Single Epithelial Cell Size Bare Nuclei Bland Chromatin Normal Nucleoli Mitoses class
23 8 4 5 1 2 ? 7 3 1 class2
40 6 6 6 9 6 ? 7 8 1 class1
139 1 1 1 1 1 ? 2 1 1 class1
145 1 1 3 1 2 ? 2 1 1 class1
158 1 1 2 1 3 ? 1 1 1 class1
164 5 1 1 1 2 ? 3 1 1 class1
235 3 1 4 1 2 ? 3 1 1 class1
249 3 1 1 1 2 ? 3 1 1 class1
275 3 1 3 1 2 ? 2 1 1 class1
292 8 8 8 1 2 ? 6 10 1 class2
294 1 1 1 1 2 ? 2 1 1 class1
297 5 4 3 1 2 ? 2 3 1 class1
315 4 6 5 6 7 ? 4 9 1 class1
321 3 1 1 1 2 ? 3 1 1 class1
411 1 1 1 1 1 ? 2 1 1 class1
617 1 1 1 1 1 ? 1 1 1 class1
df=df.replace(['?'],np.nan)
In [5]: # checking if ? have been correctly changed to NaNs in DataFrame
df[df.isna().any(axis=1)]
Out[5]:
Clump Thickness Uniformity of Cell Size Uniformity of Cell Shape Marginal Adhesion Single Epithelial Cell Size Bare Nuclei Bland Chromatin Normal Nucleoli Mitoses class
23 8 4 5 1 2 NaN 7 3 1 class2
40 6 6 6 9 6 NaN 7 8 1 class1
In [6]: # changing class1 and class2 to 0 and 1 respectively in the last column of dataframe
In [7]: # Splitting dataset into X(all columns except class) and Y(class column).
X= df.drop(LastColumn, axis=1)
y= df[LastColumn]
imputer= imputer.fit(X)
Xfilled= imputer.transform(X)
In [9]: # Normalisation
scaler= MinMaxScaler()
XNormalised= scaler.fit_transform(Xfilled)
print(XNormalised)
In [10]: # Defining a function to Print the first ten rows of pre-processed dataset to 4 decimal places:
for i in range(n_rows):
if i == len(X)-1:
print(y[i],end="")
else:
print(y[i])
In [11]: #printing the first ten rows of pre-processed dataset to 4 decimal places using the above function:
print_data(XNormalised,y,10)
0.4444,0.0000,0.0000,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.4444,0.3333,0.3333,0.4444,0.6667,1.0000,0.2222,0.1111,0.0000,0
0.2222,0.0000,0.0000,0.0000,0.1111,0.1111,0.2222,0.0000,0.0000,0
0.5556,0.7778,0.7778,0.0000,0.2222,0.3333,0.2222,0.6667,0.0000,0
0.3333,0.0000,0.0000,0.2222,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.7778,1.0000,1.0000,0.7778,0.6667,1.0000,0.8889,0.6667,0.0000,1
0.0000,0.0000,0.0000,0.0000,0.1111,1.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.1111,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.0000,0.0000,0.1111,0.0000,0.0000,0.0000,0.4444,0
0.3333,0.1111,0.0000,0.0000,0.1111,0.0000,0.1111,0.0000,0.0000,0
# Since XNormalised being passed to the train_test_split function here is already normalised.
# Therefore X_train and y_train are automatically normalised.
# Thus no need to separately normalise the training data sets further.
print(KNNClassifier(XNormalised, y))
0.9671
In [ ]:
In [17]: # creating and applying Logistic regression Classifier to dataset
logreg= LogisticRegression()
return round(scores.mean(),4)
In [19]: # Running the above function for Logistic regression cross-validation score
logregClassifier(XNormalised, y)
Out[19]: 0.9642
In [ ]:
return round(scores.mean(),4)
In [22]: # Running the above function for Naive Bayes cross-validation score
nbClassifier(XNormalised, y)
Out[22]: 0.9585
In [ ]:
return round(scores.mean(),4)
In [25]: # Running the above function for Decision Tree cross-validation score
dtClassifier(XNormalised, y)
Out[25]: 0.9385
In [ ]:
In [26]: # creating and applying Bagging Classifier to dataset
bagC= BaggingClassifier(
DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=0), n_estimators=60,
max_samples=100, bootstrap=True, random_state=0) #load an instance of the classifier.
bagC.fit(X_train, y_train) #creating a model
bagC = BaggingClassifier(
DecisionTreeClassifier(criterion='entropy', max_depth=max_depth, random_state=0), n_estimators=n_estimators,
max_samples=max_samples, bootstrap=True, random_state=0) #load an instance of the classifier.
return round(scores.mean(),4)
Out[28]: 0.9571
In [ ]:
adaB = AdaBoostClassifier(
DecisionTreeClassifier(criterion='entropy', max_depth=6), n_estimators=60, learning_rate=
0.5, random_state=0)
adaB.fit(X_train, y_train) #creating a model
adaB = AdaBoostClassifier(
DecisionTreeClassifier(criterion='entropy', max_depth=max_depth), n_estimators=n_estimators, learning_rate=
learning_rate, random_state=0)
return round(scores.mean(),4)
Out[31]: 0.9542
In [ ]:
return round(scores.mean(),4)
0.9571
In [ ]:
Part 1 Results
In [35]: # Parameters for Part 1:
#Bagging
bag_n_estimators = 60
bag_max_samples = 100
bag_max_depth = 6
#AdaBoost
ada_n_estimators = 60
ada_learning_rate = 0.5
ada_bag_max_depth = 6
#GB
gb_n_estimators = 60
gb_learning_rate = 0.5
grid.fit(X_train, y_train)
BestParams=grid.best_params_
if S==1:
return print(BestParams['n_neighbors']) # for KNN best k
elif S==2:
return print(BestParams['p']) # for KNN best p
elif S==3:
return print("{:.4f}".format(grid.best_score_)) # for KNN cross-validation accuracy
elif S==4:
return print("{:.4f}".format(grid.score(X_test, y_test))) # for KNN Test set accuracy
else:
return (print("please input S=1-4 and try again"))
0.9695
0.9543
In [ ]:
SVM.fit(X_train, y_train)
BestParams=SVM.best_params_
if S==1:
return print(BestParams['C']) # SVM best C
elif S==2:
return print(BestParams['gamma']) # SVM best Gamma
elif S==3:
return print("{:.4f}".format(SVM.best_score_)) # SVM cross-validation accuracy
elif S==4:
return print("{:.4f}".format(SVM.score(X_test, y_test))) # SVM test set accuracy
else:
return (print("please input S=1-4 and try again"))
In [43]: # Finding SVM best C using function
bestSVMClassifier(XNormalised, y,1)
0.1
0.9676
0.9714
In [ ]:
RF.fit(X_train, y_train)
PredictionRF = RF.predict(X_test)
RF.fit(X_train,y_train)
BestParams=RF.best_params_
actual = y_test
predicted = RF.predict(X_test)
if S==1:
return print(BestParams['n_estimators']) #RF best n_estimators
elif S==2:
return print(BestParams['max_leaf_nodes']) #RF best max_leaf_nodes
elif S==3:
return print("{:.4f}".format(RF.best_score_)) #RF cross-validation accuracy
elif S==4:
return print("{:.4f}".format(RF.score(X_test, y_test))) #RF Test set accuracy
elif S==5:
return print("{:.4f}".format(f1_score(actual, predicted, average='macro'))) #RF test set macro average F1
elif S==6:
return print("{:.4f}".format(f1_score(actual, predicted, average='weighted'))) #RF test set weighted average F1
else:
return (print("please input S=1-4 and try again"))
150
6
In [51]: # finding RF cross-validation accuracy using fucntion
bestRFClassifier(XNormalised, y,3)
0.9675
0.9657
0.9628
0.9661
Part 2: Results
In [55]:
# printing the results using fucntions defined above
print()
KNN best k: 3
KNN best p: 1
KNN cross-validation accuracy: 0.9695
KNN test set accuracy: 0.9543
SVM best C: 5
SVM best gamma: 0.1
SVM cross-validation accuracy: 0.9676
SVM test set accuracy: 0.9714
In [ ]: