9/16/2021 Week2
In [1]: import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
sns.set(style='white')
df= pd.read_csv("D:\health care diabetes.csv")
feature_cols=[col for col in df.columns if col != 'Outcome']
# Finding skewness in the features
from scipy.stats import skew
negative_skew=[]
positive_skew=[]
for feature in feature_cols:
print("Skewness of {0} is {1}". format(feature,skew(df[feature])), end='\n')
if skew(df[feature]) <0:
negative_skew.append(feature)
else:
positive_skew.append(feature)
print(end="\n")
print("Negatively skewed Features are {}".format(negative_skew), end='\n')
print("Positively skewed Features are {}".format(positive_skew), end='\n')
print("Negatively skewed feature")
# Percantage of missing values in features
for col in feature_cols:
df[col].replace(0,np.nan,inplace=True)
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns, 'percent_missing':percen
missing_value_df.sort_values(by=['percent_missing'],inplace=True, ascending=False)
missing_value_df.set_index(keys=['column_name'],drop=True)
# Missing value imputaion using mean
for col in feature_cols:
df[col].fillna(int(df[col].mean()),inplace=True)
for col in feature_cols:
if col not in ['BMI','DiabetesPedigreeFunction']:
df[col]=df[col].apply(lambda x:int(x))
#Check the balance of the data by plotting the count of outcomes by their value. Des
plt.figure(figsize=(8,6))
sns.countplot(df['Outcome'])
plt.title("count of outcomes", fontsize=15,loc='center', color='Black')
plt.xlabel("Outcome")
plt.ylabel("Value count")
plt.show()
#Create scatter charts between the pair ofvariables to understand the relationships.
plt.figure(figsize=(15,5))
sns.scatterplot(x='Pregnancies',y='Glucose',data=df,hue='Outcome',palette="Set1")
plt.xlabel('Pregnancies', fontsize=13)
plt.ylabel('Glucose', fontsize=13)
plt.title('grouped scatter plot - Pregnancies vs Glucose',fontsize=16)
plt.legend()
plt.show()
plt.figure(figsize=(15,5))
sns.scatterplot(x='Pregnancies',y='Outcome',data=df,palette="Set1")
plt.xlabel('Pregnancies', fontsize=13)
localhost:8888/nbconvert/html/Week2 .ipynb?download=false 1/7
9/16/2021 Week2
plt.ylabel('Outcome', fontsize=13)
plt.title('grouped scatter plot - Pregnancies vs Outcome',fontsize=16)
plt.show()
sns.pairplot(df, hue='Outcome')
plt.title('grouped scatter plot - All Variables',fontsize=16)
plt.legend()
plt.show()
for i in range(len(feature_cols)):
sns.FacetGrid(df,hue="Outcome",aspect=3,margin_titles=True).map(sns.kdeplot,feature
corr=df.corr()
corr
plt.figure(figsize=(15, 10))
sns.heatmap(corr, annot=True,cmap='RdYlGn', linewidths=0.30)
plt.title("Healthcare Dataset Heatmap")
plt.show()
# Correlation Matrix Heatmap Visualization (should run this code again after removin
sns.set(style="white")
# Generate a mask for the upper triangle
mask = np.zeros_like(df.corr(), dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure to control size of heatmap
fig, ax = plt.subplots(figsize=(8,8))
# Create a custom color palette
cmap = sns.diverging_palette(255, 10, as_cmap=True)
# as_cmap returns a matplotlib colormap object rather than a list of colors
# Red=10, Green=128, Blue=255
# Plot the heatmap
sns.heatmap(df.corr(), mask=mask, annot=True, square=True, cmap=cmap , vmin=-1, vmax
# cannot display corr label
# Prevent Heatmap Cut-Off Issue
bottom, top = ax.get_ylim()
ax.set_ylim(bottom+0.5, top-0.5)
Skewness of Pregnancies is 0.8999119408414357
Skewness of Glucose is 0.17341395519987735
Skewness of BloodPressure is -1.8400052311728738
Skewness of SkinThickness is 0.109158762323673
Skewness of Insulin is 2.2678104585131753
Skewness of BMI is -0.42814327880861786
Skewness of DiabetesPedigreeFunction is 1.9161592037386292
Skewness of Age is 1.127389259531697
Negatively skewed Features are ['BloodPressure', 'BMI']
Positively skewed Features are ['Pregnancies', 'Glucose', 'SkinThickness', 'Insuli
n', 'DiabetesPedigreeFunction', 'Age']
Negatively skewed feature
Project Task: Week 3
Data Modeling:
1. Devise strategies for model building. It is important to decide the right
validation framework. Express your thought process.
2. Apply an appropriate classification algorithm to build a model. Compare
various models with the results from KNN algorithm.
Strategies:-
1. Data modelling is for the prediction of a binary Outcome. Value can be either 0 or 1.
localhost:8888/nbconvert/html/Week2 .ipynb?download=false 2/7
9/16/2021 Week2
2. A supervised ML classification algorithm can be used
3. Logistic regression needs to checked, since it is good for binary classification
4. Tree based algorithms, also needs to be tried out, since the dataset has outliers.
5. Data Scaling must be done, before modeling
6. Holdout method or KFold CV can be used
In [2]: # import the ML algorithm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from statsmodels.tools.eval_measures import rmse
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# pre-processing
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# import libraries for model validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
# import libraries for metrics and reporting
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
x=df[feature_cols]
y=df['Outcome']
In [3]: # Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_st
In [4]: #Feature Scaling
sc = StandardScaler()
x_train_scaled = sc.fit_transform(x_train)
x_test_scaled = sc.transform(x_test)
Apply an appropriate classification algorithm to build a model. Compare
various models with the results from KNN algorithm.
In [5]: #Using Logistic Regression Algorithm to the Training Set
classifier_logreg = LogisticRegression(random_state= 0)
classifier_logreg.fit(x_train_scaled, y_train)
y_pred_logreg=classifier_logreg.predict(x_test_scaled)
print('Accuracy of Logistic regression: {}'.format(accuracy_score(y_test,y_pred_logr
Accuracy of Logistic regression: 0.7705627705627706
In [6]: #Using KNeighborsClassifier Method of neighbors class to use Nearest Neighbor algori
from sklearn.neighbors import KNeighborsClassifier
classifier_knn = KNeighborsClassifier(n_neighbors =13)
localhost:8888/nbconvert/html/Week2 .ipynb?download=false 3/7
9/16/2021 Week2
classifier_knn.fit(x_train_scaled, y_train)
y_pred_knn=classifier_knn.predict(x_test_scaled)
print('Accuracy of KNN : {}'.format(accuracy_score(y_test,y_pred_knn)))
Accuracy of KNN : 0.7792207792207793
In [7]: #Using SVC method of svm class to use Support Vector Machine Algorithm
from sklearn.svm import SVC
classifier_svc = SVC(kernel = 'linear', random_state= 0)
classifier_svc.fit(x_train_scaled, y_train)
y_pred_svc=classifier_svc.predict(x_test_scaled)
print('Accuracy of SVM-linear: {}'.format(accuracy_score(y_test,y_pred_svc)))
Accuracy of SVM-linear: 0.7662337662337663
In [8]: #Using SVC-Kernel method of svm class to use Support Vector Machine Algorithm
from sklearn.svm import SVC
classifier_svc_rbf = SVC(kernel = 'rbf', random_state = 0,C=1)
classifier_svc_rbf.fit(x_train_scaled, y_train)
y_pred_svc_rbf=classifier_svc_rbf.predict(x_test_scaled)
print('Accuracy of SVM-RBF: {}'.format(accuracy_score(y_test,y_pred_svc_rbf)))
Accuracy of SVM-RBF: 0.7445887445887446
In [9]: #Using Naive Bayes to use Support Vector Machine Algorithm
# GaussianNB
classifier_nb = GaussianNB()
classifier_nb.fit(x_train_scaled, y_train)
y_pred_nb=classifier_nb.predict(x_test_scaled)
print('Accuracy of Naive Bayes-Gaussian: {}'.format(accuracy_score(y_test,y_pred_nb)
Accuracy of Naive Bayes-Gaussian: 0.7532467532467533
In [10]: #Using DecisionTreeClassifier of tree class to use Decision Tree Algorithm
from sklearn.tree import DecisionTreeClassifier
classifier_dec = DecisionTreeClassifier(criterion ='entropy', random_state = 0)
classifier_dec.fit(x_train, y_train)
y_pred_dec=classifier_dec.predict(x_test)
print('Accuracy of Decision Tree Classifier: {}'.format(accuracy_score(y_test,y_pred
Accuracy of Decision Tree Classifier: 0.7445887445887446
In [11]: #Using RandomForestClassifier method of ensemble class to use Random Forest Classifi
from sklearn.ensemble import RandomForestClassifier
classifier_rnd = RandomForestClassifier(n_estimators= 9, criterion = 'entropy', rand
classifier_rnd.fit(x_train_scaled, y_train)
y_pred_rnd=classifier_rnd.predict(x_test_scaled)
print('Accuracy of Random Forest Classifier: {}'.format(accuracy_score(y_test,y_pred
Accuracy of Random Forest Classifier: 0.7878787878787878
Random Forest Model is the best approach
In [12]: feature_imp=sorted(list(zip(feature_cols,classifier_rnd.feature_importances_)),key=l
feature_imp[:10] # Top 10 important features
Out[12]: [('Glucose', 0.2228201379776521),
('Age', 0.1517873256817099),
('BMI', 0.14719738003933125),
('DiabetesPedigreeFunction', 0.13511585970768406),
('SkinThickness', 0.09879866678114017),
('Insulin', 0.09843081839304385),
('Pregnancies', 0.08010429797887891),
('BloodPressure', 0.06574551344055982)]
In [13]: # Plot the top features based on its importance
localhost:8888/nbconvert/html/Week2 .ipynb?download=false 4/7
9/16/2021 Week2
pd.Series(classifier_rnd.feature_importances_,index=x.columns).nlargest(10).plot(kin
plt.title('Top Features derived by Random Forest', size=20)
plt.show()
In [14]: # Feature Selection using RFE
from sklearn.feature_selection import RFE
rfe=RFE(estimator=classifier_logreg,n_features_to_select=4)
rfe.fit(x,y)
for i in sorted(list(zip(feature_cols,rfe.ranking_)), key=lambda x:x[1],reverse=True
print(i)
('Insulin', 5)
('SkinThickness', 4)
('BloodPressure', 3)
('Age', 2)
('Pregnancies', 1)
('Glucose', 1)
('BMI', 1)
('DiabetesPedigreeFunction', 1)
In [15]: pd.Series(rfe.ranking_,index=x.columns).nlargest(10).plot(kind='barh',color='Pink').
plt.title('Top Features derived by RFE', size=20)
plt.show()
In [16]: # Using the selected fetures to predict outcome
features_selected =['Insulin','SkinThickness','Age','BloodPressure','Pregnancies']
x_train_new= x_train[features_selected]
x_test_new=x_test[features_selected]
x_train_scaled_new =sc.fit_transform(x_train_new)
x_test_scaled_new=sc.fit_transform(x_test_new)
localhost:8888/nbconvert/html/Week2 .ipynb?download=false 5/7
9/16/2021 Week2
classifier_rnd_new = RandomForestClassifier(n_estimators = 9, criterion = 'entropy',
classifier_rnd_new.fit(x_train_scaled_new, y_train)
y_pred_rnd_new=classifier_rnd_new.predict(x_test_scaled_new)
print('Accuracy of Random Forest Classifier with Selected Features: {}'.format(accur
Accuracy of Random Forest Classifier with Selected Features: 0.6536796536796536
In [17]: # create a model building fn for easy comparison
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
def baseline_model(model,x_train_scaled,y_train,x_test_scaled,y_test,name):
model.fit(x_train_scaled,y_train)
accuracy = np.mean(cross_val_score(model, x_train_scaled, y_train, cv=kf, scorin
precision = np.mean(cross_val_score(model, x_train_scaled, y_train, cv=kf, scori
recall = np.mean(cross_val_score(model, x_train_scaled, y_train, cv=kf, scoring=
f1score = np.mean(cross_val_score(model, x_train_scaled, y_train, cv=kf, scoring
rocauc = np.mean(cross_val_score(model, x_train_scaled, y_train, cv=kf, scoring=
y_pred = model.predict(x_test_scaled)
df_models = pd.DataFrame({'model' : [name], 'accuracy' : [accuracy], 'precision'
# metrics to be used for comparison later
return df_models
In [18]: df_models=pd.concat([baseline_model(classifier_logreg,x_train_scaled,y_train,x_test_
baseline_model(classifier_knn,x_train_scaled,y_train,x_test_scaled,y_test,'KNN Clas
baseline_model(classifier_svc,x_train_scaled,y_train,x_test_scaled,y_test,'Linear S
baseline_model(classifier_svc_rbf,x_train_scaled,y_train,x_test_scaled,y_test,'SVC-
baseline_model(classifier_nb,x_train_scaled,y_train,x_test_scaled,y_test,'Gaussian
baseline_model(classifier_dec,x_train_scaled,y_train,x_test_scaled,y_test,'Decision
baseline_model(classifier_rnd,x_train_scaled,y_train,x_test_scaled,y_test,'Random F
df_models = df_models.drop('index', axis=1)
df_models
Out[18]: model accuracy precision recall f1score rocauc
0 Logistic Regression 0.767065 0.721360 0.592173 0.647155 0.835375
1 KNN Classifer 0.748529 0.675292 0.587449 0.627657 0.809642
2 Linear SVC 0.763361 0.729954 0.566532 0.634171 0.833505
3 SVC-RBF 0.737349 0.665880 0.556140 0.603961 0.824643
4 Gaussian NB 0.750242 0.668436 0.628475 0.646598 0.810263
5 Decision Tree 0.675891 0.559716 0.541430 0.547777 0.646589
6 Random Forest 0.741035 0.672384 0.561538 0.610556 0.791475
In [19]: ## plot the performance metric scores
fig, ax = plt.subplots(5, 1, figsize=(18, 20))
ax[0].bar(df_models.model, df_models.accuracy)
ax[0].set_title('Accuracy-Cross val score')
ax[1].bar(df_models.model, df_models.precision)
ax[1].set_title('Precision-cross val score')
ax[2].bar(df_models.model, df_models.recall)
ax[2].set_title('Recall-cross val score')
ax[3].bar(df_models.model, df_models.f1score)
ax[3].set_title('F1 score -Cross val score')
ax[4].bar(df_models.model, df_models.rocauc)
ax[4].set_title('ROCAUC -Cross val score')
localhost:8888/nbconvert/html/Week2 .ipynb?download=false 6/7
9/16/2021 Week2
# Fine-tune figure; make subplots farther from each other, or nearer to each other.
fig.subplots_adjust(hspace=0.5, wspace=0.5)
In [ ]:
localhost:8888/nbconvert/html/Week2 .ipynb?download=false 7/7