zswlboufr

December 3, 2024

1 BINARY CLASSIFICATION
[62]: # Importing Libraries

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit

from sklearn.linear_model import LogisticRegression


from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from xgboost import XGBClassifier

from sklearn.metrics import mean_absolute_error, accuracy_score, classification_report, confusion_matrix
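Note: the keras.wrappers.scikit_learn wrapper imported above ships only with older Keras releases and has since been removed. If running this notebook on a newer TensorFlow/Keras stack (an assumption about your environment, not part of the original run), the scikeras package provides a near drop-in replacement:

# pip install scikeras
# from scikeras.wrappers import KerasClassifier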

[2]: # Setting the numpy random seed

np.random.seed(37)

[157]: # Loading Dataset

df = pd.read_csv('attrition_data.csv')

print('Dataframe shape: ', df.shape)

Dataframe shape: (9612, 27)

1.1 DATA CLEANUP


[158]: df.head()

[158]: EMP_ID ANNUAL_RATE HRLY_RATE JOBCODE ETHNICITY SEX MARITAL_STATUS \


0 3285941608 33615 22 71850 BLACK F Divorced
1 3687079832 70675 40 59806 ASIAN M Single
2 7209970080 34320 23 60311 WHITE F Single
3 9084013977 103199 59 16233 ASIAN F Single
4 4566148978 141801 71 64415 ASIAN F Single

JOB_SATISFACTION AGE NUMBER_OF_TEAM_CHANGED … DISABLED_EMP \


0 4 35 3+ … N
1 3 18 3+ … N
2 5 18 2 … N
3 2 50 0 … N
4 4 34 3 … N

DISABLED_VET EDUCATION_LEVEL STATUS JOB_GROUP \


0 N LEVEL 2 T Plant & Facilities Maintenance
1 N LEVEL 1 A Customer Care
2 N LEVEL 1 A Customer Care
3 N LEVEL 1 T Finance
4 N LEVEL 1 A Marketing - Direct

PREVYR_1 PREVYR_2 PREVYR_3 PREVYR_4 PREVYR_5


0 0 0 0 0 0
1 3 3 3 2 3
2 3 3 3 2 3
3 0 0 0 0 0
4 2 2 2 2 2

[5 rows x 27 columns]

[159]: # Checking for missing values

df.isnull().sum()

[159]: EMP_ID 0
ANNUAL_RATE 0
HRLY_RATE 0
JOBCODE 0
ETHNICITY 0

SEX 0
MARITAL_STATUS 0
JOB_SATISFACTION 0
AGE 0
NUMBER_OF_TEAM_CHANGED 0
REFERRAL_SOURCE 445
HIRE_MONTH 0
REHIRE 0
TERMINATION_YEAR 5394
IS_FIRST_JOB 0
TRAVELLED_REQUIRED 0
PERFORMANCE_RATING 0
DISABLED_EMP 0
DISABLED_VET 0
EDUCATION_LEVEL 0
STATUS 0
JOB_GROUP 0
PREVYR_1 0
PREVYR_2 0
PREVYR_3 0
PREVYR_4 0
PREVYR_5 0
dtype: int64

[160]: # Dropping irrelevant columns: the employee ID, the job code, TERMINATION_YEAR
# (present only for terminated employees, so it would leak the target),
# and the five PREVYR_* columns

df.drop(['EMP_ID', 'JOBCODE', 'TERMINATION_YEAR'], axis=1, inplace=True)
df.drop(df.columns[-5:], axis=1, inplace=True)

[161]: # Imputing missing REFERRAL_SOURCE values with the mode (most frequent category)

df['REFERRAL_SOURCE'].fillna(df['REFERRAL_SOURCE'].mode()[0], inplace=True)

[162]: df.head()

[162]: ANNUAL_RATE HRLY_RATE ETHNICITY SEX MARITAL_STATUS JOB_SATISFACTION AGE \


0 33615 22 BLACK F Divorced 4 35
1 70675 40 ASIAN M Single 3 18
2 34320 23 WHITE F Single 5 18
3 103199 59 ASIAN F Single 2 50
4 141801 71 ASIAN F Single 4 34

NUMBER_OF_TEAM_CHANGED REFERRAL_SOURCE HIRE_MONTH REHIRE \


0 3+ Client Referral June False
1 3+ Executive Referral June False
2 2 Former Employee/Intern February False
3 0 Direct Sourcing October True
4 3 Inroads May False

IS_FIRST_JOB TRAVELLED_REQUIRED PERFORMANCE_RATING DISABLED_EMP \
0 N N 4 N
1 Y N 3 N
2 Y N 3 N
3 N Y 2 N
4 N N 4 N

DISABLED_VET EDUCATION_LEVEL STATUS JOB_GROUP


0 N LEVEL 2 T Plant & Facilities Maintenance
1 N LEVEL 1 A Customer Care
2 N LEVEL 1 A Customer Care
3 N LEVEL 1 T Finance
4 N LEVEL 1 A Marketing - Direct

1.2 VISUALIZATIONS
[211]: sns.set(style="darkgrid")
ax = sns.countplot(x="STATUS", data=df, palette=sns.xkcd_palette(["azure", "light red"]))

plt.xlabel('Status')
plt.ylabel('Count')
plt.savefig('./plots/status_count.png')
plt.show()

[166]: fig = plt.figure(figsize=(8,4))
for x in ['T','A']:
    df['AGE'][df['STATUS']==x].plot(kind='kde')

plt.title('Status vs. Age Density Distribution')


plt.legend(('T','A'))
plt.xlabel('Age')
plt.savefig('./plots/status_age_distribution.png')
plt.show()

[209]: sns.countplot(x='PERFORMANCE_RATING', data=df, hue='STATUS', palette=sns.xkcd_palette(["azure", "light red"]))

plt.title("Performance Rating Count Plot")


plt.xlabel('Performance Rating')
plt.ylabel('Count')
plt.savefig('./plots/performance_count.png')
plt.show()

[208]: sns.countplot(x='JOB_SATISFACTION', data=df, hue='STATUS', palette=sns.xkcd_palette(["aqua", "periwinkle"]))

plt.title("Job Satisfaction Count Plot")


plt.xlabel('Job Satisfaction')
plt.ylabel('Count')
plt.savefig('./plots/satisfaction_count.png')
plt.show()

[207]: sns.boxplot(x='JOB_SATISFACTION', data=df, hue='STATUS', y='AGE', palette=sns.xkcd_palette(["pastel purple", "pastel yellow"]))

plt.title("Job Satisfaction and Age Boxplot")


plt.xlabel('Job Satisfaction')
plt.ylabel('Age')
plt.savefig('./plots/age_satisfaction_box.png')
plt.show()

1.3 FEATURE ENGINEERING
[11]: # Label Encoding categorical features

le = LabelEncoder()
df['NUMBER_OF_TEAM_CHANGED'] = le.fit_transform(df['NUMBER_OF_TEAM_CHANGED'])
df['REHIRE'] = le.fit_transform(df['REHIRE'])
df['IS_FIRST_JOB'] = le.fit_transform(df['IS_FIRST_JOB'])
df['TRAVELLED_REQUIRED'] = le.fit_transform(df['TRAVELLED_REQUIRED'])
df['DISABLED_EMP'] = le.fit_transform(df['DISABLED_EMP'])
df['DISABLED_VET'] = le.fit_transform(df['DISABLED_VET'])
df['EDUCATION_LEVEL'] = le.fit_transform(df['EDUCATION_LEVEL'])
df['STATUS'] = le.fit_transform(df['STATUS'])
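The eight lines above can be written more compactly; this loop is an equivalent sketch of the same encoding (a fresh fit_transform per column, exactly as above):

for col in ['NUMBER_OF_TEAM_CHANGED', 'REHIRE', 'IS_FIRST_JOB', 'TRAVELLED_REQUIRED',
            'DISABLED_EMP', 'DISABLED_VET', 'EDUCATION_LEVEL', 'STATUS']:
    df[col] = LabelEncoder().fit_transform(df[col])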

[12]: # Correlation Heatmap

fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(df.corr(), annot = True, ax=ax)
plt.savefig('./plots/correlation_heatmap.png')

We see that HRLY_RATE and ANNUAL_RATE are perfectly correlated (correlation of 1), so we keep ANNUAL_RATE and discard HRLY_RATE.
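The claim can be checked directly before dropping the column (an illustrative one-liner, not part of the original run):

print(df['HRLY_RATE'].corr(df['ANNUAL_RATE']))  # expected: ~1.0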
[13]: df.drop(['HRLY_RATE'], axis=1, inplace=True)

[14]: # One-Hot Encoding categorical features

df['HIRE_MONTH'] = df['HIRE_MONTH'].astype('category')
df['JOB_GROUP'] = df['JOB_GROUP'].astype('category')
df['REFERRAL_SOURCE'] = df['REFERRAL_SOURCE'].astype('category')
df['ETHNICITY'] = df['ETHNICITY'].astype('category')
df['SEX'] = df['SEX'].astype('category')
df['MARITAL_STATUS'] = df['MARITAL_STATUS'].astype('category')
df = pd.get_dummies(df, columns=['HIRE_MONTH', 'JOB_GROUP', 'REFERRAL_SOURCE', 'SEX', 'MARITAL_STATUS', 'ETHNICITY'])

[15]: # X = features & y = Target class

X = df.drop(['STATUS'], axis=1)
y = df['STATUS']

[16]: # Standardizing all the features (zero mean, unit variance)

scaler = StandardScaler()

X = scaler.fit_transform(X)
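One caveat (an aside, not a change to the run above): fitting the scaler on the full dataset before splitting lets test-set statistics influence the scaling. A leakage-free variant would fit on the training split only:

# Sketch of a leakage-free alternative, using X_train/X_test from the next cell:
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)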

[17]: # Splitting the dataset into training and testing sets with a 70-30 ratio

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
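Since the classes are imbalanced (roughly 56% active to 44% terminated, as the confusion matrices below show), passing stratify=y would keep that ratio identical in both halves; an optional variant, not what was run here:

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
#                                                     random_state=42, stratify=y)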

[18]: # Cross-validation splitter: 10 stratified 70/30 shuffle splits

cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=15)
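Despite the "folds" wording in the GridSearchCV logs below, StratifiedShuffleSplit draws 10 independent 70/30 splits, each preserving the class ratio, rather than 10 disjoint folds. A quick illustrative check of the preserved balance (not part of the original run):

for train_idx, val_idx in cv.split(X_train, y_train):
    print('class-1 fraction in validation split:', round(y_train.iloc[val_idx].mean(), 3))
    break  # inspect just the first split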

1.4 MODELLING
1.4.1 Logistic Regression

[19]: # Building our model with cross-validation and GridSearch to find the best parameters

# Defining all the parameters


params = {
    'penalty': ['l1', 'l2'],
    'C': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

# Building model
logreg = LogisticRegression(solver='liblinear')

# Parameter estimating using GridSearch


grid = GridSearchCV(logreg, param_grid=params, scoring='accuracy', n_jobs=-1, cv=cv, verbose=1)

# Fitting the model


grid.fit(X_train, y_train)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 6.4s
[Parallel(n_jobs=-1)]: Done 192 tasks | elapsed: 29.2s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 1.2min finished

[19]: GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=15,


test_size=0.3,
train_size=None),

error_score=nan,
estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
fit_intercept=True,
intercept_scaling=1, l1_ratio=None,
max_iter=100, multi_class='auto',
n_jobs=None, penalty='l2',
random_state=None, solver='liblinear',
tol=0.0001, verbose=0,
warm_start=False),
iid='deprecated', n_jobs=-1,
param_grid={'C': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10],
'penalty': ['l1', 'l2']},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=1)

[20]: print('Best Score:', grid.best_score_)


print('Best Params:', grid.best_params_)
print('Best Estimator:', grid.best_estimator_)

Best Score: 0.6857355126300149


Best Params: {'C': 0.2, 'penalty': 'l1'}
Best Estimator: LogisticRegression(C=0.2, class_weight=None, dual=False,
fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None, penalty='l1',
random_state=None, solver='liblinear', tol=0.0001, verbose=0,
warm_start=False)

[21]: # Using the best parameters from the grid-search and predicting on the test feature dataset (X_test)

logreg_grid = grid.best_estimator_
y_pred = logreg_grid.predict(X_test)
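The search settled on an L1 penalty, which drives uninformative coefficients to exactly zero; counting the survivors shows how much implicit feature selection happened (an illustrative aside, not part of the original run):

print('non-zero coefficients:', (logreg_grid.coef_ != 0).sum(), 'of', logreg_grid.coef_.size)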

[22]: # Confusion matrix

pd.DataFrame(confusion_matrix(y_test, y_pred), columns=["Predicted A", "Predicted T"], index=["Actual A", "Actual T"])

[22]: Predicted A Predicted T


Actual A 1288 337
Actual T 516 743

[23]: # Calculating metrics

logreg_grid_score = accuracy_score(y_test, y_pred)

print('Model Accuracy:', logreg_grid_score)
print('Classification Report:\n', classification_report(y_test, y_pred))

Model Accuracy: 0.7042302357836339


Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.79      0.75      1625
           1       0.69      0.59      0.64      1259

    accuracy                           0.70      2884
   macro avg       0.70      0.69      0.69      2884
weighted avg       0.70      0.70      0.70      2884

1.4.2 K-Nearest Neighbors Classifier (KNN)

[24]: # Building our model with cross-validation and GridSearch to find the best parameters

# Defining all the parameters


params = {
    'n_neighbors': [3, 5, 11, 19],
    'weights': ['uniform', 'distance']
}

# Building model
knn = KNeighborsClassifier()

# Parameter estimating using GridSearch


grid = GridSearchCV(knn, param_grid=params, scoring='accuracy', n_jobs=-1, cv=cv, verbose=1)

# Fitting the model


grid.fit(X_train, y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 31.7s
[Parallel(n_jobs=-1)]: Done 80 out of 80 | elapsed: 1.0min finished

[24]: GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=15,


test_size=0.3,
train_size=None),
error_score=nan,
estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski',

metric_params=None, n_jobs=None,
n_neighbors=5, p=2,
weights='uniform'),
iid='deprecated', n_jobs=-1,
param_grid={'n_neighbors': [3, 5, 11, 19],
'weights': ['uniform', 'distance']},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=1)

[25]: print('Best Score:', grid.best_score_)


print('Best Params:', grid.best_params_)
print('Best Estimator:', grid.best_estimator_)

Best Score: 0.6381376919266962


Best Params: {'n_neighbors': 19, 'weights': 'uniform'}
Best Estimator: KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=19, p=2,
weights='uniform')
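Note that the best n_neighbors (19) sits on the edge of the searched grid, so even larger neighborhoods might score better still. A follow-up search could extend the range (a sketch, not run here):

params = {
    'n_neighbors': range(19, 41, 2),
    'weights': ['uniform', 'distance']
}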

[26]: # Using the best parameters from the grid-search and predicting on the test feature dataset (X_test)

knn_grid= grid.best_estimator_
y_pred = knn_grid.predict(X_test)

[27]: # Confusion matrix

pd.DataFrame(confusion_matrix(y_test, y_pred), columns=["Predicted A", "Predicted T"], index=["Actual A", "Actual T"])

[27]: Predicted A Predicted T


Actual A 1218 407
Actual T 603 656

[28]: # Calculating metrics

knn_grid_score = accuracy_score(y_test, y_pred)


print('Model Accuracy:', knn_grid_score)
print('Classification Report:\n', classification_report(y_test, y_pred))

Model Accuracy: 0.6497919556171984


Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.75      0.71      1625
           1       0.62      0.52      0.57      1259

    accuracy                           0.65      2884
   macro avg       0.64      0.64      0.64      2884
weighted avg       0.65      0.65      0.64      2884

1.4.3 Gaussian Naive Bayes

[29]: # Building our model with cross-validation and GridSearch to find the best parameters

# GaussianNB has essentially no hyperparameters to tune, so the grid is left empty

params = {}

# Building model
gb = GaussianNB()

# Parameter estimating using GridSearch


grid = GridSearchCV(gb, param_grid=params, scoring='accuracy', n_jobs=-1, cv=cv, verbose=1)

# Fitting the model


grid.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 0.2s finished

[29]: GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=15,


test_size=0.3,
train_size=None),
error_score=nan,
estimator=GaussianNB(priors=None, var_smoothing=1e-09),
iid='deprecated', n_jobs=-1, param_grid={},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=1)

[30]: print('Best Score:', grid.best_score_)


print('Best Estimator:', grid.best_estimator_)

Best Score: 0.5548291233283804


Best Estimator: GaussianNB(priors=None, var_smoothing=1e-09)
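Strictly speaking, GaussianNB does expose one knob, var_smoothing; had tuning been wanted, a grid such as this could be searched (a sketch, not run here):

params = {'var_smoothing': np.logspace(-9, -3, 7)}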

[31]: # Using the best parameters from the grid-search and predicting on the test feature dataset (X_test)

gb_grid= grid.best_estimator_

y_pred = gb_grid.predict(X_test)

[32]: # Confusion matrix

pd.DataFrame(confusion_matrix(y_test, y_pred), columns=["Predicted A", "Predicted T"], index=["Actual A", "Actual T"])

[32]: Predicted A Predicted T


Actual A 1221 404
Actual T 581 678

[33]: # Calculating metrics

gb_grid_score = accuracy_score(y_test, y_pred)


print('Model Accuracy:', gb_grid_score)
print('Classification Report:\n', classification_report(y_test, y_pred))

Model Accuracy: 0.6584604715672677


Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.75      0.71      1625
           1       0.63      0.54      0.58      1259

    accuracy                           0.66      2884
   macro avg       0.65      0.64      0.65      2884
weighted avg       0.66      0.66      0.65      2884

1.4.4 Support Vector Machines


[34]: # Building our model with cross-validation and GridSearch to find the best parameters

# Defining all the parameters


params = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.001, 0.01, 0.1, 1]  # note: 0.001 appears twice, so only four distinct gammas are searched
}

# Building model
svc = SVC(kernel='rbf', probability=True)  # 'rbf' is the Gaussian (radial basis function) kernel

# Parameter estimating using GridSearch


grid = GridSearchCV(svc, param_grid=params, scoring='accuracy', n_jobs=-1, cv=cv, verbose=1)

# Fitting the model
grid.fit(X_train, y_train)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 9.1min
[Parallel(n_jobs=-1)]: Done 192 tasks | elapsed: 40.0min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 51.7min finished

[34]: GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=15,


test_size=0.3,
train_size=None),
error_score=nan,
estimator=SVC(C=1.0, break_ties=False, cache_size=200,
class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3,
gamma='scale', kernel='rbf', max_iter=-1,
probability=True, random_state=None, shrinking=True,
tol=0.001, verbose=False),
iid='deprecated', n_jobs=-1,
param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
'gamma': [0.001, 0.001, 0.01, 0.1, 1]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=1)

[35]: print('Best Score:', grid.best_score_)


print('Best Params:', grid.best_params_)
print('Best Estimator:', grid.best_estimator_)

Best Score: 0.6701337295690937


Best Params: {'C': 1, 'gamma': 0.01}
Best Estimator: SVC(C=1, break_ties=False, cache_size=200, class_weight=None,
coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
verbose=False)
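For reference, the RBF kernel being tuned here is K(x, x') = exp(-gamma * ||x - x'||^2): larger gamma values make the decision boundary more local (and more prone to overfitting), while C trades margin width against training errors. The selected C=1, gamma=0.01 corresponds to a fairly smooth, well-regularized boundary.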

[36]: # Using the best parameters from the grid-search and predicting on the test feature dataset (X_test)

svm_grid= grid.best_estimator_
y_pred = svm_grid.predict(X_test)

[37]: # Confusion matrix

pd.DataFrame(confusion_matrix(y_test, y_pred), columns=["Predicted A", "Predicted T"], index=["Actual A", "Actual T"])

[37]: Predicted A Predicted T
Actual A 1309 316
Actual T 609 650

[38]: # Calculating metrics

svm_grid_score = accuracy_score(y_test, y_pred)


print('Model Accuracy:', svm_grid_score)
print('Classification Report:\n', classification_report(y_test, y_pred))

Model Accuracy: 0.6792649098474342


Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.81      0.74      1625
           1       0.67      0.52      0.58      1259

    accuracy                           0.68      2884
   macro avg       0.68      0.66      0.66      2884
weighted avg       0.68      0.68      0.67      2884

1.4.5 Decision Tree Classifier


[39]: # Building our model with cross-validation and GridSearch to find the best parameters

# Defining all the parameters


params = {
    'max_features': [1, 3, 10],
    'min_samples_split': [2, 3, 10],
    'min_samples_leaf': [1, 3, 10],
    'criterion': ["entropy", "gini"]
}

# Building model
dtc = DecisionTreeClassifier()

# Parameter estimating using GridSearch


grid = GridSearchCV(dtc, param_grid=params, scoring='accuracy', n_jobs=-1, cv=cv, verbose=1)

# Fitting the model


grid.fit(X_train, y_train)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.

[Parallel(n_jobs=-1)]: Done 280 tasks | elapsed: 2.3s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 4.1s finished

[39]: GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=15,


test_size=0.3,
train_size=None),
error_score=nan,
estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None,
max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
presort='deprecated',
random_state=None,
splitter='best'),
iid='deprecated', n_jobs=-1,
param_grid={'criterion': ['entropy', 'gini'],
'max_features': [1, 3, 10],
'min_samples_leaf': [1, 3, 10],
'min_samples_split': [2, 3, 10]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=1)

[40]: print('Best Score:', grid.best_score_)


print('Best Params:', grid.best_params_)
print('Best Estimator:', grid.best_estimator_)

Best Score: 0.6505695889053987


Best Params: {'criterion': 'gini', 'max_features': 10, 'min_samples_leaf': 10,
'min_samples_split': 2}
Best Estimator: DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
criterion='gini',
max_depth=None, max_features=10, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=10, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')

[41]: # Using the best parameters from the grid-search and predicting on the test feature dataset (X_test)

dtc_grid= grid.best_estimator_
y_pred = dtc_grid.predict(X_test)

[42]: # Confusion matrix

pd.DataFrame(confusion_matrix(y_test, y_pred), columns=["Predicted A", "Predicted T"], index=["Actual A", "Actual T"])

[42]: Predicted A Predicted T


Actual A 1236 389
Actual T 629 630

[43]: # Calculating metrics

dtc_grid_score = accuracy_score(y_test, y_pred)


print('Model Accuracy:', dtc_grid_score)
print('Classification Report:\n', classification_report(y_test, y_pred))

Model Accuracy: 0.6470180305131762


Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.76      0.71      1625
           1       0.62      0.50      0.55      1259

    accuracy                           0.65      2884
   macro avg       0.64      0.63      0.63      2884
weighted avg       0.64      0.65      0.64      2884

1.4.6 Random Forest Classifier


[44]: # Building our model with cross-validation and GridSearch to find the best parameters

# Defining all the parameters


params = {
    'max_features': [1, 3, 10],
    'min_samples_split': [2, 3, 10],
    'min_samples_leaf': [1, 3, 10],
    'bootstrap': [False],
    'n_estimators': [100, 300],
    'criterion': ["entropy", "gini"]
}

# Building model
rfc = RandomForestClassifier()

# Parameter estimating using GridSearch

grid = GridSearchCV(rfc, param_grid=params, scoring='accuracy', n_jobs=-1, cv=cv, verbose=1)

# Fitting the model


grid.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 27.3s
[Parallel(n_jobs=-1)]: Done 192 tasks | elapsed: 1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks | elapsed: 5.5min
[Parallel(n_jobs=-1)]: Done 792 tasks | elapsed: 9.8min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed: 15.1min finished

[44]: GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=15,


test_size=0.3,
train_size=None),
error_score=nan,
estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
class_weight=None,
criterion='gini', max_depth=None,
max_features='auto',
max_leaf_nodes=None,
max_samples=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=…
n_estimators=100, n_jobs=None,
oob_score=False,
random_state=None, verbose=0,
warm_start=False),
iid='deprecated', n_jobs=-1,
param_grid={'bootstrap': [False], 'criterion': ['entropy', 'gini'],
'max_features': [1, 3, 10],
'min_samples_leaf': [1, 3, 10],
'min_samples_split': [2, 3, 10],
'n_estimators': [100, 300]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=1)

[45]: print('Best Score:', grid.best_score_)


print('Best Params:', grid.best_params_)
print('Best Estimator:', grid.best_estimator_)

Best Score: 0.7094105993065873


Best Params: {'bootstrap': False, 'criterion': 'gini', 'max_features': 10,

'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 300}
Best Estimator: RandomForestClassifier(bootstrap=False, ccp_alpha=0.0,
class_weight=None,
criterion='gini', max_depth=None, max_features=10,
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=3, min_samples_split=3,
min_weight_fraction_leaf=0.0, n_estimators=300,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)

[46]: # Using the best parameters from the grid-search and predicting on the test feature dataset (X_test)

rfc_grid= grid.best_estimator_
y_pred = rfc_grid.predict(X_test)

[47]: # Confusion matrix

pd.DataFrame(confusion_matrix(y_test, y_pred), columns=["Predicted A", "Predicted T"], index=["Actual A", "Actual T"])

[47]: Predicted A Predicted T


Actual A 1387 238
Actual T 573 686

[48]: # Calculating metrics

rfc_grid_score = accuracy_score(y_test, y_pred)


print('Model Accuracy:', rfc_grid_score)
print('Classification Report:\n', classification_report(y_test, y_pred))

Model Accuracy: 0.7187933425797504


Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.85      0.77      1625
           1       0.74      0.54      0.63      1259

    accuracy                           0.72      2884
   macro avg       0.73      0.70      0.70      2884
weighted avg       0.72      0.72      0.71      2884

1.4.7 Artificial Neural Networks
[53]: # Defining our neural network model

def create_model(optimizer='adam'):
    model = Sequential()
    model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(4, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model
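The network funnels 64 → 32 → 16 → 8 → 4 ReLU units down to a single sigmoid output, which pairs with the binary_crossentropy loss for two-class prediction. Layer shapes and parameter counts can be inspected with (illustrative, not part of the original run):

model = create_model()
model.summary()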

[54]: # Building our model with cross-validation and GridSearch to find the best parameters

# Defining all the parameters


params = {
    'optimizer': ['rmsprop', 'adam'],
    'epochs': [100, 200, 400],
    'batch_size': [5, 10, 20]
}

# Building model
nn = KerasClassifier(build_fn=create_model)

# Parameter estimating using GridSearch


grid = GridSearchCV(nn, param_grid=params, scoring='accuracy', n_jobs=-1, cv=cv, verbose=1)

# Fitting the model


grid.fit(X_train, y_train)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 140.9min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 454.2min finished
Train on 6728 samples
Epoch 1/200
6728/6728 [==============================] - 5s 677us/sample - loss: 0.6487 - accuracy: 0.6141
Epoch 2/200
6728/6728 [==============================] - 3s 492us/sample - loss: 0.5849 - accuracy: 0.6794
Epoch 3/200
6728/6728 [==============================] - 3s 415us/sample - loss: 0.5567 - accuracy: 0.7060
[... per-epoch log truncated: while refitting on the full training split, training loss fell steadily from 0.65 to below 0.02 and training accuracy rose from 0.61 to ~0.996 over the 200 epochs ...]
Epoch 200/200
6728/6728 [==============================] - 3s 400us/sample - loss: 0.0173 - accuracy: 0.9958

[54]: GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=15,


test_size=0.3,
train_size=None),
error_score=nan,
estimator=<keras.wrappers.scikit_learn.KerasClassifier object at
0x1a42442bd0>,
iid='deprecated', n_jobs=-1,
param_grid={'batch_size': [5, 10, 20], 'epochs': [100, 200, 400],
'optimizer': ['rmsprop', 'adam']},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=1)

[55]: print('Best Score:', grid.best_score_)


print('Best Params:', grid.best_params_)
print('Best Estimator:', grid.best_estimator_)

Best Score: 0.6248637939574045


Best Params: {'batch_size': 5, 'epochs': 200, 'optimizer': 'adam'}
Best Estimator: <keras.wrappers.scikit_learn.KerasClassifier object at
0x1126baa50>

[56]: # Using the best parameters from the grid-search and predicting on the test feature dataset (X_test)

nn_grid= grid.best_estimator_

y_pred = nn_grid.predict(X_test)

[57]: # Confusion matrix

pd.DataFrame(confusion_matrix(y_test, y_pred), columns=["Predicted A", "Predicted T"], index=["Actual A", "Actual T"])

[57]: Predicted A Predicted T


Actual A 1012 613
Actual T 462 797

[58]: # Calculating metrics

nn_grid_score = accuracy_score(y_test, y_pred)


print('Model Accuracy:', nn_grid_score)
print('Classification Report:\n', classification_report(y_test, y_pred))

Model Accuracy: 0.6272538141470181


Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.62      0.65      1625
           1       0.57      0.63      0.60      1259

    accuracy                           0.63      2884
   macro avg       0.63      0.63      0.63      2884
weighted avg       0.63      0.63      0.63      2884

1.4.8 Gradient Boosting Machines - XGBoost


[63]: # Building our model with cross-validation and GridSearch to find the best parameters

# Defining all the parameters


params = {
    'max_depth': range(2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

# Building model
xgb = XGBClassifier(objective='binary:logistic')

# Parameter estimating using GridSearch


grid = GridSearchCV(xgb, param_grid=params, scoring='accuracy', n_jobs=-1, cv=cv, verbose=1)

# Fitting the model
grid.fit(X_train, y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 1.0min
[Parallel(n_jobs=-1)]: Done 192 tasks | elapsed: 6.8min
[Parallel(n_jobs=-1)]: Done 442 tasks | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Done 792 tasks | elapsed: 32.3min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed: 40.8min finished

[63]: GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=15,


test_size=0.3,
train_size=None),
error_score=nan,
estimator=XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1, colsample_bynode=1,
colsample_bytree=1, gamma=0,
learning_rate=0.1, max_delta_step=0,
max_depth=3, min_child_weight=1,
missing=None, n_estimators=100, n_jobs=1,
nthread=N…bjective='binary:logistic',
random_state=0, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, seed=None, silent=None,
subsample=1, verbosity=1),
iid='deprecated', n_jobs=-1,
param_grid={'learning_rate': [0.1, 0.01, 0.05],
'max_depth': range(2, 10),
'n_estimators': range(60, 220, 40)},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=1)

[64]: print('Best Score:', grid.best_score_)


print('Best Params:', grid.best_params_)
print('Best Estimator:', grid.best_estimator_)

Best Score: 0.7132738979692917


Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 180}
Best Estimator: XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0,
learning_rate=0.1, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=180, n_jobs=1,
nthread=None, objective='binary:logistic', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=None, subsample=1, verbosity=1)

[65]: # Using the best parameters from the grid-search and predicting on the test feature dataset (X_test)

xgb_grid= grid.best_estimator_
y_pred = xgb_grid.predict(X_test)
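Beyond accuracy, the fitted booster exposes per-feature importances; mapping them back onto the pre-scaling column names gives a rough ranking of what drives the predictions (an illustrative aside, not part of the original run):

importances = pd.Series(xgb_grid.feature_importances_,
                        index=df.drop(['STATUS'], axis=1).columns)
print(importances.sort_values(ascending=False).head(10))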

[66]: # Confusion matrix

pd.DataFrame(confusion_matrix(y_test, y_pred), columns=["Predicted A", "Predicted T"], index=["Actual A", "Actual T"])

[66]: Predicted A Predicted T


Actual A 1446 179
Actual T 632 627

[67]: # Calculating metrics

xgb_grid_score = accuracy_score(y_test, y_pred)


print('Model Accuracy:', xgb_grid_score)
print('Classification Report:\n', classification_report(y_test, y_pred))

Model Accuracy: 0.7187933425797504


Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.89      0.78      1625
           1       0.78      0.50      0.61      1259

    accuracy                           0.72      2884
   macro avg       0.74      0.69      0.69      2884
weighted avg       0.73      0.72      0.71      2884

1.5 RESULTS
[73]: score_df = pd.DataFrame(
    [
        ['Logistic Regression', logreg_grid_score, 0.6857355126300149],
        ['K-Nearest Neighbors', knn_grid_score, 0.6381376919266962],
        ['Gaussian Naïve Bayes', gb_grid_score, 0.5548291233283804],
        ['Support Vector Machines', svm_grid_score, 0.6701337295690937],
        ['Decision Tree Classifier', dtc_grid_score, 0.6505695889053987],
        ['Random Forest Classifier', rfc_grid_score, 0.7094105993065873],
        ['Artificial Neural Networks', nn_grid_score, 0.6248637939574045],
        ['GBM - XGBoost', xgb_grid_score, 0.7132738979692917],
    ],
    columns=['Model', 'Test Score', 'Validation Score']
)
score_df['Test Score'] = score_df['Test Score'] * 100
score_df['Validation Score'] = score_df['Validation Score'] * 100

[74]: score_df

[74]: Model Test Score Validation Score


0 Logistic Regression 70.423024 68.573551
1 K-Nearest Neighbors 64.979196 63.813769
2 Gaussian Naïve Bayes 65.846047 55.482912
3 Support Vector Machines 67.926491 67.013373
4 Decision Tree Classifier 64.701803 65.056959
5 Random Forest Classifier 71.879334 70.941060
6 Artificial Neural Networks 62.725381 62.486379
7 GBM - XGBoost 71.879334 71.327390

[152]: fig, ax1 = plt.subplots(figsize=(10, 5))


tidy = score_df.melt(id_vars='Model').rename(columns=str.title)
sns.barplot(x='Model', y='Value', hue='Variable', data=tidy, ax=ax1, palette=sns.xkcd_palette(["azure", "light red"]))

plt.ylim(20, 90)
plt.xticks(rotation=45, horizontalalignment="right")
plt.savefig('./plots/result.png')
sns.despine(fig)

[128]: # Training times (in minutes) taken from the GridSearchCV elapsed-time logs above

time_df = pd.DataFrame(
    [
        ['Logistic Regression', 1.2],
        ['K-Nearest Neighbors', 1.0],
        ['Gaussian Naïve Bayes', 0.0034],
        ['Support Vector Machines', 51.7],
        ['Decision Tree Classifier', 0.068],
        ['Random Forest Classifier', 15.1],
        ['Artificial Neural Networks', 454.2],
        ['GBM - XGBoost', 40.8],
    ],
    columns=['Model', 'Training Time']
)

[153]: fig, ax1 = plt.subplots(figsize=(10, 5))


sns.barplot(data=time_df, x='Model', y='Training Time', palette=sns.color_palette('husl'))

plt.xticks(rotation=45, horizontalalignment="right")
plt.ylabel('Training Time (in mins)')
plt.savefig('./plots/training_time.png')
sns.despine(fig)
