
COMPLETE VERSION 1 in Sha Allah.ipynb - Colab

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from scipy import stats

# Load data
df = pd.read_csv('/content/dataset.csv')

# Basic info and cleaning


print(df.info())
print(df.head())
print(df.isna().sum())        # Check for missing values - you'll need to handle these
print(df.duplicated().sum())  # Check for duplicate rows
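If either check reports nonzero counts, a minimal follow-up could look like the sketch below. Median imputation for the numeric columns is an assumption here, not something the original notebook specifies.

# Hypothetical handling step: drop exact duplicates, then impute numeric gaps with the column median
df = df.drop_duplicates().reset_index(drop=True)
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())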

# Label encoding the 'Target' column


df['Target'] = LabelEncoder().fit_transform(df['Target'])
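LabelEncoder assigns integer codes in alphabetical order of the labels, which the later filtering and remapping rely on. A small sketch, equivalent to the one-liner above but keeping the encoder so the mapping can be inspected (the expected mapping shown in the comment is an assumption about this dataset's label values):

# Keep a handle on the encoder to verify the label-to-code mapping
le_target = LabelEncoder()
df['Target'] = le_target.fit_transform(df['Target'])
# Prints the learned mapping, e.g. {'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}
print(dict(zip(le_target.classes_, le_target.transform(le_target.classes_))))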

# Identifying and removing outliers


z_scores = np.abs(stats.zscore(df.select_dtypes(include=[np.number])))
outliers = np.where(z_scores > 3)
df = df.drop(df.index[outliers[0]]).reset_index(drop=True)

# Drop rows where 'Target' is 'Enrolled'


df = df[df['Target'] != 1]

# Mapping integer codes to new values: Dropout or Graduate


mapping = {0: 1, 2: 0}
df['Target'] = df['Target'].replace(mapping)

# Split data into features and target


X = df.drop(columns=['Target'], axis=1)
y = df['Target']

# Standardize numeric features


numeric_features = X.select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features])
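Note that the scaler above is fitted on the full feature matrix before the train/test split, so test-set statistics influence the scaling. A leakage-free variant is sketched below; it assumes the split from the later cell is done first and is an alternative, not the notebook's original flow.

# Sketch: split first, then fit the scaler on the training portion only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
X_train, X_test = X_train.copy(), X_test.copy()
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])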

# Encode categorical variables (optional step based on model needs)


categorical_features = X.select_dtypes(include=['object']).columns
for feature in categorical_features:
    le = LabelEncoder()
    X[feature] = le.fit_transform(X[feature])

# Split data into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

Output (tail of df.info()):

 23  Curricular units 1st sem (grade)                 4424 non-null  float64
 24  Curricular units 1st sem (without evaluations)   4424 non-null  int64
 25  Curricular units 2nd sem (credited)              4424 non-null  int64
 26  Curricular units 2nd sem (enrolled)              4424 non-null  int64
 27  Curricular units 2nd sem (evaluations)           4424 non-null  int64
 28  Curricular units 2nd sem (approved)              4424 non-null  int64
 29  Curricular units 2nd sem (grade)                 4424 non-null  float64

   Curricular units 2nd sem (credited)  Curricular units 2nd sem (enrolled)  \
0                                    0                                     0
1                                    0                                     6
2                                    0                                     6
3                                    0                                     6
4                                    0                                     6

   Curricular units 2nd sem (evaluations)  \
0                                        0
1                                        6
2                                        0
3                                       10
4                                        6

   Curricular units 2nd sem (approved)  Curricular units 2nd sem (grade)  \
0                                     0                          0.000000
1                                     6                         13.666667
2                                     0                          0.000000
3                                     5                         12.400000
4                                     6                         13.000000

   Curricular units 2nd sem (without evaluations)  Unemployment rate  \
# DECISION TREE BASIC


from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize and train the Decision Tree model


dt_model = DecisionTreeClassifier(random_state=10)
dt_model.fit(X_train, y_train)

# Evaluate the model


y_pred_dt = dt_model.predict(X_test)
print('Classification Report for Decision Tree:')
print(classification_report(y_test, y_pred_dt))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred_dt))

Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       353
           1       0.88      0.83      0.86       229

    accuracy                           0.89       582
   macro avg       0.89      0.88      0.88       582
weighted avg       0.89      0.89      0.89       582

Confusion Matrix: [[328  25]
 [ 39 190]]

# RANDOM FOREST CLASSIFIER


from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest model


rf_model = RandomForestClassifier(n_estimators=100, random_state=10)
rf_model.fit(X_train, y_train)

# Evaluate the model


y_pred_rf = rf_model.predict(X_test)
print('Classification Report for Random Forest:')
print(classification_report(y_test, y_pred_rf))

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       353
           1       0.98      0.83      0.90       229

    accuracy                           0.93       582
   macro avg       0.94      0.91      0.92       582
weighted avg       0.93      0.93      0.93       582
# GRADIENT BOOSTING MACHINES (GBM)

from sklearn.ensemble import GradientBoostingClassifier

# Initialize and train the GBM model


gbm_model = GradientBoostingClassifier(random_state=10)
gbm_model.fit(X_train, y_train)

# Evaluate the model


y_pred_gbm = gbm_model.predict(X_test)
print('Classification Report for Gradient Boosting Machine:')
print(classification_report(y_test, y_pred_gbm))

Classification Report for Gradient Boosting Machine:
              precision    recall  f1-score   support

           0       0.89      0.98      0.94       353
           1       0.97      0.82      0.89       229

    accuracy                           0.92       582
   macro avg       0.93      0.90      0.91       582
weighted avg       0.92      0.92      0.92       582

# XGB BOOST CLASSIFIER

import xgboost as xgb

# Initialize and train the XGBoost model


xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

# Evaluate the model


y_pred_xgb = xgb_model.predict(X_test)
print('Classification Report for XGBoost:')
print(classification_report(y_test, y_pred_xgb))

Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.90      0.98      0.93       353
           1       0.96      0.83      0.89       229

    accuracy                           0.92       582
   macro avg       0.93      0.90      0.91       582
weighted avg       0.92      0.92      0.92       582

# LIGHTGBM CLASSIFIER
import lightgbm as lgb

# Initialize and train the LightGBM model


lgbm_model = lgb.LGBMClassifier(random_state=10)
lgbm_model.fit(X_train, y_train)

# Evaluate the model


y_pred_lgbm = lgbm_model.predict(X_test)
print('Classification Report for LightGBM:')
print(classification_report(y_test, y_pred_lgbm))

[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Info] Number of positive: 887, number of negative: 1438
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001112 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 818
[LightGBM] [Info] Number of data points in the train set: 2325, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.381505 -> initscore=-0.483164
[LightGBM] [Info] Start training from score -0.483164
Classification Report for LightGBM:
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       353
           1       0.97      0.84      0.90       229

    accuracy                           0.93       582
   macro avg       0.94      0.91      0.92       582
weighted avg       0.93      0.93      0.93       582
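The whitespace warning above is caused by spaces in this dataset's column names. They can optionally be sanitized before training; the line below is a sketch applied to X ahead of the train/test split, not part of the original notebook (LightGBM already handles the renaming internally, as the warning says).

# Optional: replace spaces in feature names so LightGBM does not have to rename them
X.columns = X.columns.str.replace(' ', '_')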

# ADABOOST CLASSIFIER

from sklearn.ensemble import AdaBoostClassifier

# Initialize and train the AdaBoost model


ada_model = AdaBoostClassifier(random_state=10)
ada_model.fit(X_train, y_train)

# Evaluate the model


y_pred_ada = ada_model.predict(X_test)
print('Classification Report for AdaBoost:')
print(classification_report(y_test, y_pred_ada))

Classification Report for AdaBoost:
              precision    recall  f1-score   support

           0       0.90      0.98      0.94       353
           1       0.96      0.83      0.89       229

    accuracy                           0.92       582
   macro avg       0.93      0.90      0.91       582
weighted avg       0.92      0.92      0.92       582

# Support Vector Machine (Non-Linear)

from sklearn.svm import SVC

# Initialize and train a non-linear SVM model


svm_nl_model = SVC(kernel='rbf')
svm_nl_model.fit(X_train, y_train)

# Evaluate the model


y_pred_svm_nl = svm_nl_model.predict(X_test)
print('Classification Report for Non-Linear SVM:')
print(classification_report(y_test, y_pred_svm_nl))

Classification Report for Non-Linear SVM:
              precision    recall  f1-score   support

           0       0.89      0.98      0.94       353
           1       0.97      0.82      0.89       229

    accuracy                           0.92       582
   macro avg       0.93      0.90      0.91       582
weighted avg       0.92      0.92      0.92       582

# NEURAL NETWORKS (SIMPLE & DEEP)

from sklearn.neural_network import MLPClassifier

# Initialize and train a simple neural network model


nn_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=10)
nn_model.fit(X_train, y_train)

# Evaluate the model


y_pred_nn = nn_model.predict(X_test)
print('Classification Report for Neural Network:')
print(classification_report(y_test, y_pred_nn))

Classification Report for Neural Network:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       353
           1       0.94      0.84      0.88       229

    accuracy                           0.91       582
   macro avg       0.92      0.90      0.91       582
weighted avg       0.92      0.91      0.91       582

/usr/local/lib/python3.10/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stocha
  warnings.warn(

# K-Nearest Neighbors (KNN)

from sklearn.neighbors import KNeighborsClassifier

# Initialize and train the KNN model


knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Evaluate the model


y_pred_knn = knn_model.predict(X_test)
print('Classification Report for KNN:')
print(classification_report(y_test, y_pred_knn))

Classification Report for KNN:
              precision    recall  f1-score   support

           0       0.83      0.98      0.90       353
           1       0.96      0.69      0.81       229

    accuracy                           0.87       582
   macro avg       0.90      0.84      0.85       582
weighted avg       0.88      0.87      0.86       582

# Logistic Regression with L2 Regularization

# Import necessary classes


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize and train the Logistic Regression model with L2 regularization


logreg_l2 = LogisticRegression(penalty='l2')
logreg_l2.fit(X_train, y_train)

# Evaluate the model


y_pred_logreg_l2 = logreg_l2.predict(X_test)
print('Classification Report for Logistic Regression with L2 Regularization:')
print(classification_report(y_test, y_pred_logreg_l2))

Classification Report for Logistic Regression with L2 Regularization:
              precision    recall  f1-score   support

           0       0.91      0.98      0.94       353
           1       0.96      0.84      0.90       229

    accuracy                           0.93       582
   macro avg       0.94      0.91      0.92       582
weighted avg       0.93      0.93      0.93       582


VERSION 2 - GRID SEARCH TO FIND BEST HYPERPARAMETERS


from sklearn.model_selection import GridSearchCV

# gridsearch for decision tree

dt_params = {
    'max_depth': [3, 5, 10, 20, None],
    'min_samples_leaf': [1, 2, 4, 6],
    'min_samples_split': [2, 5, 10]
}
grid_search_dt = GridSearchCV(estimator=DecisionTreeClassifier(random_state=10), param_grid=dt_params, cv=5, scoring='accuracy', verbose=1)
grid_search_dt.fit(X_train, y_train)
print("Best Decision Tree Parameters:", grid_search_dt.best_params_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


Best Decision Tree Parameters: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}

# gridsearch for random forest

rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=10), param_grid=rf_params, cv=5, scoring='accuracy', verbose=1)
grid_search_rf.fit(X_train, y_train)
print("Best Random Forest Parameters:", grid_search_rf.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


Best Random Forest Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'n_estimators': 200}

# grid search for Gradient Boosting Machines

gbm_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10]
}
grid_search_gbm = GridSearchCV(estimator=GradientBoostingClassifier(random_state=10), param_grid=gbm_params, cv=5, scoring='accuracy', verbose=1)
grid_search_gbm.fit(X_train, y_train)
print("Best GBM Parameters:", grid_search_gbm.best_params_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


Best GBM Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

# grid search for XGBoost

xgb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
grid_search_xgb = GridSearchCV(estimator=xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), param_grid=xgb_params, cv=5, scoring='accuracy', verbose=1)
grid_search_xgb.fit(X_train, y_train)
print("Best XGBoost Parameters:", grid_search_xgb.best_params_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


Best XGBoost Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}

# grid search for LightGBM

lgbm_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31, 41, 61]
}
grid_search_lgbm = GridSearchCV(estimator=lgb.LGBMClassifier(), param_grid=lgbm_params, cv=5, scoring='accuracy', verbose=1)
grid_search_lgbm.fit(X_train, y_train)
print("Best LightGBM Parameters:", grid_search_lgbm.best_params_)

[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Info] Number of positive: 709, number of negative: 1151
[LightGBM] [Info] Total Bins 815
[LightGBM] [Info] Number of data points in the train set: 1860, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.381183 -> initscore=-0.484531
[LightGBM] [Info] Start training from score -0.484531
(similar [LightGBM] log lines repeat for every fold and parameter combination in the grid search)

# gridsearch for AdaBoost

ada_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.5, 1.0, 1.5]
}
grid_search_ada = GridSearchCV(estimator=AdaBoostClassifier(random_state=10), param_grid=ada_params, cv=5, scoring='accuracy', verbose=1)
grid_search_ada.fit(X_train, y_train)
print("Best AdaBoost Parameters:", grid_search_ada.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


Best AdaBoost Parameters: {'learning_rate': 0.5, 'n_estimators': 50}

# gridsearch for SVM

svm_params = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf', 'poly']
}
grid_search_svm = GridSearchCV(estimator=SVC(random_state=10), param_grid=svm_params, cv=5, scoring='accuracy', verbose=1)
grid_search_svm.fit(X_train, y_train)
print("Best SVM Parameters:", grid_search_svm.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


Best SVM Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}

# grid search for Neural Networks

nn_params = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'max_iter': [200, 300]
}
grid_search_nn = GridSearchCV(estimator=MLPClassifier(random_state=10), param_grid=nn_params, cv=5, scoring='accuracy', verbose=1)
grid_search_nn.fit(X_train, y_train)
print("Best Neural Network Parameters:", grid_search_nn.best_params_)

/usr/local/lib/python3.10/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stocha
  warnings.warn(
(the same ConvergenceWarning is repeated for many of the grid-search fits)

# grid search for K-Nearest Neighbors (KNN)

knn_params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
grid_search_knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=knn_params, cv=5, scoring='accuracy', verbose=1)
grid_search_knn.fit(X_train, y_train)
print("Best KNN Parameters:", grid_search_knn.best_params_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


Best KNN Parameters: {'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'uniform'}

# grid search for Logistic Regression with Regularization

logreg_params = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}
grid_search_logreg = GridSearchCV(estimator=LogisticRegression(), param_grid=logreg_params, cv=5, scoring='accuracy', verbose=1)
grid_search_logreg.fit(X_train, y_train)
print("Best Logistic Regression Parameters:", grid_search_logreg.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached w
  warnings.warn(
(the same ConvergenceWarning is repeated for many of the saga-solver fits)
Best Logistic Regression Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}


IMPLEMENTATION OF BEST MODELS AND COMPARISON

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Initialize models with best parameters


models = {
    'Decision Tree': DecisionTreeClassifier(max_depth=3, min_samples_leaf=1, min_samples_split=2),
    'Random Forest': RandomForestClassifier(max_depth=20, min_samples_leaf=1, n_estimators=200),
    'GBM': GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=100),
    'XGBoost': xgb.XGBClassifier(learning_rate=0.1, max_depth=7, n_estimators=200, use_label_encoder=False, eval_metric='mlogloss'),
    'LightGBM': lgb.LGBMClassifier(learning_rate=0.1, n_estimators=200, num_leaves=41),
    'AdaBoost': AdaBoostClassifier(learning_rate=0.5, n_estimators=50),
    'SVM': SVC(C=1, gamma='scale', kernel='rbf', probability=True),
    'Neural Network': MLPClassifier(activation='tanh', hidden_layer_sizes=(100, 50), max_iter=200, solver='sgd'),
    'KNN': KNeighborsClassifier(algorithm='auto', n_neighbors=5, weights='uniform'),
    'Logistic Regression': LogisticRegression(C=0.1, penalty='l1', solver='liblinear')
}

# Train and evaluate each model


model_metrics = []
for name, model in models.items():
    model.fit(X_train, y_train)
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    y_pred = model.predict(X_test)
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    report = classification_report(y_test, y_pred, output_dict=True)
    f1 = report['weighted avg']['f1-score']
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']
    cm = confusion_matrix(y_test, y_pred)

    model_metrics.append({
        'Model': name,
        'Mean CV Accuracy': np.mean(cv_scores),
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC AUC': roc_auc,
        'Confusion Matrix': cm
    })

    # Feature importance for tree-based models
    if name in ['Random Forest', 'GBM', 'XGBoost', 'LightGBM']:
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]
        plt.figure()
        plt.title(f"Feature Importances for {name}")
        plt.bar(range(X_train.shape[1]), importances[indices], align="center")
        plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
        plt.xlim([-1, X_train.shape[1]])
        plt.show()

# Compile results into a DataFrame


results_df = pd.DataFrame(model_metrics)
print(results_df[['Model', 'Mean CV Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC']])


[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Info] Number of positive: 887, number of negative: 1438
[LightGBM] [Info] Total Bins 818
[LightGBM] [Info] Number of data points in the train set: 2325, number of used features: 31
[LightGBM] [Info] Start training from score -0.483164
(similar [LightGBM] log lines repeat for each cross-validation fold)

/usr/local/lib/python3.10/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stocha
  warnings.warn(
(the same ConvergenceWarning is repeated several times)
                 Model  Mean CV Accuracy  Precision    Recall  F1-Score   ROC AUC
0        Decision Tree          0.898865   0.904874  0.895189  0.892153  0.935605
1        Random Forest          0.914008   0.928788  0.924399  0.923104  0.966896
2                  GBM          0.914697   0.923739  0.919244  0.917810  0.967651
3              XGBoost          0.913320   0.923095  0.919244  0.917912  0.964818
4             LightGBM          0.910223   0.928151  0.924399  0.923198  0.963024
5             AdaBoost          0.910221   0.916883  0.914089  0.912829  0.963476
6                  SVM          0.917101   0.922281  0.917526  0.916009  0.958831
7       Neural Network          0.915040   0.928552  0.926117  0.925163  0.965560
8                  KNN          0.868246   0.883877  0.869416  0.864237  0.913326
9  Logistic Regression          0.917100   0.921618  0.917526  0.916114  0.965956

import matplotlib.pyplot as plt
import seaborn as sns

# Creating a DataFrame from the provided data


data = {
    'Model': ['Decision Tree', 'Random Forest', 'GBM', 'XGBoost', 'LightGBM', 'AdaBoost', 'SVM', 'Neural Network', 'KNN', 'Logistic Regression'],
    'Mean CV Accuracy': [0.898865, 0.913319, 0.914697, 0.913320, 0.910223, 0.910221, 0.917101, 0.919167, 0.868246, 0.917100],
    'Precision': [0.904874, 0.925203, 0.923739, 0.923095, 0.928151, 0.916883, 0.922281, 0.931126, 0.883877, 0.921618],
    'Recall': [0.895189, 0.920962, 0.919244, 0.919244, 0.924399, 0.914089, 0.917526, 0.927835, 0.869416, 0.917526],
    'F1-Score': [0.892153, 0.919609, 0.917810, 0.917912, 0.923198, 0.912829, 0.916009, 0.926777, 0.864237, 0.916114],
    'ROC AUC': [0.935605, 0.967057, 0.967478, 0.964818, 0.963024, 0.963476, 0.958831, 0.969816, 0.913326, 0.965931]
}

df = pd.DataFrame(data)
df_melted = df.melt('Model', var_name='Metrics', value_name='Values')

plt.figure(figsize=(10, 8))
sns.barplot(x='Values', y='Model', hue='Metrics', data=df_melted)
plt.title('Performance Comparison of ML Models')
plt.xlabel('Score')
plt.ylabel('Model')
plt.legend(loc='lower right')
plt.show()

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Data preparation
models = ['Decision Tree', 'Random Forest', 'GBM', 'XGBoost', 'LightGBM', 'AdaBoost', 'SVM', 'Neural Network', 'KNN', 'Logistic Regression']
mean_cv_accuracy = [0.898865, 0.913319, 0.914697, 0.913320, 0.910223, 0.910221, 0.917101, 0.919167, 0.868246, 0.917100]
precision = [0.904874, 0.925203, 0.923739, 0.923095, 0.928151, 0.916883, 0.922281, 0.931126, 0.883877, 0.921618]
recall = [0.895189, 0.920962, 0.919244, 0.919244, 0.924399, 0.914089, 0.917526, 0.927835, 0.869416, 0.917526]
f1_score = [0.892153, 0.919609, 0.917810, 0.917912, 0.923198, 0.912829, 0.916009, 0.926777, 0.864237, 0.916114]
roc_auc = [0.935605, 0.967057, 0.967478, 0.964818, 0.963024, 0.963476, 0.958831, 0.969816, 0.913326, 0.965931]

df = pd.DataFrame({
    'Model': models,
    'Mean CV Accuracy': mean_cv_accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1_score,
    'ROC AUC': roc_auc
})

# Plotting
metrics = ['Mean CV Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC']
for metric in metrics:
    plt.figure(figsize=(10, 6))
    sns.barplot(x=metric, y='Model', data=df, palette='viridis')
    plt.title(f'Comparison of Models Based on {metric}')
    plt.xlabel(metric)
    plt.ylabel('Model')
    plt.xlim(left=min(df[metric]) * 0.95, right=max(df[metric]) * 1.05)  # Adjust x-axis limits for better visualization
    plt.show()


<ipython-input-17-13aba1823bd1>:26: FutureWarning:

Passing `palette` without assigning `hue` is deprecated and will be removed in

  sns.barplot(x=metric, y='Model', data=df, palette='viridis')

(the same FutureWarning is printed once per metric plot)

# Rank for 5 Metrics

import pandas as pd

# Initialize data
data = {
    'Model': ['Decision Tree', 'Random Forest', 'GBM', 'XGBoost', 'LightGBM', 'AdaBoost', 'SVM', 'Neural Network', 'KNN', 'Logistic Regression'],
    'Mean CV Accuracy': [0.898865, 0.913319, 0.914697, 0.913320, 0.910223, 0.910221, 0.917101, 0.919167, 0.868246, 0.917100],
    'Precision': [0.904874, 0.925203, 0.923739, 0.923095, 0.928151, 0.916883, 0.922281, 0.931126, 0.883877, 0.921618],
    'Recall': [0.895189, 0.920962, 0.919244, 0.919244, 0.924399, 0.914089, 0.917526, 0.927835, 0.869416, 0.917526],
    'F1-Score': [0.892153, 0.919609, 0.917810, 0.917912, 0.923198, 0.912829, 0.916009, 0.926777, 0.864237, 0.916114],
    'ROC AUC': [0.935605, 0.967057, 0.967478, 0.964818, 0.963024, 0.963476, 0.958831, 0.969816, 0.913326, 0.965931]
}
df = pd.DataFrame(data)

# Calculate ranks for each metric


for metric in ['Mean CV Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC']:
    df[f'{metric} Rank'] = df[metric].rank(ascending=False, method='min')

df_sorted = df.sort_values(by='Mean CV Accuracy Rank')


print(df_sorted)

                 Model  Mean CV Accuracy  Precision    Recall  F1-Score  \
7       Neural Network          0.919167   0.931126  0.927835  0.926777
6                  SVM          0.917101   0.922281  0.917526  0.916009
9  Logistic Regression          0.917100   0.921618  0.917526  0.916114
2                  GBM          0.914697   0.923739  0.919244  0.917810
3              XGBoost          0.913320   0.923095  0.919244  0.917912
1        Random Forest          0.913319   0.925203  0.920962  0.919609
4             LightGBM          0.910223   0.928151  0.924399  0.923198
5             AdaBoost          0.910221   0.916883  0.914089  0.912829
0        Decision Tree          0.898865   0.904874  0.895189  0.892153
8                  KNN          0.868246   0.883877  0.869416  0.864237

    ROC AUC  Mean CV Accuracy Rank  Precision Rank  Recall Rank  \
7  0.969816                    1.0             1.0          1.0
6  0.958831                    2.0             6.0          6.0
9  0.965931                    3.0             7.0          6.0
2  0.967478                    4.0             4.0          4.0
3  0.964818                    5.0             5.0          4.0
1  0.967057                    6.0             3.0          3.0
4  0.963024                    7.0             2.0          2.0
5  0.963476                    8.0             8.0          8.0
0  0.935605                    9.0             9.0          9.0
8  0.913326                   10.0            10.0         10.0

   F1-Score Rank  ROC AUC Rank
7            1.0           1.0
6            7.0           8.0
9            6.0           4.0
2            5.0           2.0
3            4.0           5.0
1            3.0           3.0
4            2.0           7.0
5            8.0           6.0
0            9.0           9.0
8           10.0          10.0

# Top 3 in Each Category

top_3_each_metric = {}
metrics = ['Mean CV Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC']
for metric in metrics:
    top_3_each_metric[metric] = df.nsmallest(3, f'{metric} Rank')[['Model', metric]]
    print(f"Top 3 for {metric}:\n{top_3_each_metric[metric]}\n")

Top 3 for Mean CV Accuracy:
                 Model  Mean CV Accuracy
7       Neural Network          0.919167
6                  SVM          0.917101
9  Logistic Regression          0.917100

Top 3 for Precision:
            Model  Precision
7  Neural Network   0.931126
4        LightGBM   0.928151
1   Random Forest   0.925203

Top 3 for Recall:
            Model    Recall
7  Neural Network  0.927835
4        LightGBM  0.924399
1   Random Forest  0.920962

Top 3 for F1-Score:
            Model  F1-Score
7  Neural Network  0.926777
4        LightGBM  0.923198
1   Random Forest  0.919609

Top 3 for ROC AUC:
            Model   ROC AUC
7  Neural Network  0.969816
2             GBM  0.967478
1   Random Forest  0.967057

# Overall Top 3

# Sum of ranks across all metrics


df['Sum of Ranks'] = df[[f'{metric} Rank' for metric in metrics]].sum(axis=1)
overall_top_3 = df.nsmallest(3, 'Sum of Ranks')[['Model', 'Sum of Ranks']]
print(f"Overall Top 3 Models:\n{overall_top_3}\n")

Overall Top 3 Models:
                 Model  Sum of Ranks
7       Neural Network           5.0
1        Random Forest          18.0
2                  GBM          19.0

# Overall Winner
overall_winner = df.loc[df['Sum of Ranks'].idxmin()]
print(f"Overall Winner:\n{overall_winner}\n")

Overall Winner:
Model Neural Network
Mean CV Accuracy 0.919167
Precision 0.931126
Recall 0.927835
F1-Score 0.926777
ROC AUC 0.969816
Mean CV Accuracy Rank 1.0
Precision Rank 1.0
Recall Rank 1.0
F1-Score Rank 1.0
ROC AUC Rank 1.0
Sum of Ranks 5.0
Name: 7, dtype: object


VISUALIZATIONS

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Function to plot ROC curve


def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # Dashed diagonal

# Plot ROC curve for each model


models = [dt_model, rf_model, gbm_model, xgb_model, lgbm_model, ada_model, svm_nl_model, nn_model, knn_model, logreg_l2]
model_names = ['Decision Tree', 'Random Forest', 'GBM', 'XGBoost', 'LightGBM', 'AdaBoost', 'SVM', 'Neural Network', 'KNN', 'Logistic Regression']

for model, name in zip(models, model_names):
    if hasattr(model, "predict_proba"):
        y_scores = model.predict_proba(X_test)[:, 1]
    else:  # use decision function if predict_proba is not available
        y_scores = model.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    roc_auc = auc(fpr, tpr)
    plot_roc_curve(fpr, tpr, f'{name} (area = {roc_auc:.2f})')

plt.xlabel('False Positive Rate')


plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend(loc='lower right')
plt.show()

# CONFUSION MATRIX HEATMAP

from sklearn.metrics import confusion_matrix


import seaborn as sns

# Plot confusion matrix for each model


for model, name in zip(models, model_names):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# FEATURE IMPORTANCE PLOTS

for model, name in zip([dt_model, rf_model, gbm_model, xgb_model], ['Decision Tree', 'Random Forest', 'GBM', 'XGBoost']):
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    plt.figure()
    plt.title(f'Feature Importances in {name}')
    plt.bar(range(X_train.shape[1]), importances[indices], align='center')
    plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
    plt.ylabel('Importance')
    plt.show()

# Precision Recall Curves

from sklearn.metrics import precision_recall_curve

# Plot precision-recall curve for each model


for model, name in zip(models, model_names):
    if hasattr(model, "predict_proba"):
        y_scores = model.predict_proba(X_test)[:, 1]
    else:
        y_scores = model.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_scores)
    plt.plot(recall, precision, label=f'{name}')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve Comparison')
plt.legend(loc='upper right')
plt.show()

# LOSS CURVE FOR NN#


plt.figure()
plt.plot(nn_model.loss_curve_)
plt.title('Loss Curve for Neural Network')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()

# INTERACTIVE MODEL
import plotly.express as px

fig = px.scatter(df, x='Mean CV Accuracy', y='Model', color='Model')


fig.show()

(Interactive Plotly scatter plot: Mean CV Accuracy on the x-axis, Model on the y-axis, one point per model.)

STATISTICAL SIGNIFICANCE


models = {
    'Decision Tree': dt_model,
    'Random Forest': rf_model,
    'GBM': gbm_model,
    'XGBoost': xgb_model,
    'LightGBM': lgbm_model,
    'AdaBoost': ada_model,
    'SVM': svm_nl_model,
    'Neural Network': nn_model,
    'KNN': knn_model,
    'Logistic Regression': logreg_l2
}
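As a sketch of one common significance check - a paired t-test on per-fold cross-validation accuracies, assumed here rather than taken from the notebook - the models dictionary above could be used like this:

from scipy import stats
from sklearn.model_selection import cross_val_score

# Sketch: compare two models' cross-validated accuracies with a paired t-test
scores_nn = cross_val_score(models['Neural Network'], X, y, cv=5, scoring='accuracy')
scores_dt = cross_val_score(models['Decision Tree'], X, y, cv=5, scoring='accuracy')
t_stat, p_value = stats.ttest_rel(scores_nn, scores_dt)
print(f"Neural Network vs Decision Tree: t = {t_stat:.3f}, p = {p_value:.3f}")

Paired t-tests on CV folds are only approximate because the folds share training data; corrected resampled t-tests or McNemar's test on the held-out predictions are common alternatives.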
