Student Dropout
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression,Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (precision_score, recall_score, accuracy_score,
                             f1_score, classification_report, confusion_matrix,
                             ConfusionMatrixDisplay, PrecisionRecallDisplay,
                             RocCurveDisplay)
df = pd.read_csv("/content/drive/MyDrive/data.csv",sep=";")
df
{"type":"dataframe","variable_name":"df"}
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype
---  ------                                          --------------  -----
 0   Marital status                                  4424 non-null   int64
 1   Application mode                                4424 non-null   int64
 2   Application order                               4424 non-null   int64
 3   Course                                          4424 non-null   int64
 4   Daytime/evening attendance                      4424 non-null   int64
 5   Previous qualification                          4424 non-null   int64
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64
 8   Mother's qualification                          4424 non-null   int64
 9   Father's qualification                          4424 non-null   int64
 10  Mother's occupation                             4424 non-null   int64
 11  Father's occupation                             4424 non-null   int64
 12  Admission grade                                 4424 non-null   float64
 13  Displaced                                       4424 non-null   int64
 14  Educational special needs                       4424 non-null   int64
 15  Debtor                                          4424 non-null   int64
 16  Tuition fees up to date                         4424 non-null   int64
 17  Gender                                          4424 non-null   int64
 18  Scholarship holder                              4424 non-null   int64
 19  Age at enrollment                               4424 non-null   int64
 20  International                                   4424 non-null   int64
 21  Curricular units 1st sem (credited)             4424 non-null   int64
 22  Curricular units 1st sem (enrolled)             4424 non-null   int64
 23  Curricular units 1st sem (evaluations)          4424 non-null   int64
 24  Curricular units 1st sem (approved)             4424 non-null   int64
 25  Curricular units 1st sem (grade)                4424 non-null   float64
 26  Curricular units 1st sem (without evaluations)  4424 non-null   int64
 27  Curricular units 2nd sem (credited)             4424 non-null   int64
 28  Curricular units 2nd sem (enrolled)             4424 non-null   int64
 29  Curricular units 2nd sem (evaluations)          4424 non-null   int64
 30  Curricular units 2nd sem (approved)             4424 non-null   int64
 31  Curricular units 2nd sem (grade)                4424 non-null   float64
 32  Curricular units 2nd sem (without evaluations)  4424 non-null   int64
 33  Unemployment rate                               4424 non-null   float64
 34  Inflation rate                                  4424 non-null   float64
 35  GDP                                             4424 non-null   float64
 36  Target                                          4424 non-null   object
dtypes: float64(7), int64(29), object(1)
memory usage: 1.2+ MB
df.isnull().sum()
Marital status 0
Application mode 0
Application order 0
Course 0
Daytime/evening attendance\t 0
Previous qualification 0
Previous qualification (grade) 0
Nacionality 0
Mother's qualification 0
Father's qualification 0
Mother's occupation 0
Father's occupation 0
Admission grade 0
Displaced 0
Educational special needs 0
Debtor 0
Tuition fees up to date 0
Gender 0
Scholarship holder 0
Age at enrollment 0
International 0
Curricular units 1st sem (credited) 0
Curricular units 1st sem (enrolled) 0
Curricular units 1st sem (evaluations) 0
Curricular units 1st sem (approved) 0
Curricular units 1st sem (grade) 0
Curricular units 1st sem (without evaluations) 0
Curricular units 2nd sem (credited) 0
Curricular units 2nd sem (enrolled) 0
Curricular units 2nd sem (evaluations) 0
Curricular units 2nd sem (approved) 0
Curricular units 2nd sem (grade) 0
Curricular units 2nd sem (without evaluations) 0
Unemployment rate 0
Inflation rate 0
GDP 0
Target 0
dtype: int64
df.shape
(4424, 37)
df.size
163688
df.describe().T
df['Target'].value_counts()
Graduate 2209
Dropout 1421
Enrolled 794
Name: Target, dtype: int64
df['Target'] = LabelEncoder().fit_transform(df['Target'])  # alphabetical encoding: Dropout -> 0, Enrolled -> 1, Graduate -> 2
df['Target'].value_counts()
2 2209
0 1421
1 794
Name: Target, dtype: int64
sns.displot(df['Target'], color="red", height=10, aspect=0.5)  # displot builds its own figure, so the size is set here instead of via plt.figure
plt.figure(figsize=(8, 8))
plt.title("Gender")
gender_counts = df['Gender'].value_counts()  # Gender is encoded 1 = male, 0 = female in this dataset
plt.pie(gender_counts, labels=['Male' if g == 1 else 'Female' for g in gender_counts.index],
        explode=(0.1, 0.0), autopct='%1.2f%%', shadow=True)
plt.legend( loc = 'lower right')
plt.figure(figsize=(20, 45))
for i in range(36):
    plt.subplot(12, 3, i + 1)  # 12 x 3 grid assumed for the 36 feature columns
    sns.distplot(df.iloc[:, i], color='blue')  # distplot is deprecated in recent seaborn and warns on every call
["Tuition fees up to date","Curricular units 1st sem
(approved)","Curricular units 1st sem (grade)","Curricular units 2nd
sem (approved)","Curricular units 2nd sem (grade)"]
corr_matrix["Target"]
{"type":"dataframe","variable_name":"df"}
df['Dropout'] = df['Target'].apply(lambda x: 1 if x==0 else 0)
df
{"type":"dataframe","variable_name":"df"}
{"type":"dataframe","variable_name":"df"}
plt.figure(figsize=(5, 10))
sns.distplot(df['Dropout'], color = "red")
plt.figure(figsize=(8, 8))
plt.title("Dropout Status")
plt.pie(df['Dropout'].value_counts(), labels = ['Non-Dropout',
'Dropout'], explode = (0.2, 0.0), autopct='%1.2f%%', shadow = True)
plt.legend( loc = 'lower right')
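To read the exact class balance rather than estimating it from the pie chart, a one-line check (a sketch, not part of the original cell):
df['Dropout'].value_counts(normalize=True)  # fraction of non-dropouts (0) vs dropouts (1)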
x = df.iloc[:, :36].values  # all 36 feature columns; excludes Target and the derived Dropout column
# x = df[["Tuition fees up to date", "Curricular units 1st sem (approved)", "Curricular units 1st sem (grade)", "Curricular units 2nd sem (approved)", "Curricular units 2nd sem (grade)"]].values
print(x)
x = StandardScaler().fit_transform(x)
x
y = df['Dropout'].values
y
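The cells below reference x_train, x_test, y_train and y_test; a minimal hold-out split sketch is given here with placeholder parameters (the confusion matrices below add up to a 726-row test set, but the exact test_size and random_state used originally are not recoverable):
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=726, random_state=0)  # placeholder split parameters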
def perform(y_pred):
    # with average='micro', precision, recall and F1 all equal accuracy for single-label classification
    print("Precision : ", precision_score(y_test, y_pred, average='micro'))
    print("Recall : ", recall_score(y_test, y_pred, average='micro'))
    print("Accuracy : ", accuracy_score(y_test, y_pred))
    print("F1 Score : ", f1_score(y_test, y_pred, average='micro'))
    cm = confusion_matrix(y_test, y_pred)
    print("\n", cm)
    print("\n")
    print("**"*27 + "\n" + " "*16 + "Classification Report\n" + "**"*27)
    print(classification_report(y_test, y_pred))
    print("**"*27 + "\n")
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Non-Dropout', 'Dropout'])
    disp.plot()
Gaussian Naive Bayes
model_nb = GaussianNB()
model_nb.fit(x_train, y_train)
GaussianNB()
y_pred_nb = model_nb.predict(x_test)
perform(y_pred_nb)
Precision : 0.8457300275482094
Recall : 0.8457300275482094
Accuracy : 0.8457300275482094
F1 Score : 0.8457300275482094
[[405 43]
[ 69 209]]
******************************************************
Classification Report
******************************************************
precision recall f1-score support
******************************************************
Logistic Regression
model_lr = LogisticRegression()
model_lr.fit(x_train, y_train)
LogisticRegression()
model_svc = SVC(C=0.1,kernel='linear')
model_svc.fit(x_train, y_train)
SVC(C=0.1, kernel='linear')
y_pred_lr = model_lr.predict(x_test)
perform(y_pred_lr)
Precision : 0.9146005509641874
Recall : 0.9146005509641874
Accuracy : 0.9146005509641874
F1 Score : 0.9146005509641874
[[429 19]
[ 43 235]]
******************************************************
Classification Report
******************************************************
precision recall f1-score support
******************************************************
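RocCurveDisplay and PrecisionRecallDisplay are imported at the top but never used; a minimal sketch of how they could be applied to the fitted logistic regression (any fitted classifier with predict_proba or decision_function would work):
RocCurveDisplay.from_estimator(model_lr, x_test, y_test)
PrecisionRecallDisplay.from_estimator(model_lr, x_test, y_test)
plt.show()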
Random Forest
model_rf = RandomForestClassifier(n_estimators=500, criterion='entropy')
model_rf.fit(x_train, y_train)
RandomForestClassifier(criterion='entropy', n_estimators=500)
y_pred_rf = model_rf.predict(x_test)
perform(y_pred_rf)
Precision : 0.9214876033057852
Recall : 0.9214876033057852
Accuracy : 0.9214876033057852
F1 Score : 0.9214876033057853
[[434 14]
[ 43 235]]
******************************************************
Classification Report
******************************************************
precision recall f1-score support
******************************************************
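A natural follow-up for the fitted random forest, as a sketch (the importance ranking and the alignment via df.columns[:36] are an addition, not part of the original notebook):
importances = pd.Series(model_rf.feature_importances_, index=df.columns[:36]).sort_values(ascending=False)
print(importances.head(10))  # top-10 features by impurity-based importance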
Support Vector Classifier (model_svc was already fitted above, alongside the logistic regression)
y_pred_svc = model_svc.predict(x_test)
perform(y_pred_svc)
Precision : 0.9214876033057852
Recall : 0.9214876033057852
Accuracy : 0.9214876033057852
F1 Score : 0.9214876033057853
[[436 12]
[ 45 233]]
******************************************************
Classification Report
******************************************************
precision recall f1-score support
******************************************************
Perceptron
model_mlp = Perceptron(alpha=0.001, l1_ratio=0.5, max_iter=100)  # alpha/l1_ratio have no effect unless penalty is set (default penalty=None)
model_mlp.fit(x_train, y_train)
y_pred_mlp = model_mlp.predict(x_test)
perform(y_pred_mlp)
Precision : 0.8939393939393939
Recall : 0.8939393939393939
Accuracy : 0.8939393939393939
F1 Score : 0.8939393939393939
[[416 32]
[ 45 233]]
******************************************************
Classification Report
******************************************************
precision recall f1-score support
0 0.90 0.93 0.92 448
1 0.88 0.84 0.86 278
******************************************************
error = []
for k in range(1, 40):  # test-set error rate for each k, used in the elbow plot below
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    error.append(np.mean(knn.predict(x_test) != y_test))
plt.figure(figsize=(12, 6))
plt.plot(range(1, 40), error, color='red',
         linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
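A quick way to read off the best k from the elbow data above (best_k is a new name introduced here, not from the original):
best_k = int(np.argmin(error)) + 1  # +1 because k starts at 1
print("Lowest test error at k =", best_k)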
model_knn = KNeighborsClassifier(n_neighbors=3)
model_knn.fit(x_train, y_train)
KNeighborsClassifier(n_neighbors=3)
y_pred_knn = model_knn.predict(x_test)
perform(y_pred_knn)
Precision : 0.859504132231405
Recall : 0.859504132231405
Accuracy : 0.859504132231405
F1 Score : 0.859504132231405
[[422 26]
[ 76 202]]
******************************************************
Classification Report
******************************************************
precision recall f1-score support
0 0.85 0.94 0.89 448
1 0.89 0.73 0.80 278
******************************************************
pred = [y_pred_nb, y_pred_lr, y_pred_rf, y_pred_svc, y_pred_mlp, y_pred_knn]
acc = []
classifiers = ["NaiveBayes", "Logistic Regression", "RandomForest", "Support Vector Classifier", "Perceptron", "KNN"]
for i in pred:
    temp = accuracy_score(y_test, i)
    acc.append(temp)
plt.barh(classifiers, acc)
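A small optional touch-up for the comparison chart (the axis labels and printed table are additions, not from the original cell):
plt.xlabel("Accuracy")
plt.title("Classifier comparison on the hold-out set")
for name, score in zip(classifiers, acc):
    print(f"{name:25s} {score:.4f}")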