Predict Students' Dropout and Academic Success Using Machine Learning Techniques
In [1]:
import pandas as pd
In [2]:
df = pd.read_csv('student_dropout.csv')
In [3]:
df.head()
Out[3]:
(truncated preview of the first five rows; most columns did not survive this export)
5 rows × 35 columns
In [4]:
df.tail()
Out[4]:
(truncated preview of the last five rows; most columns did not survive this export)
5 rows × 35 columns
In [5]:
df.shape
Out[5]:
(4424, 35)
In [6]:
df.columns
Out[6]:
In [7]:
df.duplicated().sum()
Out[7]:
In [8]:
df.isnull().sum()
Out[8]:
Marital status 0
Application mode 0
Application order 0
Course 0
Daytime/evening attendance 0
Previous qualification 0
Nacionality 0
Mother's qualification 0
Father's qualification 0
Mother's occupation 0
Father's occupation 0
Displaced 0
Educational special needs 0
Debtor 0
Tuition fees up to date 0
Gender 0
Scholarship holder 0
Age at enrollment 0
International 0
Curricular units 1st sem (credited) 0
Curricular units 1st sem (enrolled) 0
Curricular units 1st sem (evaluations) 0
Curricular units 1st sem (approved) 0
Curricular units 1st sem (grade) 0
Curricular units 1st sem (without evaluations) 0
Curricular units 2nd sem (credited) 0
Curricular units 2nd sem (enrolled) 0
Curricular units 2nd sem (evaluations) 0
Curricular units 2nd sem (approved) 0
Curricular units 2nd sem (grade) 0
Curricular units 2nd sem (without evaluations) 0
Unemployment rate 0
Inflation rate 0
GDP 0
Target 0
dtype: int64
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 35 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Marital status 4424 non-null int64
1 Application mode 4424 non-null int64
2 Application order 4424 non-null int64
3 Course 4424 non-null int64
4 Daytime/evening attendance 4424 non-null int64
5 Previous qualification 4424 non-null int64
6 Nacionality 4424 non-null int64
7 Mother's qualification 4424 non-null int64
8 Father's qualification 4424 non-null int64
9 Mother's occupation 4424 non-null int64
10 Father's occupation 4424 non-null int64
11 Displaced 4424 non-null int64
12 Educational special needs 4424 non-null int64
13 Debtor 4424 non-null int64
14 Tuition fees up to date 4424 non-null int64
15 Gender 4424 non-null int64
16 Scholarship holder 4424 non-null int64
17 Age at enrollment 4424 non-null int64
18 International 4424 non-null int64
19 Curricular units 1st sem (credited) 4424 non-null int64
20 Curricular units 1st sem (enrolled) 4424 non-null int64
21 Curricular units 1st sem (evaluations) 4424 non-null int64
22 Curricular units 1st sem (approved) 4424 non-null int64
23 Curricular units 1st sem (grade) 4424 non-null float64
24 Curricular units 1st sem (without evaluations) 4424 non-null int64
25 Curricular units 2nd sem (credited) 4424 non-null int64
26 Curricular units 2nd sem (enrolled) 4424 non-null int64
27 Curricular units 2nd sem (evaluations) 4424 non-null int64
28 Curricular units 2nd sem (approved) 4424 non-null int64
29 Curricular units 2nd sem (grade) 4424 non-null float64
30 Curricular units 2nd sem (without evaluations) 4424 non-null int64
31 Unemployment rate 4424 non-null float64
32 Inflation rate 4424 non-null float64
33 GDP 4424 non-null float64
34 Target 4424 non-null object
dtypes: float64(5), int64(29), object(1)
memory usage: 1.2+ MB
In [10]:
df.describe()
Out[10]:
8 rows × 34 columns
In [11]:
df.nunique()
Out[11]:
Marital status 6
Application mode 18
Application order 8
Course 17
Daytime/evening attendance 2
Previous qualification 17
Nacionality 21
Mother's qualification 29
Father's qualification 34
Mother's occupation 32
Father's occupation 46
Displaced 2
Educational special needs 2
Debtor 2
Tuition fees up to date 2
Gender 2
Scholarship holder 2
Age at enrollment 46
International 2
Curricular units 1st sem (credited) 21
Curricular units 1st sem (enrolled) 23
Curricular units 1st sem (evaluations) 35
Curricular units 1st sem (approved) 23
Curricular units 1st sem (grade) 797
Curricular units 1st sem (without evaluations) 11
Curricular units 2nd sem (credited) 19
Curricular units 2nd sem (enrolled) 22
Curricular units 2nd sem (evaluations) 30
Curricular units 2nd sem (approved) 20
Curricular units 2nd sem (grade) 782
Curricular units 2nd sem (without evaluations) 10
Unemployment rate 10
Inflation rate 9
GDP 10
Target 3
dtype: int64
In [12]:
In [13]:
In [14]:
import warnings
warnings.filterwarnings('ignore')
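The two empty cells above presumably held the plotting imports and the categorical subset df1 that the loops below iterate over. A minimal sketch, assuming df1 selects the low-cardinality columns (the exact selection in the original notebook was not captured):

import matplotlib.pyplot as plt
import seaborn as sns

# Assumption: df1 = the discrete/categorical columns used for the count and pie plots below.
categorical_cols = [c for c in df.columns if df[c].nunique() <= 20 and c != 'Target']
df1 = df[categorical_cols]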
In [15]:
for i in df1.columns:
    plt.figure(figsize=(15, 6))
    sns.countplot(x=df1[i], palette='hls')
    plt.xticks(rotation=90)
    plt.show()
In [16]:
for i in df1.columns:
    plt.figure(figsize=(30, 20))
    # textprops keyword reconstructed; the original argument name was cut off in the export.
    plt.pie(df1[i].value_counts(),
            labels=df1[i].value_counts().index,
            autopct='%1.1f%%',
            textprops={'color': 'black',
                       'weight': 'bold',
                       'family': 'serif'})
    hfont = {'fontname': 'serif', 'weight': 'bold'}
    plt.title(i, size=20, **hfont)
    plt.show()
In [17]:
for i in df.columns:
    plt.figure(figsize=(15, 6))
    sns.histplot(df[i], kde=True, palette='hls')
    plt.xticks(rotation=90)
    plt.show()
In [18]:
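The empty cell above presumably defined df2, the continuous columns that the next plots compare against the target. A sketch under that assumption (the exact selection was not captured):

# Assumption: df2 = the continuous/count-valued columns plotted against Target below.
numeric_cols = [c for c in df.columns if df[c].nunique() > 20 and c != 'Target']
df2 = df[numeric_cols]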
In [19]:
for i in df2.columns:
    plt.figure(figsize=(15, 6))
    sns.barplot(x=df['Target'], y=df2[i], data=df, palette='hls')
    plt.show()
In [20]:
for i in df2.columns:
    plt.figure(figsize=(15, 6))
    sns.lineplot(x=df['Target'], y=df2[i], data=df, palette='hls')
    plt.show()
In [21]:
for i in df2.columns:
    plt.figure(figsize=(15, 6))
    sns.scatterplot(x=df['Target'], y=df2[i], data=df, palette='hls')
    plt.show()
In [22]:
for i in df2.columns:
    plt.figure(figsize=(15, 6))
    pd.crosstab(index=df2[i], columns=df['Target']).plot(kind='line')
    plt.show()
In [23]:
df_corr = df.corr()
In [24]:
df_corr
Out[24]:
(34 × 34 correlation matrix of the numeric columns; the table layout was not preserved in this export)
34 rows × 34 columns
In [25]:
import numpy as np
In [26]:
plt.figure(figsize=(20, 17))
matrix = np.triu(df_corr)
sns.heatmap(df_corr, annot=True, linewidth=.8, mask=matrix, cmap="rocket");
plt.show()
In [27]:
df['Target'] = df['Target'].map({
    'Dropout': 0,
    'Enrolled': 1,
    'Graduate': 2
})
In [28]:
X = df.drop('Target', axis = 1)
y = df['Target']
In [29]:
In [30]:
X = df.iloc[:,:-1]
In [31]:
In [32]:
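The cells that produced top_10 were not captured. A plausible reconstruction ranks the features with a tree-based model's feature_importances_; the estimator choice and the rf_fs/importances names are assumptions:

from sklearn.ensemble import RandomForestClassifier

# Assumption: importances taken from a random forest fit on the full feature set.
rf_fs = RandomForestClassifier(random_state=42)
rf_fs.fit(X, y)
importances = pd.DataFrame({'Feature': X.columns,
                            'Importance': rf_fs.feature_importances_})
top_10 = importances.sort_values('Importance', ascending=False).head(10)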
In [33]:
top_10
Out[33]:
Feature Importance
Course 0.032460
In [34]:
X = X[['Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curri
'Curricular units 1st sem (grade)', 'Tuition fees up to date', 'Curricular units 2
'Age at enrollment', 'Course', 'Scholarship holder']]
In [35]:
In [36]:
from sklearn import preprocessing  # import presumably lived in the empty cell above

scaler = preprocessing.MinMaxScaler()
X = scaler.fit_transform(X)
In [37]:
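The empty cells here presumably split the data and imported the estimators used below. A sketch, assuming a 75/25 split (the reported accuracies are all multiples of 1/1106, consistent with a 1106-row test set); the random_state is an assumption:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Assumed split; the original seed was not captured.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)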
In [38]:
In [41]:
In [39]:
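The output below shows a freshly fitted baseline logistic regression; a sketch of the cell that likely produced it:

# Baseline model with default hyperparameters (assumed reconstruction).
lr = LogisticRegression()
lr.fit(X_train, y_train)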
Out[39]:
LogisticRegression()
In [40]:
y_pred = lr.predict(X_test)
In [44]:
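This and the later "Accuracy:" cells were most likely produced with accuracy_score; a sketch (the exact print formatting is an assumption):

from sklearn.metrics import accuracy_score

print('Accuracy:', accuracy_score(y_test, y_pred))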
Accuracy: 0.7350813743218807
In [45]:
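Presumably the baseline decision tree was fit here, matching the output below; a sketch under that assumption:

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)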
Out[45]:
DecisionTreeClassifier()
In [46]:
y_pred = dt.predict(X_test)
In [47]:
Accuracy: 0.6754068716094033
In [48]:
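Likewise for the baseline random forest shown below; a sketch:

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)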
Out[48]:
RandomForestClassifier()
In [49]:
y_pred = rfc.predict(X_test)
In [50]:
Accuracy: 0.7486437613019892
In [51]:
In [52]:
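The helper below relies on GridSearchCV and a folds value defined in cells that were not captured; a sketch, assuming 5-fold cross-validation:

from sklearn.model_selection import GridSearchCV

# Assumption: an integer fold count; GridSearchCV stratifies automatically for classifiers.
folds = 5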
In [53]:
def grid_search(model, folds, params, scoring):
    grid_search = GridSearchCV(model,
                               cv=folds,
                               param_grid=params,
                               scoring=scoring,
                               n_jobs=-1, verbose=1)
    return grid_search
In [54]:
def print_best_score_params(model):
    print("Best Score: ", model.best_score_)
    print("Best Hyperparameters: ", model.best_params_)
In [55]:
log_reg = LogisticRegression()
log_params = {'C': [0.01, 1, 10],
              'penalty': ['l1', 'l2'],
              'solver': ['liblinear', 'newton-cg', 'saga']
              }
grid_search_log = grid_search(log_reg, folds, log_params, scoring=None)
grid_search_log.fit(X_train, y_train)
print_best_score_params(grid_search_log)
In [58]:
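Judging from the output below, the logistic regression was refit with the best hyperparameters found by the grid search; a sketch under that assumption:

lr = LogisticRegression(C=10, penalty='l1', solver='saga')
lr.fit(X_train, y_train)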
Out[58]:
LogisticRegression(C=10, penalty='l1', solver='saga')
In [59]:
y_pred = lr.predict(X_test)
In [60]:
Accuracy: 0.740506329113924
In [61]:
dtc = DecisionTreeClassifier(random_state=40)
dtc_params = {
    'max_depth': [5, 10, 20, 30],
    'min_samples_leaf': [5, 10, 20, 30]
}
grid_search_dtc = grid_search(dtc, folds, dtc_params, scoring='roc_auc_ovr')
grid_search_dtc.fit(X_train, y_train)
print_best_score_params(grid_search_dtc)
In [62]:
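The decision tree was presumably refit with the best parameters reported below; a sketch:

dt = DecisionTreeClassifier(max_depth=10, min_samples_leaf=30)
dt.fit(X_train, y_train)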
Out[62]:
DecisionTreeClassifier(max_depth=10, min_samples_leaf=30)
In [63]:
y_pred = dt.predict(X_test)
In [64]:
Accuracy: 0.7368896925858951
In [65]:
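The cell that built grid_search_rfc was not captured; a sketch mirroring the decision-tree search above. The parameter grid is an assumption, apart from the values visible in the best estimator below:

rfc = RandomForestClassifier(random_state=40)
rfc_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 20, 30],
    'min_samples_leaf': [5, 10, 20, 30]
}
grid_search_rfc = grid_search(rfc, folds, rfc_params, scoring='roc_auc_ovr')
grid_search_rfc.fit(X_train, y_train)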
In [66]:
print_best_score_params(grid_search_rfc)
In [67]:
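The random forest was presumably refit with the best parameters reported below; a sketch:

rfc = RandomForestClassifier(max_depth=20, min_samples_leaf=5, n_estimators=200)
rfc.fit(X_train, y_train)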
Out[67]:
RandomForestClassifier(max_depth=20, min_samples_leaf=5, n_estimators=200)
In [68]:
y_pred = rfc.predict(X_test)
In [69]:
Accuracy: 0.759493670886076