Assignment 3
Assignment 3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [6]: data.head()  # preview the first rows of the frame
In [8]: data.tail()  # preview the last rows of the frame
In [10]: data.shape  # (n_rows, n_columns)
Out[10]: (400, 9)
In [12]: data.columns  # column labels; note that 'LOR ' and 'Chance of Admit ' carry a trailing space
Out[12]: Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
dtype='object')
In [16]: data  # NOTE(review): renders the whole frame; the cell that modified `data` before this display is not in this export
In [20]: data  # NOTE(review): second full-frame display; data.head() would show the change with far less output
Missing values:
In [24]: data.info()  # dtypes and non-null counts per column; serves as the missing-value check here
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 GRE Score 400 non-null int64
1 TOEFL Score 400 non-null int64
2 University Rating 400 non-null int64
3 SOP 400 non-null float64
4 LOR 400 non-null float64
5 CGPA 400 non-null float64
6 Research 400 non-null int64
7 Chance of Admit 400 non-null int64
dtypes: float64(3), int64(5)
memory usage: 25.1 KB
In [26]: data.corr()  # pairwise correlation between columns (pandas default method is Pearson)
Out[26]:
                   GRE Score  TOEFL Score  University Rating       SOP       LOR      CGPA  Research  Chance of Admit
GRE Score           1.000000     0.835977           0.668976  0.612831  0.557555  0.833060  0.580391         0.390875
TOEFL Score         0.835977     1.000000           0.695590  0.657981  0.567721  0.828417  0.489858         0.393121
University Rating   0.668976     0.695590           1.000000  0.734523  0.660123  0.746479  0.447783         0.279316
Chance of Admit     0.390875     0.393121           0.279316  0.285939  0.353341  0.455949  0.216193         1.000000
In [28]: plt.figure(figsize=(6,6))
sns.heatmap(data.corr(), annot=True, cmap='Oranges')
plt.show()
In [38]: data['SOP'].value_counts().plot(kind='pie',figsize=(5,5),autopct='%1.1f%%')
plt.title("SOP Point Chart")
plt.show()
In [50]: sns.pairplot(data)  # grid of pairwise plots over the columns of `data`
In [54]: X.nunique()  # distinct-value count per column of X (X is built in a cell not shown in this export)
(320, 7) (320,)
(80, 7) (80,)
Out[58]: DecisionTreeClassifier()
print(classification_report(y_test, y_test_tree))  # y_test_tree is defined in a cell not shown; presumably the tree's predictions on X_test — confirm
accuracy 0.86 80
macro avg 0.68 0.66 0.67 80
weighted avg 0.86 0.86 0.86 80
In [66]: plt.barh(X.columns,tree.feature_importances_)
plt.title("Feature Importances while constructing Tree")
plt.show()
In [70]: training_accuracy = []
test_accuracy = []
# try max_depth from 1 to 15
depth = range(1,16)
for n in depth:
tree_test = DecisionTreeClassifier(max_depth=n)
tree_test.fit(X_train, y_train)
# record training set accuracy
training_accuracy.append(tree_test.score(X_train, y_train))
# record generalization accuracy
test_accuracy.append(tree_test.score(X_test, y_test))
In [ ]: