Brain Stroke Prediction Using ML - Jupyter Notebook
In [2]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.rcParams['figure.figsize'] = (5, 5)
In [5]:
data=pd.read_csv(r"C:\Users\91809\OneDrive\Desktop\healthcare-dataset-stroke-data.csv")
In [6]:
data
Out[6]:
[truncated preview: 5110 rows × 12 columns — id, gender, age, hypertension, heart_disease, ever_married, work_type, Residence_type, avg_glucose_level, bmi, smoking_status, stroke]
# Exploratory analysis
In [7]:
data.shape
Out[7]:
(5110, 12)
In [8]:
data.info()
[prints the column summary: RangeIndex of 5110 entries, 12 columns; bmi has 4909 non-null values, every other column 5110 non-null]
In [9]:
data.isnull().sum()
Out[9]:
id 0
gender 0
age 0
hypertension 0
heart_disease 0
ever_married 0
work_type 0
Residence_type 0
avg_glucose_level 0
bmi 201
smoking_status 0
stroke 0
dtype: int64
In [10]:
data['bmi'].value_counts()
Out[10]:
28.7 41
28.4 38
26.7 37
27.6 37
26.1 37
..
48.7 1
49.2 1
51.0 1
49.4 1
14.9 1
Name: bmi, Length: 418, dtype: int64
In [11]:
data['bmi'].describe()
Out[11]:
count 4909.000000
mean 28.893237
std 7.854067
min 10.300000
25% 23.500000
50% 28.100000
75% 33.100000
max 97.600000
Name: bmi, dtype: float64
In [12]:
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())
In [13]:
data['bmi'].describe()
Out[13]:
count 5110.000000
mean 28.893237
std 7.698018
min 10.300000
25% 23.800000
50% 28.400000
75% 32.800000
max 97.600000
Name: bmi, dtype: float64
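The earlier describe() of the raw column showed bmi is right-skewed (mean 28.9 against a median of 28.1, with a maximum of 97.6), so a mean fill gets pulled slightly upward by the tail. A median fill is a common, more robust alternative; a one-line sketch, not what the notebook ran:

# Alternative imputation: the median is robust to the heavy right tail.
data['bmi'] = data['bmi'].fillna(data['bmi'].median())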
In [14]:
data.drop('id',axis=1,inplace=True)
In [15]:
data
Out[15]:
[truncated preview: 5110 rows × 11 columns — the same columns with id removed]
In [16]:
# Outlier removal
from matplotlib.pyplot import figure
figure(num=None, figsize=(8, 6), dpi=800, facecolor='w', edgecolor='k')
Out[16]:
<Figure size 6400x4800 with 0 Axes>
In [17]:
data.plot(kind='box')
plt.show()
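The boxplots show heavy right tails for avg_glucose_level and bmi, but no rows are actually dropped anywhere in this notebook. If removal were wanted, the usual IQR rule is a minimal option; a sketch assuming we filter on bmi alone (the 1.5 multiplier and the column choice are conventional defaults, not from the original):

# Keep rows whose bmi lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
q1, q3 = data['bmi'].quantile([0.25, 0.75])
iqr = q3 - q1
data_trimmed = data[data['bmi'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]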
In [18]:
data.head()
Out[18]:
[truncated preview: first five rows of the DataFrame]
In [19]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
In [20]:
gender=enc.fit_transform(data['gender'])
In [21]:
smoking_status=enc.fit_transform(data['smoking_status'])
In [22]:
work_type=enc.fit_transform(data['work_type'])
Residence_type=enc.fit_transform(data['Residence_type'])
ever_married=enc.fit_transform(data['ever_married'])
In [23]:
data['work_type']=work_type
In [24]:
data['ever_married']=ever_married
data['Residence_type']=Residence_type
data['smoking_status']=smoking_status
data['gender']=gender
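One caveat about the cells above: the single enc instance is re-fit for every column, so after this block it only remembers the classes of the last column fitted, and inverse_transform cannot recover the earlier mappings. Keeping one encoder per column avoids that; a sketch of the alternative (the encoders dict is illustrative, not in the original):

from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column keeps every mapping recoverable.
encoders = {}
for col in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']:
    encoders[col] = LabelEncoder()
    data[col] = encoders[col].fit_transform(data[col])
# e.g. encoders['gender'].inverse_transform([0, 1]) maps codes back to labels.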
In [25]:
data
Out[25]:
[truncated preview: 5110 rows × 11 columns — gender, ever_married, work_type, Residence_type and smoking_status now hold integer codes]
In [26]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gender 5110 non-null int32
1 age 5110 non-null float64
2 hypertension 5110 non-null int64
3 heart_disease 5110 non-null int64
4 ever_married 5110 non-null int32
5 work_type 5110 non-null int32
6 Residence_type 5110 non-null int32
7 avg_glucose_level 5110 non-null float64
8 bmi 5110 non-null float64
9 smoking_status 5110 non-null int32
10 stroke 5110 non-null int64
dtypes: float64(3), int32(5), int64(3)
memory usage: 339.5 KB
In [27]:
X=data.drop('stroke',axis=1)
In [28]:
X.head()
Out[28]:
[truncated preview: first five rows of the 10 feature columns]
In [29]:
Y=data['stroke']
In [30]:
Y
Out[30]:
0 1
1 1
2 1
3 1
4 1
..
5105 0
5106 0
5107 0
5108 0
5109 0
Name: stroke, Length: 5110, dtype: int64
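Before splitting, it is worth checking the class balance: stroke cases are only a small minority of this dataset (roughly 5% of rows), so a model that always predicts 0 already reaches about 0.95 accuracy. A quick check:

# Share of each class; the positive (stroke) class is a small minority.
Y.value_counts(normalize=True)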
In [68]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
In [69]:
X_train
Out[69]:
[truncated preview: 4088 rows × 10 columns]
In [70]:
Y_train
Out[70]:
2285 0
4733 0
3905 0
4700 0
4939 0
..
1180 0
3441 0
1344 0
4623 0
1289 0
Name: stroke, Length: 4088, dtype: int64
In [71]:
X_test
Out[71]:
[truncated preview: 1022 rows × 10 columns]
In [72]:
Y_test
Out[72]:
2413 0
1141 0
146 1
3883 0
1044 0
..
2261 0
4712 0
4971 0
2224 0
4825 0
Name: stroke, Length: 1022, dtype: int64
In [73]:
data.describe()
Out[73]:
[summary statistics (count, mean, std, min, quartiles, max) for the 11 columns]
In [74]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
In [75]:
X_train_std=std.fit_transform(X_train)
X_test_std=std.transform(X_test)
In [39]:
import pickle
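pickle is imported here, but the cell that used it did not survive the export. A minimal sketch of the usual pattern, persisting the fitted scaler and a trained model; the filenames are placeholders, not from the original:

# Save the fitted scaler and (once trained) a model for later reuse.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(std, f)
with open('model.pkl', 'wb') as f:
    pickle.dump(rf, f)  # rf is the random forest fitted later in the notebook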
In [116]:
X_train_std
Out[116]:
[standardized training-feature array, shape (4088, 10)]
In [42]:
X_test_std
Out[42]:
[standardized test-feature array, shape (1022, 10)]
In [43]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
In [44]:
dt.fit(X_train_std,Y_train)
Out[44]:
DecisionTreeClassifier()
In [45]:
dt.feature_importances_
Out[45]:
[array of 10 feature-importance values]
In [46]:
X_train.columns
Out[46]:
Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status'],
      dtype='object')
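The two outputs above pair index-for-index: feature_importances_[i] belongs to X_train.columns[i]. A short sketch that joins them for readability (not a cell from the original):

# Pair each importance with its feature name, largest first.
pd.Series(dt.feature_importances_, index=X_train.columns).sort_values(ascending=False)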
In [47]:
Y_pred=dt.predict(X_test_std)
In [48]:
Y_pred
Out[48]:
[array of 1022 predicted labels (0 or 1)]
In [49]:
from sklearn.metrics import accuracy_score
In [50]:
ac_dt=accuracy_score(Y_test,Y_pred)
In [51]:
ac_dt
Out[51]:
0.901174168297456
In [52]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
In [53]:
lr.fit(X_train_std,Y_train)
Out[53]:
LogisticRegression()
In [54]:
Y_pred_lr=lr.predict(X_test_std)
In [55]:
Y_pred_lr
Out[55]:
[array of 1022 predicted labels (0 or 1)]
In [56]:
ac_lr=accuracy_score(Y_test,Y_pred_lr)
In [57]:
ac_lr
Out[57]:
0.9383561643835616
In [76]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
In [77]:
knn.fit(X_train_std,Y_train)
Out[77]:
KNeighborsClassifier()
In [78]:
Y_pred=knn.predict(X_test_std)
In [79]:
ac_knn=accuracy_score(Y_test,Y_pred)
In [80]:
ac_knn
Out[80]:
0.9344422700587084
In [81]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
In [82]:
rf.fit(X_train_std,Y_train)
Out[82]:
RandomForestClassifier()
In [83]:
Y_pred=rf.predict(X_test_std)
In [84]:
ac_rf=accuracy_score(Y_test,Y_pred)
In [85]:
ac_rf
Out[85]:
0.9363992172211351
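All four test accuracies sit in a narrow 0.90-0.94 band, which in this imbalanced dataset is close to what always predicting "no stroke" would score, so accuracy alone says little here. Per-class metrics are more informative; a sketch for the random-forest predictions (not a cell from the original):

from sklearn.metrics import classification_report, confusion_matrix

# Accuracy hides minority-class performance; check per-class precision/recall.
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))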
In [110]:
models_params = {
    'rand': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100, 120, 130]
        }
    },
    'logi': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'C': [1, 5, 10]
        }
    },
    'knn': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [5, 6, 8]
        }
    }
}
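The cell that consumes models_params did not survive the export. A minimal sketch of the usual pattern, assuming GridSearchCV with 5-fold cross-validation on the standardized training data (cv=5, the accuracy scoring, and the results table are illustrative choices, not recovered from the original):

from sklearn.model_selection import GridSearchCV

# Grid-search each model over its parameter grid and collect the best results.
scores = []
for name, mp in models_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, scoring='accuracy')
    clf.fit(X_train_std, Y_train)
    scores.append({'model': name,
                   'best_score': clf.best_score_,
                   'best_params': clf.best_params_})

pd.DataFrame(scores)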