# Logistic Regression
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sn
import statsmodels.api as sm
%matplotlib inline
## 2. Importing Dataset
In [2]:
df = pd.read_csv('heart_disease.csv')
df.drop(['education'],axis=1,inplace=True)
df.rename(columns={'male':'sex_male'},inplace=True)
df.head()
Out[2]:
0 1 39 0 0.0 0.0 0 0
1 0 46 0 0.0 0.0 0 0
2 1 48 1 20.0 0.0 0 0
3 0 61 1 30.0 0.0 0 1
4 0 46 1 23.0 0.0 0 0
df.isnull().sum()
Out[3]:
sex_male 0
age 0
currentSmoker 0
cigsPerDay 29
BPMeds 53
prevalentStroke 0
prevalentHyp 0
diabetes 0
totChol 50
sysBP 0
diaBP 0
BMI 19
heartRate 1
glucose 388
TenYearCHD 0
dtype: int64
In [4]:
count = 0
for i in df.isnull().sum(axis=1):
if i > 0:
count = count+1
df.dropna(axis=0, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
## 5. Data Visualization
In [6]:
fig=plt.figure(figsize=(20,20))
ax=fig.add_subplot(rows,cols,i+1)
dataframe[feature].hist(bins=20,ax=ax,facecolor='red')
ax.set_title(feature+"Distribution", color='blue')
fig.tight_layout()
plt.show()
draw_histograms(df, df.columns, 6, 3)
In [7]:
sn.countplot(x='TenYearCHD', data=df)
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x224600f92e0>
In [8]:
df_constant = add_constant(df)
df_constant.head()
Out[8]:
cols = df_constant.columns[:-1]
result = model.fit()
result.summary()
Iterations 7
Out[9]:
while len(col_list)>0 :
model = sm.Logit(dep_var,data_frame[col_list])
result = model.fit(disp=0)
largest_pvalue = round(result.pvalues,3).nlargest(1)
if largest_pvalue[0]<(0.05):
return result
break
else:
col_list = col_list.drop(largest_pvalue.index)
result.summary()
Out[10]:
params = np.exp(result.params)
conf = np.exp(result.conf_int())
conf['OR'] = params
pvalue = round(result.pvalues,3)
conf['pvalue'] = pvalue
print((conf))
In [15]:
import sklearn
new_features = df[['age','sex_male','cigsPerDay','totChol','sysBP','glucose','TenYearC
HD']]
x = new_features.iloc[:,:-1]
y = new_features.iloc[:,-1]
In [ ]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
In [18]:
print("Model Accuracy:")
sklearn.metrics.accuracy_score(y_test,y_pred)
Model Accuracy:
Out[18]:
0.8706666666666667