Eda-Ml-Decision-Tree - Ipynb - Colab
Eda-Ml-Decision-Tree - Ipynb - Colab
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Sav
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/heart-disease/heart_disease.csv
keyboard_arrow_down DATASET:
This dataset contains various health indicators and risk factors related to heart disease. Parameters such as age, gender, blood pressure,
cholesterol levels, smoking habits, and exercise patterns have been collected to analyze heart disease risk and contribute to health research.
The dataset can be used by healthcare professionals, researchers, and data analysts to examine trends related to heart disease, identify risk
factors, and perform various health-related analyses.
Columns:
Family Heart Disease: Whether there is a family history of heart disease (Yes or No).
High Blood Pressure: Whether the individual has high blood pressure (Yes or No).
Low HDL Cholesterol: Whether the individual has low HDL cholesterol (Yes or No).
High LDL Cholesterol: Whether the individual has high LDL cholesterol (Yes or No).
Alcohol Consumption: The individual's alcohol consumption level (None, Low, Medium, High).
Sugar Consumption: The individual's sugar consumption level (Low, Medium, High).
Homocysteine Level: The individual's homocysteine level (an amino acid that affects blood vessel health).
Heart Disease Status: The individual's heart disease status (Yes or No).
import pandas as pd
import plotly.express as px
import pprint
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
# Load the heart-disease table from the Kaggle input directory and preview it.
# NOTE(review): 'Alcohol Consumption' is documented with a literal "None" level, and
# pandas' default NA parsing may read that string as NaN - confirm before treating
# those rows as missing (see the `keep_default_na` / `na_values` options of read_csv).
df = pd.read_csv('/kaggle/input/heart-disease/heart_disease.csv')
df.head()
0 56.0 Male 153.0 155.0 High Yes Yes No 24.991591 Yes ... No High Med
1 69.0 Female 146.0 286.0 High No Yes Yes 25.221799 No ... No Medium H
3 32.0 Female 122.0 293.0 High Yes Yes No 24.130477 Yes ... Yes Low H
4 60.0 Male 166.0 242.0 Low Yes Yes Yes 20.486289 Yes ... No Low H
5 rows × 21 columns
df.shape
(10000, 21)
keyboard_arrow_down EDA
df.isnull().sum()
Age 29
Gender 19
Blood Pressure 19
Cholesterol Level 30
Exercise Habits 25
Smoking 25
Family Heart Disease 21
Diabetes 30
BMI 22
High Blood Pressure 26
Low HDL Cholesterol 25
High LDL Cholesterol 26
Alcohol Consumption 2586
Stress Level 22
Sleep Hours 25
Sugar Consumption 30
Triglyceride Level 26
Fasting Blood Sugar 22
CRP Level 26
Homocysteine Level 20
Heart Disease Status 0
dtype: int64
df.dtypes
Age float64
Gender object
Blood Pressure float64
Cholesterol Level float64
Exercise Habits object
Smoking object
Family Heart Disease object
Diabetes object
BMI float64
High Blood Pressure object
Low HDL Cholesterol object
High LDL Cholesterol object
Alcohol Consumption object
Stress Level object
Sleep Hours float64
Sugar Consumption object
Triglyceride Level float64
Fasting Blood Sugar float64
CRP Level float64
Homocysteine Level float64
Heart Disease Status object
dtype: object
df['Age'].describe()
count 9971.000000
mean 49.296259
std 18.193970
min 18.000000
25% 34.000000
50% 49.000000
75% 65.000000
max 80.000000
Name: Age, dtype: float64
# Loop over every column of `df`. The visible code draws a pie chart when a column
# has at most 10 distinct values and otherwise builds a value-count table.
# NOTE(review): indentation was lost in this export, so the nesting of the
# branches below is inferred - confirm against the original notebook.
lists = df.columns
lists
dataframe = df
# Names of columns that could not be displayed; nothing in the visible code appends to it.
invalid_column_for_displaying = []
for i in lists:
name = f'{i}'
data = dataframe[name]
# NOTE(review): `.shape` is a tuple while `.count()` is a number, so this inequality
# looks like it is always true - `.shape[0]` was presumably intended; confirm.
if data.count() != data.value_counts().shape:
if data.isnull().sum() != 0 :
# Make missing entries visible: 0 for float64/int64 columns, 'missing' otherwise.
if data.dtype == 'float64' or data.dtype =='int64':
data = data.fillna(0)
else:
data= data.fillna('missing')
if data.unique().shape[0] <=10:
fig = px.pie(data,name,color=name,title=f'{name} Distribution')
fig.show()
else:
# Tabulate how often each distinct value occurs, with columns [name, 'count'].
counts = data.value_counts().reset_index()
counts.columns =[name,'count']
print(f"List of columns in the data that couldn't be printed because there are an equal number of unique values as to the shape of the c
Age Distribution
182
182
174
173
173
172
169
168
168
167
166
162
162
161
161
160
160
160
159
157
157
150
156
156
155
154
154
154
152
151
151
149
149
149
149
148
147
145
142
142
139
132
128
121
count
100
50
29
0
0 10 20 30 40 50 60
Age
Gender Distribution
49.8% 50%
0.19%
214
200
178 178
177
174
172
171
169 169
162
161 161
140
136
count
100
50
19
0
0 20 40 60 80 100 120
Blood Pressure
90
84
83
80
81
78 78
77 77
76
75
73 73 73 73
72 72
70
71
70
69 69 69
68 68 68 68 68
67 67 67 67 67 67 67 67
66 66 66 66 66 66 66
65 65 65 65 65 65
64 64
63 63 63 63
62 62 62 62
60
61 61 61 61
60 60
59 59
58 58
57 57
56 56 56
count
52
50
51
49
47
46
40
30 30
20
10
0
0 50 100 150 200
Cholesterol Level
33.3% 33.7%
32.7% 0.25%
48.5%
51.2%
0.25%
49.8% 50%
0.21%
Diabetes Distribution
49.5% 50.2%
0.3%
High Blood Pressure Distribution
49.5% 50.2%
0.26%
49.8% 50%
0.25%
49.4% 50.4%
0.26%
Alcohol Consumption Distribution
25% 25.9%
24.9% 24.3%
33.2% 33.9%
32.7% 0.22%
33.3% 33.9%
32.5% 0.3%
Triglyceride Level Distribution
50
47 47
46
45 45
43 43 43 43
42 42 42
41 41 41 41 41 41
40 40
39 39
40 40 40 40
39 39
40
39
40 40 40 40
39
38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
37 37 37 37 37 37 37 37 37 37 37 37
36 36 36 36 36 36 36
35 35 35 35 35 35 35 35 35 35 35 35 35
34 34 34 34 34 34 34 34 34 34 34 34 34 34
33 33 33 33 33 33 33 33 33 33 33
32 32 32 32 32 32 32 32 32 32 32 32
31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31
30 29
30
29
30
29 29
30
29
30
29
30
29 29
30 30 30
29
30 30
29
30
29
28 28 28 28 28 28 28
count
27 27 27 27 27 27 27
26 26 26 26 26 26 26 26
25 25 25 25
24 24 24 24 24
23 23 23 23 23 23
22 22 22 22 22
21 21
20 20
10
0
0 50 100 150 200 250 30
Triglyceride Level
151
147
140
141
139
138
134
134
133
132
131
131
130
129
128
128
127
127
120
124
123
123
122
122
121
120
119
118
118
117
117
116
116
116
116
115
115
114
111
110
100
104
100
98
count
80
60
40
20
22
0
0 20 40 60 80 100 12
20%
80%
List of columns in the data that couldn't be printed because there are an equal number of unique values as to the shape of the co
['BMI', 'Sleep Hours', 'CRP Level', 'Homocysteine Level']
Based on the analysis, the columns in the list 'invalid_column_for_displaying' exhibit as many unique values as there are rows, so a
count-based plot shows nothing useful for them. On that basis they are treated as not providing useful information
for predicting the outcome of heart disease and are removed. Note, however, that these are continuous measurements, for which (near-)unique values are expected; uniqueness alone does not show that a column lacks predictive signal.
'BMI',
'Sleep Hours',
'CRP Level',
'Homocysteine Level'
Removing the invalid columns identified while plotting each column
# Drop the columns flagged during plotting, then split the remaining columns
# into categorical (object dtype) and numerical groups.
df = df.drop(columns=invalid_column_for_displaying)
df.columns
# Start from empty lists so the names exist and re-running the cell does not
# append duplicates.
columns_categorical = []
columns_numerical = []
for column_name in df.columns:
    if df[column_name].dtype == object:
        columns_categorical.append(column_name)
    else:
        columns_numerical.append(column_name)
print(columns_categorical)
print(columns_numerical)
['Gender', 'Exercise Habits', 'Smoking', 'Family Heart Disease', 'Diabetes', 'High Blood Pressure', 'Low HDL Cholesterol', 'High LDL
['Age', 'Blood Pressure', 'Cholesterol Level', 'Triglyceride Level', 'Fasting Blood Sugar']
df['Gender'].unique()
df.shape
(10000, 17)
df.isnull().sum()
Age 29
Gender 19
Blood Pressure 19
Cholesterol Level 30
Exercise Habits 25
Smoking 25
Family Heart Disease 21
Diabetes 30
High Blood Pressure 26
Low HDL Cholesterol 25
High LDL Cholesterol 26
Alcohol Consumption 2586
Stress Level 22
Sugar Consumption 30
Triglyceride Level 26
Fasting Blood Sugar 22
Heart Disease Status 0
dtype: int64
# Draw a kernel-density plot for each numerical column and report its skewness.
# The loop reads from a copy of `df`.
dataframe = df.copy()
for i in columns_numerical:
name = f'{i}'
data = dataframe[name]
plt.figure(figsize=(8, 5))
sns.kdeplot(data, color="skyblue")
plt.show()
# NOTE(review): `skewness` is never assigned in the visible code - presumably it was
# computed from `data` (e.g. with the imported `scipy.stats.skew`) on a line lost from
# this export; confirm. An exact `== 0` test on a float will almost never be true.
if skewness == 0:
print(f'{i} : {skewness:.2f} - Normally distributed')
use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
Cholesterol Level : -0.01 - Left skew (Negative skew)
/usr/local/lib/python3.10/dist-packages/seaborn/_oldcore.py:1119: FutureWarning:
use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
Age:
skewness = -0.01
missing = 0.29 %
imputer = median
Reason: The data is slightly left-skewed, and the median is more robust to outliers, ensuring a more accurate central tendency.
Blood Pressure:
skewness = 0.01
missing = 0.19 %
imputer = median
Reason: Since the skewness is close to 0, the data is nearly symmetrical, making the median a reliable choice that minimizes
distortion from outliers.
Cholesterol Level:
skewness = -0.01
missing = 0.30 %
imputer = median
Reason: A near-normal distribution but slightly skewed; using the median prevents the impact of extreme cholesterol values.
Triglyceride Level:
skewness = 0.01
missing = 0.26 %
imputer = median
Reason: The data is slightly right-skewed; the median is less affected by extreme values, providing a stable imputation.
Fasting Blood Sugar:
skewness = -0.01
missing = 0.22 %
imputer = median
Reason: Given the slight skewness, the median helps avoid potential biases introduced by extreme blood sugar levels.
columns_categorical
['Gender',
'Exercise Habits',
'Smoking',
'Family Heart Disease',
'Diabetes',
'High Blood Pressure',
'Low HDL Cholesterol',
'High LDL Cholesterol',
'Alcohol Consumption',
'Stress Level',
'Sugar Consumption',
'Heart Disease Status']
df[columns_categorical].isnull().sum()
Gender 19
Exercise Habits 25
Smoking 25
Family Heart Disease 21
Diabetes 30
High Blood Pressure 26
Low HDL Cholesterol 25
High LDL Cholesterol 26
Alcohol Consumption 2586
Stress Level 22
Sugar Consumption 30
Heart Disease Status 0
dtype: int64
Based on the results for the missing values for the categorical columns, the following decision has been made:
Most frequent:
['Gender','Exercise Habits','Smoking','Family Heart Disease','Diabetes','High Blood Pressure','Low HDL Cholesterol','High LDL
Cholesterol','Stress Level','Sugar Consumption']
No action:
Remove column:
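Alcohol Consumption, with 25.86 % missing data: a very high missing rate. Missing values might indicate non-drinkers or data-entry
issues, so dropping the column might be best. Note that the documented levels include "None", which pandas may parse as a missing value by default, so many of these entries are probably the "None" category rather than truly absent data; this is worth verifying before the column is dropped.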
['Age',
'Blood Pressure',
'Cholesterol Level',
'Triglyceride Level',
'Fasting Blood Sugar']
dataframe=df
dataframe
/usr/local/lib/python3.10/dist-packages/pandas/io/formats/format.py:1458: RuntimeWarning:
/usr/local/lib/python3.10/dist-packages/pandas/io/formats/format.py:1459: RuntimeWarning:
/usr/local/lib/python3.10/dist-packages/pandas/io/formats/format.py:1459: RuntimeWarning:
Family High
Blood Cholesterol Exercise Low HDL High LDL Alcohol Str
Age Gender Smoking Heart Diabetes Blood
Pressure Level Habits Cholesterol Cholesterol Consumption Le
Disease Pressure
0 56.0 Male 153.0 155.0 High Yes Yes No Yes Yes No High Med
3 32.0 Female 122.0 293.0 High Yes Yes No Yes No Yes Low H
4 60.0 Male 166.0 242.0 Low Yes Yes Yes Yes No No Low H
... ... ... ... ... ... ... ... ... ... ... ... ...
9995 25.0 Female 136.0 243.0 Medium Yes No No Yes No Yes Medium H
9997 73.0 Male 152.0 201.0 High Yes No Yes No Yes Yes NaN
9998 23.0 Male 142.0 299.0 Low Yes No Yes Yes No Yes Medium H
9999 38.0 Female 128.0 193.0 Medium Yes Yes Yes No Yes Yes High Med
# Impute missing values in the numerical columns with each column's median.
# Remember the incoming column order so it can be restored after the transform.
name_columns_num = dataframe.columns
# `fill_value` is only consulted for strategy='constant', so it is omitted here.
imputer_median = SimpleImputer(strategy='median')
numeric_features = [
    'Age',
    'Blood Pressure',
    'Cholesterol Level',
    'Triglyceride Level',
    'Fasting Blood Sugar',
]
# One transformer over all numeric columns replaces five identical ones: the
# median is still computed per column, and the output keeps these columns
# first, in this order, followed by the untouched remainder columns.
column_trans = ColumnTransformer(
    transformers=[
        ('impute_numeric', imputer_median, numeric_features),
    ],
    remainder='passthrough',
)
df_transformed = column_trans.fit_transform(dataframe)
print(df_transformed)
new_name_columns = columns_numerical+columns_categorical
new_name_columns
['Age',
'Blood Pressure',
'Cholesterol Level',
'Triglyceride Level',
'Fasting Blood Sugar',
'Gender',
'Exercise Habits',
'Smoking',
'Family Heart Disease',
'Diabetes',
'High Blood Pressure',
'Low HDL Cholesterol',
'High LDL Cholesterol',
'Alcohol Consumption',
'Stress Level',
'Sugar Consumption',
'Heart Disease Status']
df_num_imputed = pd.DataFrame(df_transformed,columns=new_name_columns)
df_num_imputed.head()
0 56.0 153.0 155.0 342.0 120.0 Male High Yes Yes No Yes Yes N
1 69.0 146.0 286.0 133.0 157.0 Female High No Yes Yes No Yes N
3 32.0 122.0 293.0 293.0 94.0 Female High Yes Yes No Yes No Y
4 60.0 166.0 242.0 263.0 154.0 Male Low Yes Yes Yes Yes No N
dataframe = df_num_imputed[name_columns_num]
dataframe
Family High
Blood Cholesterol Exercise Low HDL High LDL Alcohol Str
Age Gender Smoking Heart Diabetes Blood
Pressure Level Habits Cholesterol Cholesterol Consumption Le
Disease Pressure
0 56.0 Male 153.0 155.0 High Yes Yes No Yes Yes No High Med
3 32.0 Female 122.0 293.0 High Yes Yes No Yes No Yes Low H
4 60.0 Male 166.0 242.0 Low Yes Yes Yes Yes No No Low H
... ... ... ... ... ... ... ... ... ... ... ... ...
9995 25.0 Female 136.0 243.0 Medium Yes No No Yes No Yes Medium H
9997 73.0 Male 152.0 201.0 High Yes No Yes No Yes Yes NaN
9998 23.0 Male 142.0 299.0 Low Yes No Yes Yes No Yes Medium H
9999 38.0 Female 128.0 193.0 Medium Yes Yes Yes No Yes Yes High Med
Age 0
Blood Pressure 0
Cholesterol Level 0
Triglyceride Level 0
Fasting Blood Sugar 0
dtype: int64
keyboard_arrow_down categorical
# Remove 'Alcohol Consumption' from both the table and the categorical column list.
cat_columnn = 'Alcohol Consumption'
dataframe = dataframe.drop(columns=cat_columnn)
columns_categorical = [col for col in columns_categorical if col != cat_columnn]
print(columns_categorical)
['Gender', 'Exercise Habits', 'Smoking', 'Family Heart Disease', 'Diabetes', 'High Blood Pressure', 'Low HDL Cholesterol', 'High LDL
dataframe[columns_categorical].isnull().sum()
Gender 19
Exercise Habits 25
Smoking 25
Family Heart Disease 21
Diabetes 30
High Blood Pressure 26
Low HDL Cholesterol 25
High LDL Cholesterol 26
Stress Level 22
Sugar Consumption 30
dtype: int64
# Impute missing values in the categorical columns with each column's most
# frequent value.
# `fill_value` is only consulted for strategy='constant', so it is omitted here.
imputer_most_freq = SimpleImputer(strategy='most_frequent')
categorical_features = [
    'Gender',
    'Exercise Habits',
    'Smoking',
    'Family Heart Disease',
    'Diabetes',
    'High Blood Pressure',
    'Low HDL Cholesterol',
    'High LDL Cholesterol',
    'Stress Level',
    'Sugar Consumption',
]
# One transformer over all categorical columns replaces ten identical ones: the
# mode is still computed per column, and the output keeps these columns first,
# in this order, followed by the untouched remainder columns.
column_trans_cat = ColumnTransformer(
    transformers=[
        ('impute_categorical', imputer_most_freq, categorical_features),
    ],
    remainder='passthrough',
)
# Labels for the transformed array: categorical names, then numerical names,
# then `feature_column`.
# NOTE(review): `feature_column` is defined outside the visible code -
# presumably the target column name; confirm.
new_name_columns = columns_categorical + columns_numerical
new_name_columns.append(feature_column)
new_name_columns
['Gender',
'Exercise Habits',
'Smoking',
'Family Heart Disease',
'Diabetes',
'High Blood Pressure',
'Low HDL Cholesterol',
'High LDL Cholesterol',
'Stress Level',
'Sugar Consumption',
'Age',
'Blood Pressure',
'Cholesterol Level',
'Triglyceride Level',
'Fasting Blood Sugar',
'Heart Disease Status']
# Fit the categorical imputer on `dataframe` and wrap the resulting array in a
# DataFrame labelled with `new_name_columns`.
# NOTE(review): the transformed array mixes strings and numbers, so the numeric
# columns presumably come back with object dtype - confirm, and cast them if
# later steps need floats.
df_transformed=column_trans_cat.fit_transform(dataframe)
df_cat_imputed = pd.DataFrame(df_transformed,columns=new_name_columns)
df_cat_imputed.head()
Family High
Exercise Low HDL High LDL Stress Sugar Blood Cholesterol
Gender Smoking Heart Diabetes Blood Age
Habits Cholesterol Cholesterol Level Consumption Pressure Level
Disease Pressure
0 Male High Yes Yes No Yes Yes No Medium Medium 56.0 153.0 155.0
1 Female High No Yes Yes No Yes No High Medium 69.0 146.0 286.0
3 Female High Yes Yes No Yes No Yes High High 32.0 122.0 293.0
4 Male Low Yes Yes Yes Yes No No High High 60.0 166.0 242.0
df_imputed_cleaned = df_cat_imputed
Exercise Family Heart High Blood Low HDL High LDL Stress Sugar
Gender Smoking Diabetes
Habits Disease Pressure Cholesterol Cholesterol Level Consumption
df_imputed_cleaned[columns_categorical].columns
# Encode Diabetes as 1 for 'Yes' and 0 for any other value.
df_imputed_cleaned['Diabetes'] = (df_imputed_cleaned['Diabetes'] == 'Yes').astype(int)
df_imputed_cleaned['Exercise Habits'].unique()
df_imputed_cleaned['Stress Level'].unique()
df_imputed_cleaned['Sugar Consumption'].unique()
# Map the ordered levels to 0.0 / 1.0 / 2.0 (Low < Medium < High).
ordinal_encoder = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])
for ordinal_column in ('Exercise Habits', 'Stress Level', 'Sugar Consumption'):
    # fit_transform returns an (n, 1) array; flatten it before assigning it to
    # a single column.
    df_imputed_cleaned[ordinal_column] = ordinal_encoder.fit_transform(
        df_imputed_cleaned[[ordinal_column]]
    ).ravel()
df_imputed_cleaned.head()
Family High
Exercise Low HDL High LDL Stress Sugar Blood Cholesterol
Gender Smoking Heart Diabetes Blood Age
Habits Cholesterol Cholesterol Level Consumption Pressure Level
Disease Pressure