Jupyter Notebook On Obesity Prediction
Jupyter Notebook On Obesity Prediction
March 2, 2024
1 Introduction
This notebook performs exploratory data analysis (EDA) and builds a machine learning model
to predict multi-class obesity risk.
[1]: import os
new_directory = 'C:/Users/sm94c/Documents/Data Science Projects/Multi Class␣
↪Obesity Prediction'
os.chdir(new_directory)
df = pd.read_csv("train.csv")
[3]: df.head()
1
[4]: df.info()
#no missing values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 20758 non-null int64
1 Gender 20758 non-null object
2 Age 20758 non-null float64
3 Height 20758 non-null float64
4 Weight 20758 non-null float64
5 family_history_with_overweight 20758 non-null object
6 FAVC 20758 non-null object
7 FCVC 20758 non-null float64
8 NCP 20758 non-null float64
9 CAEC 20758 non-null object
10 SMOKE 20758 non-null object
11 CH2O 20758 non-null float64
12 SCC 20758 non-null object
13 FAF 20758 non-null float64
14 TUE 20758 non-null float64
15 CALC 20758 non-null object
16 MTRANS 20758 non-null object
17 NObeyesdad 20758 non-null object
dtypes: float64(8), int64(1), object(9)
memory usage: 2.9+ MB
summary(df)
2
Height float64 0 0.0 1833 1.45
Weight float64 0 0.0 1979 39.0
family_history_with_overweight object 0 0.0 2 NaN
FAVC object 0 0.0 2 NaN
FCVC float64 0 0.0 934 1.0
NCP float64 0 0.0 689 1.0
CAEC object 0 0.0 4 NaN
SMOKE object 0 0.0 2 NaN
CH2O float64 0 0.0 1506 1.0
SCC object 0 0.0 2 NaN
FAF float64 0 0.0 1360 0.0
TUE float64 0 0.0 1297 0.0
CALC object 0 0.0 3 NaN
MTRANS object 0 0.0 5 NaN
NObeyesdad object 0 0.0 7 NaN
max
id 20757.0
Gender NaN
Age 61.0
Height 1.975663
Weight 165.057269
family_history_with_overweight NaN
FAVC NaN
FCVC 3.0
NCP 4.0
CAEC NaN
SMOKE NaN
CH2O 3.0
SCC NaN
FAF 3.0
TUE 2.0
CALC NaN
MTRANS NaN
NObeyesdad NaN
[6]: df.size
[6]: 373644
[7]: df.shape
#18 columns, 20K entries.
3
2 EDA
Let’s explore the relationship between different variables within the dataset.
[9]: plt.style.use('ggplot')
4
[11]: (array([0, 1, 2, 3, 4, 5, 6]),
[Text(0, 0, 'Overweight_Level_II'),
Text(1, 0, 'Normal_Weight'),
Text(2, 0, 'Insufficient_Weight'),
Text(3, 0, 'Obesity_Type_III'),
Text(4, 0, 'Obesity_Type_II'),
Text(5, 0, 'Overweight_Level_I'),
Text(6, 0, 'Obesity_Type_I')])
[12]: sns.countplot(df,x='NObeyesdad',hue='MTRANS')
plt.xlabel("Weight Class")
plt.xticks(rotation=90)
5
[12]: (array([0, 1, 2, 3, 4, 5, 6]),
[Text(0, 0, 'Overweight_Level_II'),
Text(1, 0, 'Normal_Weight'),
Text(2, 0, 'Insufficient_Weight'),
Text(3, 0, 'Obesity_Type_III'),
Text(4, 0, 'Obesity_Type_II'),
Text(5, 0, 'Overweight_Level_I'),
Text(6, 0, 'Obesity_Type_I')])
• Entries in the Obesity_Type_II class report automobile use more than entries in the other classes.
• Normal_Weight entries report walking more than entries in the other classes.
[13]: sns.scatterplot(df,x='Height',y='Weight',alpha=0.05,␣
↪hue='family_history_with_overweight')
6
[13]: <Axes: xlabel='Height', ylabel='Weight'>
[15]: df_eng.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 19 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 20758 non-null int64
1 Gender 20758 non-null object
2 Age 20758 non-null float64
3 Height 20758 non-null float64
7
4 Weight 20758 non-null float64
5 family_history_with_overweight 20758 non-null object
6 FAVC 20758 non-null object
7 FCVC 20758 non-null float64
8 NCP 20758 non-null float64
9 CAEC 20758 non-null object
10 SMOKE 20758 non-null object
11 CH2O 20758 non-null float64
12 SCC 20758 non-null object
13 FAF 20758 non-null float64
14 TUE 20758 non-null float64
15 CALC 20758 non-null object
16 MTRANS 20758 non-null object
17 NObeyesdad 20758 non-null object
18 BMI 20758 non-null float64
dtypes: float64(9), int64(1), object(9)
memory usage: 3.0+ MB
# Features: every column of the engineered frame except the target.
# NOTE(review): `id` stays in X and ends up among the scaled numeric features
# (see the fitted pipeline repr). It looks like a row identifier - confirm it
# is meant to be a predictor before removing it here and from the test frame.
X = df_eng.drop(columns='NObeyesdad')
# Take the target from the same frame as the features, so X and y cannot
# drift apart if df_eng is ever filtered or re-indexed relative to df.
y = df_eng['NObeyesdad']
8
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
])
# Route each column group to its transformer; columns that belong to neither
# group are passed through unchanged rather than dropped.
column_steps = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')
def objective_xgb(trial):
params = {
"eval_metric": "mlogloss",
"objective": "multi:softmax",
"booster": trial.suggest_categorical("booster", ["gbtree"]),
"grow_policy": trial.suggest_categorical("grow_policy", ["depthwise",␣
↪"lossguide"]),
9
pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', model_xgb)
])
# Lower Optuna's log level *before* the study exists, so the study-creation
# message is silenced along with the per-trial messages.
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_xgboost = optuna.create_study(
    study_name="Study_XGB_Obesity",
    direction="maximize",
    # Seed the sampler so a fresh Restart & Run All proposes the same sequence
    # of trials. Full reproducibility also needs a deterministic objective -
    # TODO confirm the classifier inside objective_xgb is seeded.
    sampler=optuna.samplers.TPESampler(seed=42),
)
# 1000 trials is a long run; the progress bar is the only feedback left once
# INFO logging is off.
study_xgboost.optimize(objective_xgb, n_trials=1000, show_progress_bar=True)
])
# Same column routing as the preprocessor built for the tuning run: numeric and
# categorical groups go to their transformers, everything else passes through.
# NOTE(review): this repeats an earlier definition - consider reusing it.
column_steps = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')
# Fixed XGBoost hyperparameters for the final model.
# NOTE(review): presumably copied from the best Optuna trial - confirm.
new_params = dict(
    booster='gbtree',
    grow_policy='lossguide',
    n_estimators=575,
    learning_rate=0.03289752563023217,
    gamma=0.10216792503966325,
    subsample=0.9781875703877861,
    colsample_bytree=0.3000971735782186,
    max_depth=20,
    min_child_weight=1,
    reg_lambda=7.404755224243125e-08,
    reg_alpha=2.2740450488861006,
)
[39]: Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num', StandardScaler(),
Index(['id', 'Age', 'Height',
'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
'BMI'],
dtype='object')),
('cat',
Pipeline(steps=[('encoder',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=-1))]),
Index(['Gender',
'family_history_with_over…
grow_policy='lossguide', importance_type=None,
interaction_constraints=None,
learning_rate=0.03289752563023217, max_bin=None,
11
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=20,
max_leaves=None, min_child_weight=1, missing=nan,
monotone_constraints=None, multi_strategy=None,
n_estimators=575, n_jobs=None,
num_parallel_tree=None,
objective='multi:softprob', …))])
12
4 Time to Make Predictions
[42]: df_test = pd.read_csv("test.csv")
[44]: df_test_eng.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13840 entries, 0 to 13839
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 13840 non-null int64
1 Gender 13840 non-null object
2 Age 13840 non-null float64
3 Height 13840 non-null float64
4 Weight 13840 non-null float64
13
5 family_history_with_overweight 13840 non-null object
6 FAVC 13840 non-null object
7 FCVC 13840 non-null float64
8 NCP 13840 non-null float64
9 CAEC 13840 non-null object
10 SMOKE 13840 non-null object
11 CH2O 13840 non-null float64
12 SCC 13840 non-null object
13 FAF 13840 non-null float64
14 TUE 13840 non-null float64
15 CALC 13840 non-null object
16 MTRANS 13840 non-null object
17 BMI 13840 non-null float64
dtypes: float64(9), int64(1), object(8)
memory usage: 1.9+ MB
predicted_original_values = label_encoder.inverse_transform(predictions)
predicted_original_values
[48]: result_df
[48]: id Prediction
0 20758 Obesity_Type_II
1 20759 Overweight_Level_I
2 20760 Obesity_Type_III
3 20761 Obesity_Type_I
4 20762 Obesity_Type_III
… … …
13835 34593 Overweight_Level_II
13836 34594 Normal_Weight
13837 34595 Insufficient_Weight
13838 34596 Normal_Weight
13839 34597 Obesity_Type_II
14
[49]: result_df.to_csv("Submission_7_Optuna.csv",index=False)
[ ]:
15