Code

This document outlines a machine learning workflow using LightGBM for binary classification: installing the required libraries, loading an already-preprocessed dataset, scaling the features, and optimizing hyperparameters with Optuna. Each trial trains a model against an objective function scored by AUC on the held-out test set. The final model is retrained with the optimized parameters, and predictions use a lowered probability threshold to improve sensitivity.

# 1. Installation and Imports (Same as before)

!pip install pandas scikit-learn imbalanced-learn lightgbm optuna

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

#from sklearn.experimental import enable_iterative_imputer  # Commented out

#from sklearn.impute import IterativeImputer  # Commented out

#from imblearn.over_sampling import BorderlineSMOTE  # Commented out

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score  # Using AUC rather than accuracy alone

import lightgbm as lgb

import optuna

from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures  # Added scalers

from sklearn.linear_model import LogisticRegression  # Just one more model

from sklearn.ensemble import RandomForestClassifier

# 2. Load the Preprocessed Dataset

data = pd.read_csv("processed_df.csv")  # <------ REPLACE THIS WITH YOUR PREPROCESSED DATASET PATH! AND SPECIFY SEPARATOR
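# If your preprocessed CSV is not comma-separated (cardiovascular-disease
# datasets are often distributed semicolon-separated, for example), pass
# sep explicitly; this line is an illustration, not part of the script:
# data = pd.read_csv("processed_df.csv", sep=";")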

# 3. Define Target and Features

X = data.drop('cardio', axis=1)

y = data['cardio']
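# Oversampling is omitted below, so it is worth confirming the class
# balance before leaning on accuracy as a metric:
print(y.value_counts(normalize=True))  # fraction of each class in 'cardio'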

# 4. Train/Test Split (Same as before)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
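# Optional tweak, not in the original script: a stratified split keeps the
# class ratio identical in train and test, which matters if 'cardio' is
# imbalanced:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y)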
# 5. Scaling the data

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)
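# Note: StandardScaler returns plain numpy arrays, so column names are lost
# and LightGBM will warn about missing feature names. One optional way to
# keep them (an addition, not part of the original script):
# X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)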

# 6. Imputation (OMITTED) (No imputation in this version - be sure to handle missing values beforehand)

#mice_imputer = IterativeImputer(random_state=0)

#X_train_imputed = mice_imputer.fit_transform(X_train_scaled)  # Use the scaled version.

#X_test_imputed = mice_imputer.transform(X_test_scaled)  # Use the scaled version.

X_train_imputed = X_train_scaled  # Alias the scaled values so the later variable names still work

X_test_imputed = X_test_scaled  # Same aliasing for the test set
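# Because imputation is skipped, a quick sanity check that the preprocessed
# file really has no missing values can save a confusing failure later
# (a defensive addition, not in the original script):
assert not data.isnull().values.any(), "processed_df.csv still contains NaNs"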

# 7. Oversampling (OMITTED) (No BorderlineSMOTE)

#smote = BorderlineSMOTE(random_state=0)

#X_train_resampled, y_train_resampled = smote.fit_resample(X_train_imputed, y_train)  # SMOTE on SCALED and IMPUTED data

X_train_resampled = X_train_imputed  # Alias the imputed values so the later variable names still work

y_train_resampled = y_train  # Same aliasing for the labels
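# If you later want to restore oversampling, a minimal sketch based on the
# commented lines above (assuming imbalanced-learn is installed, as in the
# pip line at the top):
# from imblearn.over_sampling import BorderlineSMOTE
# smote = BorderlineSMOTE(random_state=0)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train_imputed, y_train)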

# 8. Define the Objective Function for Hyperparameter Optimization (Slight changes to metrics)

def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'auc',  # Or try 'binary_logloss', 'binary_error'
        'boosting_type': 'gbdt',  # Experiment with 'dart' and 'goss'
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.1, log=True),  # suggest_loguniform is deprecated
        'num_leaves': trial.suggest_int('num_leaves', 31, 200),
        'max_depth': trial.suggest_int('max_depth', -1, 25),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 200),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),  # lambda_l1 is now called reg_alpha
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),  # lambda_l2 is now called reg_lambda
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0),
        'max_bin': trial.suggest_int('max_bin', 100, 255),
    }

    dtrain = lgb.Dataset(X_train_resampled, label=y_train_resampled)
    # Early stopping needs a validation set other than the training data;
    # the test set is used here, matching how each trial is scored below.
    dvalid = lgb.Dataset(X_test_imputed, label=y_test, reference=dtrain)

    # Train the model with early stopping to prevent overfitting
    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dvalid],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],  # verbose=False silences the callback
    )

    y_pred = model.predict(X_test_imputed)  # Predict against the SCALED test data.
    return roc_auc_score(y_test, y_pred)  # Optimize based on AUC

# 9. Optimize Hyperparameters (Same as before)

study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=100, show_progress_bar=False)  # Progress bar disabled for readability
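# Optuna keeps the best score alongside the best parameters, which is handy
# as a quick check before retraining:
print("Best trial AUC:", study.best_value)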

# 10. Train the Final Model (Slight Changes)

best_params = study.best_params

final_model = lgb.LGBMClassifier(**best_params)

final_model.fit(X_train_resampled, y_train_resampled)

# 11. Predictions and Evaluation (Use AUC and a more granular probability threshold.)

y_pred_proba = final_model.predict_proba(X_test_imputed)[:, 1]  # Get probabilities for the positive class

y_pred = (y_pred_proba > 0.4).astype(int)  # Lower the threshold from 0.5 to 0.4 to try to get better sensitivity.

print("Best Parameters:", best_params)

print("Accuracy:", accuracy_score(y_test, y_pred))

print("AUC: ", roc_auc_score(y_test, y_pred_proba))


print(classification_report(y_test, y_pred))
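# The 0.4 cutoff above is a single guess; a small sweep makes the
# sensitivity/precision trade-off explicit. A minimal sketch, not part of
# the original script:
from sklearn.metrics import recall_score, precision_score

for t in [0.3, 0.4, 0.5, 0.6]:
    preds = (y_pred_proba > t).astype(int)
    print(f"threshold={t:.1f}  recall={recall_score(y_test, preds):.3f}  "
          f"precision={precision_score(y_test, preds):.3f}")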
