# Multi Classification - Py (for per-class TP, TN, FP, FN)

import boto3

import pandas as pd
import numpy as np
import random
import re
from fastapi.responses import JSONResponse
from fastapi import HTTPException
#added
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from scipy.special import softmax
from sklearn.preprocessing import label_binarize

import os
import io
import logging
from logging.handlers import RotatingFileHandler
from fastapi import HTTPException, Response
from fastapi.responses import JSONResponse
from botocore.exceptions import ClientError

from sklearn.model_selection import train_test_split


import optuna
from sklearn.impute import SimpleImputer
from dotenv import load_dotenv
import json
from sklearn.metrics import (confusion_matrix, accuracy_score, f1_score,
                             precision_score, recall_score, roc_auc_score, roc_curve, auc)
from sklearn.model_selection import cross_val_predict  # To get probabilities for SVM

from sklearn.exceptions import DataConversionWarning


import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")

# Suppress specific warning categories (e.g., DeprecationWarning)


warnings.filterwarnings("ignore", category=DeprecationWarning)

# Suppress warnings related to data conversions in scikit-learn


warnings.filterwarnings(action='ignore', category=DataConversionWarning)

logging.basicConfig(level=logging.INFO)

# Load environment variables


load_dotenv()

# S3 Config
AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY')
AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY')
AWS_REGION = os.getenv('AWS_REGION')
S3_BUCKET_NAME = os.getenv('S3_BUCKET_NAME')

# S3 Client setup
s3 = boto3.client(
's3',
aws_access_key_id=AWS_ACCESS_KEY,
aws_secret_access_key=AWS_SECRET_KEY,
region_name=AWS_REGION
)

# Getting the S3 file URL


def get_public_url(file_key):
file_url=f"s3://{S3_BUCKET_NAME}/{file_key}"
return file_url

# # Fetch dataset from S3

def fetch_data_from_s3(file_key, featuresList=None,target_column=None):


try:
s3_object = s3.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)

# Determine the file extension


file_extension = os.path.splitext(file_key)[1].lower()
# print(file_extension)
# Load the dataset based on file type
# if file_extension == '.csv':
# dataset = pd.read_csv(s3_object['Body'])
# elif file_extension in ['.xls', '.xlsx']:
# dataset = pd.read_excel(s3_object['Body'])

if file_extension == '.csv':
dataset = pd.read_csv(io.StringIO(s3_object['Body'].read().decode('utf-8')))
elif file_extension in ['.xls', '.xlsx']:
dataset = pd.read_excel(io.BytesIO(s3_object['Body'].read()))
else:
raise ValueError(f"Unsupported file type: {file_extension}")

# Check if the dataset is empty


if dataset.empty:
raise ValueError(f"Dataset is empty. Please check the file.")

# dataset = pd.read_csv(s3_object['Body'])

# Replace special chars with an empty string


dataset = dataset.replace(to_replace=r'[^a-zA-Z0-9 ]', value='',
regex=True)

# If a column is of dtype 'object', then it's a string & it is stripped of


its whitespaces
dataset = dataset.apply(lambda x: x.str.strip()
if x.dtype == "object" else x)

# Select only the columns provided in the featuresList


if featuresList:
dataset = dataset[featuresList]

dataset = label_encode_data(dataset,target_column)

# Exclude non-numeric columns


dataset = dataset.select_dtypes(include=[float, int])

# Impute missing values with the mean of their column


imputer = SimpleImputer(strategy='mean')
dataset_imputed = pd.DataFrame(imputer.fit_transform(dataset),
columns=dataset.columns)

return dataset_imputed
except ClientError as e:
error_code = e.response['Error']['Code']
if error_code == 'NoSuchKey':
raise ValueError("The specified file does not exist in the S3 bucket.")
elif error_code == 'NoSuchBucket':
raise ValueError("The specified S3 bucket does not exist.")
elif error_code == 'AccessDenied':
raise ValueError("Access denied to the S3 bucket or object.")
else:
# Log full details for unexpected ClientError scenarios
raise ValueError(f"An unexpected error occurred with S3:
{e.response['Error']['Message']}")

except ValueError as e:
# Re-raise ValueError with the same message
raise ValueError(str(e))

except Exception as e:
# Handle other unexpected errors
raise ValueError(f"An unexpected error occurred: {str(e)}")

#adding
# Label encode target column and any non-numeric features
def label_encode_data(df, targetColumn):
# label_encoders = {}
# Convert column names to lowercase for consistency
# df.columns = df.columns.str.strip().str.lower()
targetColumn = targetColumn.strip()

# for col in df.columns:


# if df[col].dtype == 'object' or col == targetColumn:
label_encoder = LabelEncoder()
df[targetColumn] = label_encoder.fit_transform(df[targetColumn])
# label_encoders[targetColumn] = label_encoder

return df #, label_encoders
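# Minimal example (sketch, with assumed toy values): LabelEncoder maps the target's
# categories to integers, so a target column ["cat", "dog", "cat"] becomes [0, 1, 0]:
#
#   toy = pd.DataFrame({"target": ["cat", "dog", "cat"]})
#   label_encode_data(toy, "target")["target"].tolist()   # -> [0, 1, 0]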

def preprocess_data(X_train, X_test, y_train, y_test):


# Identify numeric and categorical features
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Define preprocessing steps for numeric and categorical features


numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='mean')),
('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors into a column transformer


preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
]
)

# Fit on training data and transform both train and test data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Encode labels for training and testing


# label_encoder = LabelEncoder()
# y_train_encoded = label_encoder.fit_transform(y_train)
# y_test_encoded = label_encoder.transform(y_test)

return X_train_processed, X_test_processed, y_train, y_test
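# Usage sketch (assumed call site; this helper does not appear to be invoked
# elsewhere in this listing, which relies on fetch_data_from_s3's numeric-only output):
#
#   X_tr, X_te, y_tr, y_te = preprocess_data(X_train, X_test, y_train, y_test)
#
# The ColumnTransformer is fit on the training split only, so the test split is
# imputed/scaled/encoded with statistics learned from the training data.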

# # Train-Test Split with default or custom options


def split_data(df, type,targetColumn, trainSize, randomSeed, shuffleData):

if type=="ratio":

# Check if target variable is numeric


if not pd.api.types.is_numeric_dtype(df[targetColumn]):
raise ValueError(f"Target variable '{targetColumn}' must be numeric")
#commenting for multi
# # Check if target variable has exactly two unique values (binary classification)
# unique_values = df[targetColumn].nunique()
# if unique_values != 2:
#     raise ValueError(f"Target variable '{targetColumn}' must contain exactly two unique values, but found {unique_values}")

#adding
# Determine if the target variable is binary or multi-class
unique_values = df[targetColumn].nunique()
target_type = 'binary' if unique_values == 2 else 'multi'
logging.info(f"Target variable '{targetColumn}' identified as
'{target_type}' classification "
f"with {unique_values} unique values.")

# Split data into features (X) and target (y)


X = df.drop(columns=[targetColumn])
y = df[targetColumn]

# Specifying train sizes


if trainSize == 'default_70':
trainSize = 70 # Default to 70% training data
elif trainSize == 'default_80':
trainSize = 80 # Default to 80% training data
else:
trainSize = int(trainSize) # Convert the input to an integer

# Ensure trainSize is between 1 and 95


if not (1 <= trainSize <= 95):
raise ValueError("trainSize must be between 1 and 95")

# Calculate test_size based on trainSize


test_size = 100 - trainSize # In percentage

# # Convert shuffle to a boolean


# shuffleData = True if shuffleData == 'True' else False

# Handle random_seed
# if randomSeed == 'True':
if randomSeed == 'true':
randomSeed = 42 # Set to 42 if True
else:
randomSeed = None # No fixed seed, so the split is not reproducible

# Perform the train-test split


X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=test_size / 100,

random_state=randomSeed,
shuffle=shuffleData)
print("train points are",y_train.head(10))
return X_train, X_test, y_train, y_test
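# Example call (sketch; the target column name is hypothetical):
#
#   X_train, X_test, y_train, y_test = split_data(df, "ratio", "species",
#                                                 trainSize="default_70",
#                                                 randomSeed="true",
#                                                 shuffleData=True)
#
# "default_70" keeps 70% of the rows for training, and randomSeed="true" pins the
# split to random_state=42 for reproducibility.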

# # Hyperparameter handling for default and custom values


def get_base_model_name(modelName):
# Define the base model names and create a regex pattern to capture them
model_patterns = {
"Logistic Regression": r"(?i)\bLogistic\sRegression\b",
"Random Forest": r"(?i)\bRandom\sForest\b",
"Decision Tree": r"(?i)\bDecision\sTree\b",
# "Svm": r"(?i)\bSVM\b|Support\sVector\sMachine\b"
"Svm": r"(?i)\bSvm\b"
}

for model, pattern in model_patterns.items():


if re.search(pattern, modelName): # Match the base model name
return model
return None # Return None if no match found
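# Examples (sketch) of how the regex patterns above resolve model names:
#   get_base_model_name("Random Forest v2")      # -> "Random Forest"
#   get_base_model_name("logistic regression")   # -> "Logistic Regression" (case-insensitive)
#   get_base_model_name("XGBoost")               # -> None (no pattern matches)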

def get_hyperparams(modelCategory, modelName, trial=None, parameters=None, target_type='binary'):
hyperparams = {}
param_dict = {}

if parameters:
for key, value in parameters.items():
# Handle different parameter types
if isinstance(value, list):
param_dict[key] = {'list': value}
elif isinstance(value, (int, float)):
if key == 'C':
param_dict[key] = {'range': (value * 0.1, value * 10)}
else:
param_dict[key] = {'value': value}
else:
param_dict[key] = {'value': value}
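# Shape of `parameters` assumed by the loop above (a sketch, not a documented API):
# a numeric "C" becomes a search range, lists stay as candidate lists, and other
# scalars are treated as fixed values, e.g.
#
#   parameters = {"C": 1.0, "kernel": ["linear", "rbf"], "max_iter": 500}
#   # -> param_dict == {"C": {"range": (0.1, 10.0)},
#   #                   "kernel": {"list": ["linear", "rbf"]},
#   #                   "max_iter": {"value": 500}}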

# Use regular expression to match the core algorithm name


model_match = re.match(r"Logistic\s*Regression|Random\s*Forest|Decision\s*Tree|
Svm", modelName)

if not model_match:
raise ValueError(f"Model name '{modelName}' not recognized.")

# Extract the core model name from the match (in case of a version or extra words)
modelName_core = model_match.group(0).strip()

if modelCategory == "Classification" and modelName_core == 'Logistic


Regression':
solver_values = ['lbfgs', 'saga', 'liblinear']
selected_solver = trial.suggest_categorical('solver', solver_values)

# Suggest from a complete set of penalties, then validate based on solver


all_penalty_values = ['l1', 'l2', 'elasticnet']
penalty = trial.suggest_categorical('penalty', all_penalty_values)

# Ensure penalty is valid for the selected solver


penalty_values_map = {
'lbfgs': ['l2'],
'saga': ['l1', 'l2', 'elasticnet'],
'liblinear': ['l1', 'l2']
}
if penalty not in penalty_values_map[selected_solver]:
penalty = penalty_values_map[selected_solver][0] # Default to the first valid penalty if incompatible

# Handle dual parameter explicitly for 'liblinear' + 'l2' penalty


dual = False
if selected_solver == 'liblinear' and penalty == 'l2':
dual = trial.suggest_categorical('dual', [False])

# Determine `multi_class` based on the target type


multi_class = 'ovr' if target_type == 'binary' else 'multinomial'

# Handle other hyperparameters with fallbacks


tol = float(param_dict.get('tol', {}).get('value', 1e-4))
# multi_class = param_dict.get('multi_class', {}).get('value', 'auto')
fit_intercept = bool(param_dict.get('fit_intercept', {}).get('value',
True))
intercept_scaling = float(param_dict.get('intercept_scaling',
{}).get('value', 1.0))
warm_start = bool(param_dict.get('warm_start', {}).get('value', False))

# Define C range with defaults and suggest C value


C_values = param_dict.get('C', {}).get('range', (1e-4, 1e4))
C = trial.suggest_float('C', *C_values)

# class_weight = param_dict.get('class_weight', {}).get('value', None)


class_weight_value = param_dict.get('class_weight',
{}).get('selectedValue', None)
if class_weight_value == "null":
class_weight = None
else:
class_weight = class_weight_value

random_state = param_dict.get("random_state")
if isinstance(random_state, dict) and random_state.get('value') == 'null':
random_state = None
elif random_state is None:
random_state = np.random.RandomState(seed=42)

max_iter = int(param_dict.get('max_iter', {}).get('value', 1000))


verbose = int(param_dict.get('verbose', {}).get('value', 0))

# Define hyperparameter dictionary


hyperparams = {
'penalty': penalty,
'dual': dual,
'tol': tol,
'C': C,
'fit_intercept': fit_intercept,
'intercept_scaling': intercept_scaling,
'class_weight': class_weight,
'random_state': random_state,
'solver': selected_solver,
'max_iter': max_iter,
'multi_class': multi_class,
'verbose': verbose,
'warm_start': warm_start,
}

# Include `l1_ratio` if 'saga' with 'elasticnet', with a default


if selected_solver == 'saga' and penalty == 'elasticnet':
l1_ratio_default = param_dict.get('l1_ratio', {}).get('value', 0.5)  # Default to 0.5
# Suggest l1_ratio value using the default if necessary
l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)
# If the suggested value is None (which shouldn't be the case), fall back to the default
if l1_ratio is None:
l1_ratio = l1_ratio_default
hyperparams['l1_ratio'] = l1_ratio
#added
else:
l1_ratio = None # Not applicable otherwise

# Add `n_jobs` if valid


n_jobs = param_dict.get('n_jobs', {}).get('value', 2)
if n_jobs is not None:
try:
n_jobs = int(n_jobs)
hyperparams['n_jobs'] = n_jobs
except ValueError:
pass # Skip adding n_jobs if invalid

# SVM
if modelCategory == "Classification" and modelName_core == 'Svm':
# Define C_values to include a range as a tuple
C_values = param_dict.get('C', {}).get('range', (0.1, 10.0))
kernel_values = param_dict.get('kernel', {}).get('list', ['linear', 'rbf'])

degree = param_dict.get('degree', {}).get('value', 3)


# Convert to integer (if necessary) and ensure it's non-negative
degree = int(degree)
# Check if degree is a valid non-negative integer
if degree < 0:
raise ValueError(f"The 'degree' parameter must be a non-negative
integer. Got '{degree}'.")
# Update the hyperparameters dictionary
hyperparams['degree'] = degree

gamma = param_dict.get('gamma', {}).get('value', 'scale')


coef0 = param_dict.get('coef0', {}).get('value', 0.0)
coef0 = float(coef0)
hyperparams['coef0'] = coef0

shrinking = param_dict.get('shrinking', {}).get('value', True)


shrinking = bool(shrinking)
hyperparams['shrinking'] = shrinking

probability = param_dict.get('probability', {}).get('value', True)


# Convert to boolean if necessary
probability = bool(probability)
# Update the hyperparameters dictionary
hyperparams['probability'] = probability

tol = param_dict.get('tol', {}).get('value', 1e-3)


tol = float(tol)
# Ensure that tol is greater than 0.0
if tol <= 0.0:
raise ValueError(f"The 'tol' parameter must be a float in the range
(0.0, inf). Got '{tol}' instead.")
# Update the hyperparameters dictionary
hyperparams['tol'] = tol

cache_size = param_dict.get('cache_size', {}).get('value', 200)


cache_size = float(cache_size)
# Check if cache_size is valid (should be > 0)
if cache_size <= 0:
raise ValueError(f"The 'cache_size' parameter must be a positive float.
Got '{cache_size}'.")
# Now you can use cache_size in the SVC model
hyperparams['cache_size'] = cache_size

class_weight = param_dict.get('class_weight', {}).get('value', 'balanced')


verbose = param_dict.get('verbose', {}).get('value', 0)

max_iter = param_dict.get('max_iter', {}).get('value', -1)

# Convert to integer (though it should already be an integer)


max_iter = int(max_iter)
# Check if max_iter is valid (-1 or a positive integer)
if max_iter < -1:
raise ValueError(f"The 'max_iter' parameter must be an integer in the
range [-1, inf). Got '{max_iter}'.")
# Update the hyperparameters dictionary
hyperparams['max_iter'] = max_iter
decision_function_shape = param_dict.get('decision_function_shape',
{}).get('value', 'ovr')
break_ties = param_dict.get('break_ties', {}).get('value', False)

# Ensure random_state is either None, an integer, or a numpy.random.RandomState object
random_state = param_dict.get('random_state', {}).get('value', 42)

# If random_state is a string or float, try to convert it to an integer


if isinstance(random_state, (str, float)):
random_state = int(random_state)

# If random_state is 'null' (string), set it to None


if random_state == "null":
random_state = None

# Ensure random_state is either None or an integer


if random_state is not None and not isinstance(random_state, int):
raise ValueError(f"The 'random_state' parameter must be an integer or
None. Got '{random_state}' instead.")

# Update the hyperparameters dictionary


hyperparams['random_state'] = random_state

# Suggest C value using the defined range


C = trial.suggest_float('C', *C_values)
kernel = trial.suggest_categorical('kernel', kernel_values)

hyperparams = {
'C': C,
'kernel': kernel,
'degree': degree,
'gamma': gamma,
'coef0': coef0,
'shrinking': shrinking,
'probability': probability,
'tol': tol,
'cache_size': cache_size,
'class_weight': class_weight,
'verbose': verbose,
'max_iter': max_iter,
'decision_function_shape': decision_function_shape,
'break_ties': break_ties,
'random_state': random_state
}

# Decision Tree
if modelCategory == "Classification" and modelName_core == 'Decision Tree':
# Criterion values with a default value
criterion_values = param_dict.get('criterion', {}).get('list') or ['gini',
'entropy']
# Splitter values with a default value #added
splitter_values = param_dict.get('splitter', {}).get('list') or ['best',
'random']

# Max depth: accepts a single value, a range, or defaults to (1, 20)


max_depth_param = param_dict.get('max_depth', {})
if 'value' in max_depth_param:
max_depth_value = max_depth_param['value']
max_depth = int(max_depth_value) if isinstance(max_depth_value, (int,
str)) else max_depth_value
else:
max_depth_values = max_depth_param.get('range', (1, 20))
max_depth = trial.suggest_int('max_depth', *max_depth_values)

# Min samples split: single custom value or default to 5


min_samples_split_param = param_dict.get('min_samples_split', {})
#added
if isinstance(min_samples_split_param, dict):
min_samples_split = int(
min_samples_split_param.get('value', 5 if target_type == 'binary'
else 2)
)
else:
min_samples_split = 5 if target_type == 'binary' else 2

# Min samples leaf: single custom value or default to 5


min_samples_leaf_param = param_dict.get('min_samples_leaf', {})
# added
if isinstance(min_samples_leaf_param, dict):
min_samples_leaf = int(
min_samples_leaf_param.get('value', 5 if target_type == 'binary'
else 1)
)
else:
min_samples_leaf = 5 if target_type == 'binary' else 1

# Min weight fraction leaf: single custom value or default to 0.0


min_weight_fraction_leaf_param = param_dict.get('min_weight_fraction_leaf',
{})
min_weight_fraction_leaf = float(min_weight_fraction_leaf_param.get('value', 0.0))

# Max features: custom value can be a single value or a list, default to 'sqrt'
max_features_param = param_dict.get('max_features', {})
max_features_value = max_features_param.get('value', 'sqrt')
max_features = max_features_value if isinstance(max_features_value, str) else max_features_value[0]

# Random state: single custom value or default to 42


random_state_param = param_dict.get('random_state', {})
random_state = int(random_state_param.get('value', 42))

# Max leaf nodes: single custom value or default to 100


max_leaf_nodes_param = param_dict.get('max_leaf_nodes', {})
# added
if isinstance(max_leaf_nodes_param, dict):
max_leaf_nodes = max_leaf_nodes_param.get('value', 100 if target_type
== 'binary' else None)
if max_leaf_nodes is not None:
max_leaf_nodes = int(max_leaf_nodes)
else:
max_leaf_nodes = 100 if target_type == 'binary' else None
# Min impurity decrease: single custom value or default to 0.0
min_impurity_decrease_param = param_dict.get('min_impurity_decrease', {})
min_impurity_decrease = min_impurity_decrease_param.get('value', 0.0)

# Class weight: single custom value or default to 'balanced'


class_weight_param = param_dict.get('class_weight', {})
class_weight = class_weight_param.get('value', 'balanced')

# # CCP alpha: single custom value or default to 0.01


# ccp_alpha_param = param_dict.get('ccp_alpha', {})
# ccp_alpha = ccp_alpha_param.get('value', 0.01)

# CCP alpha: accepts a list of custom values or defaults to [0.01]


ccp_alpha_param = param_dict.get('ccp_alpha', {})
# added
ccp_alpha_values = ccp_alpha_param.get('value', [0.01] if target_type ==
'binary' else [0.0]) # Default to [0.01]

# Ensure ccp_alpha_values is a list


if not isinstance(ccp_alpha_values, list):
ccp_alpha_values = [ccp_alpha_values] # Convert to list if it's a single value

# Suggest a value for ccp_alpha from the list of values


ccp_alpha = float(trial.suggest_categorical('ccp_alpha', ccp_alpha_values))

# Build the hyperparameter dictionary


hyperparams = {
'criterion': trial.suggest_categorical('criterion', criterion_values),
'splitter': trial.suggest_categorical('splitter', splitter_values),
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'min_weight_fraction_leaf': min_weight_fraction_leaf,
'max_features': max_features,
'random_state': random_state,
'max_leaf_nodes': max_leaf_nodes,
'min_impurity_decrease': min_impurity_decrease,
'class_weight': class_weight,
'ccp_alpha': ccp_alpha
}

# elif algorithmCategory == "Classification" and modelName == 'Random Forest':


if modelCategory == "Classification" and modelName_core == 'Random Forest':
# Initialize hyperparams dictionary
hyperparams = {}

# N estimators: default to 100


n_estimators_param = param_dict.get('n_estimators', {})
if isinstance(n_estimators_param, dict):
if 'value' in n_estimators_param:
n_estimators_value = n_estimators_param['value']
hyperparams['n_estimators'] = int(n_estimators_value) if isinstance(n_estimators_value, (int, str)) else n_estimators_value
else:
hyperparams['n_estimators'] = 100 # Default value
else:
hyperparams['n_estimators'] = trial.suggest_int('n_estimators',
100, 500)

# Criterion values with a default value


criterion_param = param_dict.get('criterion', {})
if isinstance(criterion_param, dict):
criterion_values = criterion_param.get('list', ['gini',
'entropy'])
else:
criterion_values = ['gini', 'entropy']
hyperparams['criterion'] = trial.suggest_categorical('criterion',
criterion_values)

# Max depth: accepts a single value, a range, or defaults to (1, 64)
max_depth_param = param_dict.get('max_depth', {})
if isinstance(max_depth_param, dict):
if 'value' in max_depth_param:
max_depth_value = max_depth_param['value']
hyperparams['max_depth'] = int(max_depth_value) if isinstance(max_depth_value, (int, str)) else max_depth_value
else:
max_depth_values = max_depth_param.get('range', (1, 64))
hyperparams['max_depth'] = trial.suggest_int('max_depth',
*max_depth_values)
else:
hyperparams['max_depth'] = None # Default value if param_dict is invalid

# Min samples split: accepts list of values or default to [2]


min_samples_split_param = param_dict.get('min_samples_split', {})
if isinstance(min_samples_split_param, dict):
min_samples_split_values = min_samples_split_param.get('value',
[2])
if not isinstance(min_samples_split_values, list):
min_samples_split_values = [min_samples_split_values]
hyperparams['min_samples_split'] = int(trial.suggest_categorical('min_samples_split', min_samples_split_values))
else:
hyperparams['min_samples_split'] = 2

# Min samples leaf: accepts list of values or default to [2]


# added
min_samples_leaf_param = param_dict.get('min_samples_leaf', {})
if isinstance(min_samples_leaf_param, dict):
min_samples_leaf_values = min_samples_leaf_param.get(
'value', [2] if target_type == 'binary' else [1]
)
if not isinstance(min_samples_leaf_values, list):
min_samples_leaf_values = [min_samples_leaf_values]
hyperparams['min_samples_leaf'] = int(
trial.suggest_categorical('min_samples_leaf',
min_samples_leaf_values)
)
else:
hyperparams['min_samples_leaf'] = 2 if target_type == 'binary' else 1
# Min weight fraction leaf: accepts list of values or default to [0.0]
min_weight_param = param_dict.get('min_weight_fraction_leaf', {})
if isinstance(min_weight_param, dict):
min_weight_values = min_weight_param.get('value', [0.0])
if not isinstance(min_weight_values, list):
min_weight_values = [min_weight_values]
hyperparams['min_weight_fraction_leaf'] = float(trial.suggest_categorical('min_weight_fraction_leaf', min_weight_values))
else:
hyperparams['min_weight_fraction_leaf'] = 0.0

# Max features: accepts list of values or default to ['sqrt']


max_features_param = param_dict.get('max_features', {})
if isinstance(max_features_param, dict):
max_features_values = max_features_param.get('value', ['sqrt'])
if not isinstance(max_features_values, list):
max_features_values = [max_features_values]
hyperparams['max_features'] = trial.suggest_categorical('max_features', max_features_values)
else:
hyperparams['max_features'] = 'sqrt'

# Max leaf nodes: accepts list of values or default to None


max_leaf_nodes_param = param_dict.get('max_leaf_nodes', {})
if isinstance(max_leaf_nodes_param, dict):
max_leaf_nodes_values = max_leaf_nodes_param.get('value',
[None])
if not isinstance(max_leaf_nodes_values, list):
max_leaf_nodes_values = [max_leaf_nodes_values]
max_leaf_nodes_value = trial.suggest_categorical('max_leaf_nodes', max_leaf_nodes_values)
hyperparams['max_leaf_nodes'] = int(max_leaf_nodes_value) if max_leaf_nodes_value is not None else None
else:
# added
hyperparams['max_leaf_nodes'] = 100 if target_type == 'binary' else None

# Min impurity decrease: accepts list of values or default to [0.0]


min_impurity_param = param_dict.get('min_impurity_decrease', {})
if isinstance(min_impurity_param, dict):
min_impurity_values = min_impurity_param.get('value', [0.0])
if not isinstance(min_impurity_values, list):
min_impurity_values = [min_impurity_values]
hyperparams['min_impurity_decrease'] = float(trial.suggest_categorical('min_impurity_decrease', min_impurity_values))
else:
hyperparams['min_impurity_decrease'] = 0.0

# Bootstrap: accepts list of values or default to [True]


bootstrap_param = param_dict.get('bootstrap', {})
if isinstance(bootstrap_param, dict):
bootstrap_values = bootstrap_param.get('value', [True])
if not isinstance(bootstrap_values, list):
bootstrap_values = [bootstrap_values]
hyperparams['bootstrap'] = trial.suggest_categorical('bootstrap', bootstrap_values)
else:
hyperparams['bootstrap'] = True

# OOB score: accepts list of values or default to [True]


oob_score_param = param_dict.get('oob_score', {})
if isinstance(oob_score_param, dict):
oob_score_values = oob_score_param.get('value', [True])
if not isinstance(oob_score_values, list):
oob_score_values = [oob_score_values]
hyperparams['oob_score'] = trial.suggest_categorical('oob_score', oob_score_values)
else:
hyperparams['oob_score'] = True

# Random state: accepts list of values or default to [42]


random_state_param = param_dict.get('random_state', {})
if isinstance(random_state_param, dict):
random_state_values = random_state_param.get('value', [42])
if not isinstance(random_state_values, list):
random_state_values = [random_state_values]
hyperparams['random_state'] = int(trial.suggest_categorical('random_state', random_state_values))
else:
hyperparams['random_state'] = 42

# Class weight: accepts list of values or default to ['balanced']


class_weight_param = param_dict.get('class_weight', {})
if isinstance(class_weight_param, dict):
class_weight_values = class_weight_param.get('value',
['balanced'])
if not isinstance(class_weight_values, list):
class_weight_values = [class_weight_values]
hyperparams['class_weight'] = trial.suggest_categorical('class_weight', class_weight_values)
else:
hyperparams['class_weight'] = 'balanced'

hyperparams['n_jobs'] = param_dict.get('n_jobs', {}).get('value', -1)
hyperparams['verbose'] = param_dict.get('verbose', {}).get('value', 0)
hyperparams['warm_start'] = param_dict.get('warm_start', {}).get('value', False)

return hyperparams

# Objective function with custom values support


def objective(trial, modelCategory, modelName, X_train, y_train, X_test, y_test,
parameters=None, objectiveMetric='accuracy'):
hyperparams = get_hyperparams(modelCategory, modelName, trial, parameters)
# print(f"Trial {trial.number} suggested hyperparameters: {hyperparams}")

# Check algorithm category and apply logic accordingly


if modelCategory == 'Classification':
model = build_model(modelName, hyperparams)

# Train the model


model.fit(X_train, y_train)
# added
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train) # Fit and transform on training labels
y_test = label_encoder.transform(y_test)

# Predictions
y_pred = model.predict(X_test)

# Select objective metric

if objectiveMetric == 'accuracy':
score = round(accuracy_score(y_test, y_pred), 2)
elif objectiveMetric == 'f1':
score = round(f1_score(y_test, y_pred, average='weighted'), 2)
elif objectiveMetric == 'precision':
score = round(precision_score(y_test, y_pred, average='weighted'), 2)
elif objectiveMetric == 'recall':
score = round(recall_score(y_test, y_pred, average='weighted'), 2)

else:
raise ValueError(f"Unsupported objective metric: {objectiveMetric}")

return score

else:
raise ValueError(f"Unsupported algorithm category: {modelCategory}")

# Function to build the model based on selected algorithm and hyperparameters


def build_model(modelName, hyperparams):
modelName = get_base_model_name(modelName)
if modelName == 'Logistic Regression':
from sklearn.linear_model import LogisticRegression
# return LogisticRegression(**hyperparams)
return LogisticRegression(**(hyperparams or {}))

elif modelName == 'Random Forest':


from sklearn.ensemble import RandomForestClassifier
# return RandomForestClassifier(**hyperparams)
return RandomForestClassifier(**(hyperparams or {}))
elif modelName == 'Svm':
from sklearn.svm import SVC
#model = SVC(probability=True, kernel='rbf', C=1.0, random_state=42)
hyperparams = hyperparams or {}
hyperparams["probability"] = True
return SVC(**hyperparams)

# return SVC(**hyperparams)
#return SVC(**(hyperparams or {})),model

elif modelName == 'Decision Tree':


from sklearn.tree import DecisionTreeClassifier
# return DecisionTreeClassifier(**hyperparams)
return DecisionTreeClassifier(**(hyperparams or {}))
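# Example (sketch): build_model tolerates suffixed names via get_base_model_name,
# so both calls below produce an untrained RandomForestClassifier (the hyperparameter
# values are made up):
#
#   clf = build_model("Random Forest", {"n_estimators": 200, "random_state": 42})
#   clf = build_model("Random Forest v2", None)   # falls back to default hyperparameters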
# # Function to run Optuna Hyperparameter tuning & return the best set of
hyperparameters
def run_model(X_train, y_train, X_test, y_test, modelCategory, modelName,
model_params, objective_metric, nTrials, hyperparameterTuning):

# If hyperparameterTuning is False, return None to indicate no tuning


if not hyperparameterTuning:
return None

def objective(trial):
# Get hyperparameters using the updated function
params = get_hyperparams(modelCategory, modelName, trial, model_params)

# Use the existing build_model function


model = build_model(modelName, params)
model.fit(X_train, y_train)

# Use your existing metric calculation logic


y_pred = model.predict(X_test)

if objective_metric == 'accuracy':
score = round(accuracy_score(y_test, y_pred), 2)
elif objective_metric == 'f1':
score = round(f1_score(y_test, y_pred, average='weighted'), 2)
elif objective_metric == 'precision':
score = round(precision_score(y_test, y_pred, average='weighted'), 2)
elif objective_metric == 'recall':
score = round(recall_score(y_test, y_pred, average='weighted'), 2)
else:
score = round(accuracy_score(y_test, y_pred), 2) # default to accuracy

return score

# Create and run the study


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=nTrials)

return study.best_params
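# Illustrative call (sketch; the argument values are made up):
#
#   best_params = run_model(X_train, y_train, X_test, y_test,
#                           modelCategory="Classification", modelName="Random Forest",
#                           model_params={}, objective_metric="f1",
#                           nTrials=20, hyperparameterTuning=True)
#   if best_params is not None:
#       model = build_model("Random Forest", best_params)
#
# run_model returns None when hyperparameterTuning is False, so callers need to
# handle that case before building the final model.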

def calculate_tp_tn_fp_fn(cm, num_classes):


metrics = {i: {"TP": 0, "TN": 0, "FP": 0, "FN": 0} for i in range(num_classes)}

for i in range(num_classes):
tp = cm[i, i] # True Positives: Diagonal element for class i
fp = cm[:, i].sum() - tp # False Positives: Sum of column i - TP
fn = cm[i, :].sum() - tp # False Negatives: Sum of row i - TP
tn = cm.sum() - (tp + fp + fn) # True Negatives: Total sum - (TP + FP + FN)

metrics[i]["TP"] = tp
metrics[i]["FP"] = fp
metrics[i]["FN"] = fn
metrics[i]["TN"] = tn

# Add to the dictionary with class key


# metrics[f"Class {i}"] = {
# "TP": tp,
# "TN": tn,
# "FP": fp,
# "FN": fn
# }

return metrics
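# Worked example (sketch) for a 3-class confusion matrix:
#
#   cm = np.array([[5, 1, 0],
#                  [2, 7, 1],
#                  [0, 2, 6]])
#   calculate_tp_tn_fp_fn(cm, 3)[0]
#   # -> {"TP": 5, "FP": 2, "FN": 1, "TN": 16}
#   # TP = cm[0, 0]; FP = column 0 sum - TP; FN = row 0 sum - TP;
#   # TN = 24 (grand total) - (5 + 2 + 1).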

# Training the model with the best set of parameters & calculate the evaluation
metrics
def train_and_save_metrics(X_train, y_train, X_test, y_test, modelName,
modelCategory, best_params, df, targetColumn,
modelId, hyperparameterTuning, workflowId, trainSize,
projectId, versionId, updatedAt, createdAt, keyMetrics):

# test_size = 100 - trainSize


# trainRatio_per = f"{trainSize}%"
# testRatio_per = f"{test_size}%"
#import pdb;pdb.set_trace()
from sklearn.svm import SVC
# Function to calculate specificity per class
def calculate_specificity(y_true, y_pred, num_classes):
cm = confusion_matrix(y_true, y_pred)
specificity_per_class = []
for i in range(num_classes):
tn = cm.sum() - (cm[i, :].sum() + cm[:, i].sum() - cm[i, i])
fp = cm[:, i].sum() - cm[i, i]
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
specificity_per_class.append(round(specificity, 2))
return specificity_per_class

# Build the model with the best hyperparameters


model = build_model(modelName, best_params)

# Train the model


model.fit(X_train, y_train)

# Predictions for test data


y_pred_test = model.predict(X_test)
# Predictions for training data
y_pred_train = model.predict(X_train)

# Check if it's binary classification or multi-class classification


is_binary = len(set(y_test)) == 2

# Function to calculate TP, TN, FP, FN for multi-class confusion matrix

# def datatype(obj):
# if isinstance(obj, (np.int64, np.int32)):
# return int(obj)

# Calculate confusion matrix for test data (multi-class classification)


cm_test = confusion_matrix(y_test, y_pred_test)

# Extract TP, TN, FP, FN for each class in the test set
num_classes = len(set(y_test)) # Number of classes in the test set
metrics_test = calculate_tp_tn_fp_fn(cm_test, num_classes)
print("----------", metrics_test)
# Store the confusion matrix and metrics for test data
metrics_test["confusionMatrix"] = cm_test.tolist()
print("----------", metrics_test["confusionMatrix"])

# Calculate confusion matrix for training data (multi-class classification)


cm_train = confusion_matrix(y_train, y_pred_train)

# Extract TP, TN, FP, FN for each class in the train set
metrics_train = calculate_tp_tn_fp_fn(cm_train, num_classes)
#import pdb;pdb.set_trace()
# Store the confusion matrix and metrics for training data
metrics_train["confusionMatrix"] = cm_train.tolist()

# Print the results for test and train data


print("Test Data Metrics:")
for cls, vals in metrics_test.items():
if cls != "confusionMatrix":
print(f"Class {cls}: TP = {vals['TP']}, TN = {vals['TN']}, FP =
{vals['FP']}, FN = {vals['FN']}")

print("\nTrain Data Metrics:")


for cls, vals in metrics_train.items():
if cls != "confusionMatrix":
print(f"Class {cls}: TP = {vals['TP']}, TN = {vals['TN']}, FP =
{vals['FP']}, FN = {vals['FN']}")

metrics_train = calculate_tp_tn_fp_fn(cm_train, num_classes)


metrics_test = calculate_tp_tn_fp_fn(cm_test, num_classes)

#12-11-2024
# Build the model with the best hyperparameters
# model = build_model(modelName, best_params)

# # Train the model


# model.fit(X_train, y_train)

# # Predictions for test data


# y_pred_test = model.predict(X_test)
# # Predictions for training data
# y_pred_train = model.predict(X_train)

# # added
# is_binary = len(set(y_test)) == 2

# # Calculate confusion matrix for test data (binary classification)


# cm_test = confusion_matrix(y_test, y_pred_test)
# #commenting to work for both bina nd multi
# #tn_test, fp_test, fn_test, tp_test = [int(x) for x in cm_test.ravel()]
# #added
# metrics_test = {
# "confusionMatrix": cm_test.tolist()
# # Convert to list for JSON compatibility
# }
# # Calculate confusion matrix for training data (binary classification)
# cm_train = confusion_matrix(y_train, y_pred_train)
# #tn_train, fp_train, fn_train, tp_train = [int(x) for x in cm_train.ravel()]
# #added
# metrics_train = {
# "confusionMatrix": cm_train.tolist()
# }

num_classes = len(set(y_train))
class_labels = [f'class{i}' for i in range(num_classes)]

#added
# Metrics for test data
metrics_test.update({
"accuracyScore": round(float(accuracy_score(y_test, y_pred_test)), 3),
"f1_per_class": [round(f1, 3) for f1 in f1_score(y_test, y_pred_test,
average=None)],
"precision_per_class": [round(prec, 3) for prec in precision_score(y_test,
y_pred_test, average=None)],
"recall_per_class": [round(rec, 3) for rec in recall_score(y_test,
y_pred_test, average=None)],

"specificity_per_class":{class_labels[i]: round(s, 3) for i, s in


enumerate(calculate_specificity(y_test, y_pred_test, num_classes))}

})

# Metrics for training data


metrics_train.update({
"accuracyScore": round(float(accuracy_score(y_train, y_pred_train)), 3),
"f1_per_class": [round(f1, 3) for f1 in f1_score(y_train, y_pred_train,
average=None)],
"precision_per_class": [round(prec, 3) for prec in precision_score(y_train,
y_pred_train, average=None)],
"recall_per_class": [round(rec, 3) for rec in recall_score(y_train,
y_pred_train, average=None)],

"specificity_per_class":{class_labels[i]: round(s, 3) for i, s in


enumerate(calculate_specificity(y_train, y_pred_train, num_classes))}
})

#adding code for hasattr


if hasattr(model, "predict_proba"):
y_prob_test = model.predict_proba(X_test)
y_prob_train = model.predict_proba(X_train)

if is_binary:
y_prob_test_binary = y_prob_test[:, 1]
y_prob_train_binary = y_prob_train[:, 1]
fpr_test, tpr_test,thresholds_test = roc_curve(y_test,
y_prob_test_binary)

valid_thresholds = []
for threshold in thresholds_test:
if np.isfinite(threshold):
valid_thresholds.append(round(float(threshold), 2))

metrics_test["aucRocCurve"] = {
"auc": round(roc_auc_score(y_test, y_prob_test_binary), 2),
"curve": [{"fpr": round(fpr, 2),
"tpr": round(tpr, 2),
"thresholds": threshold}
for fpr, tpr,threshold in zip(fpr_test,
tpr_test,valid_thresholds)][:10]}

else:
from sklearn.preprocessing import label_binarize
# Binarize labels for multi-class
classes = np.unique(np.concatenate([y_train, y_test]))
y_test_binarized = label_binarize(y_test, classes=classes)

metrics_test["aucRocCurve"] = [] # Initialize as a list

auc_scores = []
for i in range(y_prob_test.shape[1]):
auc_test = roc_auc_score(y_test_binarized[:, i], y_prob_test[:, i])
auc_scores.append(auc_test)

fpr_test, tpr_test, thresholds_test = roc_curve(y_test_binarized[:, i], y_prob_test[:, i])

valid_thresholds = []
for threshold in thresholds_test:
if np.isfinite(threshold):
valid_thresholds.append(round(float(threshold), 2))

metrics_test["aucRocCurve"].append({
"className": f"class_{i}", # Add className
"auc": round(auc_test, 2),
"data": [ # Rename "curve" to "data"
{
"fpr": round(fpr, 2),
"tpr": round(tpr, 2),
"thresholds": threshold
}
for fpr, tpr, threshold in zip(fpr_test, tpr_test,
valid_thresholds)
][:10] # Limit to the first 10 points
})

# Adding average_auc as a summary


average_auc = np.mean(auc_scores)
metrics_test["aucRocCurve"].append({
"className": "average_auc",
"auc": round(average_auc, 2)
})

# # Common Metrics
metrics_train["confusionMatrix"] = confusion_matrix(y_train,
model.predict(X_train)).tolist()
metrics_train["accuracyScore"] = accuracy_score(y_train,
model.predict(X_train))
metrics_train["precision_per_class"] = precision_score(y_train,
model.predict(X_train), average=None) # Per-class precision
metrics_train["f1_per_class"] = f1_score(y_train, model.predict(X_train),
average=None) # Per-class F1 score
metrics_train["recall_per_class"] = recall_score(y_train,
model.predict(X_train), average=None)
metrics_train["specificity_per_class"] = calculate_specificity(y_train,
model.predict(X_train), num_classes) # Per-class recall
# metrics_train["precision"] = precision_score(y_train, model.predict(X_train),
average="weighted")
# metrics_train["f1Score"] = f1_score(y_train, model.predict(X_train),
average="weighted")
# metrics_train["recall"] = recall_score(y_train, model.predict(X_train),
average="weighted")

metrics_test["confusionMatrix"] = confusion_matrix(y_test,
model.predict(X_test)).tolist()
metrics_test["accuracyScore"] = accuracy_score(y_test, model.predict(X_test))
metrics_test["precision_per_class"] = precision_score(y_test,
model.predict(X_test), average=None) # Per-class precision
metrics_test["f1_per_class"] = f1_score(y_test, model.predict(X_test),
average=None) # Per-class F1 score
metrics_test["recall_per_class"] = recall_score(y_test, model.predict(X_test),
average=None) # Per-class recall
metrics_test["specificity_per_class"] = calculate_specificity(y_test,
y_pred_test, num_classes)

# Calculate average metrics for training data


precision_avg_train = np.mean(metrics_train["precision_per_class"])
recall_avg_train = np.mean(metrics_train["recall_per_class"])
f1_avg_train = np.mean(metrics_train["f1_per_class"])
specificity_avg_train = np.mean(metrics_train["specificity_per_class"])

precision_avg_test = np.mean(metrics_test["precision_per_class"])
recall_avg_test = np.mean(metrics_test["recall_per_class"])
f1_avg_test = np.mean(metrics_test["f1_per_class"])
specificity_avg_test = np.mean(metrics_test["specificity_per_class"])

metrics_train.update({
"average_precision": round(precision_avg_train, 3),
"average_recall": round(recall_avg_train, 3),
"average_f1": round(f1_avg_train, 3),
"average_specificity": round(specificity_avg_train, 3)
})
metrics_test.update({
"average_precision": round(precision_avg_test, 3),
"average_recall": round(recall_avg_test, 3),
"average_f1": round(f1_avg_test, 3),
"average_specificity": round(specificity_avg_test, 3)
})

# Update response metrics to include TP, TN, FP, FN per class for train and test
metrics_test.update({
# "tp_tn_fp_fn_per_class": tp_tn_fp_fn_per_class
"tp_tn_fp_fn_per_class": metrics_test
})

metrics_train.update({
"tp_tn_fp_fn_per_class": metrics_train
})

# metrics_test["precision"] = precision_score(y_test, model.predict(X_test),


average="weighted")
# metrics_test["f1Score"] = f1_score(y_test, model.predict(X_test),
average="weighted")
# metrics_test["recall"] = recall_score(y_test, model.predict(X_test),
average="weighted")
auc_roc_curve = metrics_test.get("aucRocCurve", [])

# Validate that 'aucRocCurve' is a list before iterating


if isinstance(auc_roc_curve, list):
average_auc_entry = next((entry for entry in auc_roc_curve if
isinstance(entry, dict) and entry.get("className") == "average_auc"), None)
else:
average_auc_entry = None

# Extract the 'auc' value or set it to None if not found


average_auc_value = average_auc_entry["auc"] if average_auc_entry else None

# metrics_train = datatype(metrics_train)
# metrics_test = datatype(metrics_test)
# print("--------------------metrics_train--------------", type(metrics_train["tp_tn_fp_fn_per_class"][0]['TP']))
# if isinstance(metrics_test, (np.int64, np.int32)):
#     return int(metrics_test)
# if isinstance(metrics_train, (np.int64, np.int32)):
#     return int(metrics_train)

# The per-class TP/TN/FP/FN counts are numpy integers; they are cast to int in the
# response payload below so they can be JSON-serialised.

# train_classes_metrics = {
#     f"tp_tn_fp_fn_per_class{i}": {
#         "TP": int(metrics_train["tp_tn_fp_fn_per_class"][i]["TP"]),
#         "TN": int(metrics_train["tp_tn_fp_fn_per_class"][i]["TN"]),
#         "FP": int(metrics_train["tp_tn_fp_fn_per_class"][i]["FP"]),
#         "FN": int(metrics_train["tp_tn_fp_fn_per_class"][i]["FN"])
#     }
#     for i in range(len(metrics_train["tp_tn_fp_fn_per_class"]))
# }

# test_classes_metrics = {
#     f"tp_tn_fp_fn_per_class{i}": {
#         "TP": int(metrics_test["tp_tn_fp_fn_per_class"][i]["TP"]),
#         "TN": int(metrics_test["tp_tn_fp_fn_per_class"][i]["TN"]),
#         "FP": int(metrics_test["tp_tn_fp_fn_per_class"][i]["FP"]),
#         "FN": int(metrics_test["tp_tn_fp_fn_per_class"][i]["FN"])
#     }
#     for i in range(len(metrics_test["tp_tn_fp_fn_per_class"]))
# }

# Response format
accuracy_result = {
"trainReport": {
"metrics": [
{
"confusionMatrix": {
"matrix": metrics_train["confusionMatrix"]
},
"tp_tn_fp_fn_per_class0": {
"TP": int(metrics_train["tp_tn_fp_fn_per_class"][0]["TP"]),
"TN": int(metrics_train["tp_tn_fp_fn_per_class"][0]["TN"]),
"FP": int(metrics_train["tp_tn_fp_fn_per_class"][0]["FP"]),
"FN": int(metrics_train["tp_tn_fp_fn_per_class"][0]["FN"])
},
"accuracyScore": metrics_train["accuracyScore"],
# "precision": metrics_train["precision"],
# "f1Score": metrics_train["f1Score"],
# "recall": metrics_train["recall"],
"precision_per_class":
metrics_train["precision_per_class"].tolist(),
"recall_per_class": metrics_train["recall_per_class"].tolist(),
"f1_per_class": metrics_train["f1_per_class"].tolist(),
"specificity_per_class": metrics_train["specificity_per_class"],
#"tp_tn_fp_fn_per_class": metrics_train["tp_tn_fp_fn_per_class"],
"average_precision": metrics_train["average_precision"],
"average_recall": metrics_train["average_recall"],
"average_f1": metrics_train["average_f1"],
"average_specificity": metrics_train["average_specificity"],
"aucRocCurve": metrics_train.get("aucRocCurve", []), #
Only the first 10 values

"bestModel": False,
"modelId": modelId,
"hyperparameterTuning": hyperparameterTuning,
"modelCategory":modelCategory,
"modelName": modelName,

"metadata": {
"note": "Train report, first model"
},
# "_id": "train_id"
}
]
},
"testReport": {
"metrics": [
{
"confusionMatrix": {
"matrix": metrics_test["confusionMatrix"],

},
"tp_tn_fp_fn_per_class0": {
"TP": int(metrics_test["tp_tn_fp_fn_per_class"][0]["TP"]),
"TN": int(metrics_test["tp_tn_fp_fn_per_class"][0]["TN"]),
"FP": int(metrics_test["tp_tn_fp_fn_per_class"][0]["FP"]),
"FN": int(metrics_test["tp_tn_fp_fn_per_class"][0]["FN"])
},
"accuracyScore": metrics_test["accuracyScore"],
# "precision": metrics_test["precision"],
# "f1Score": metrics_test["f1Score"],
# "recall": metrics_test["recall"],
"precision_per_class":
metrics_test["precision_per_class"].tolist(),
"recall_per_class": metrics_test["recall_per_class"].tolist(),
"f1_per_class": metrics_test["f1_per_class"].tolist(),
"specificity_per_class": metrics_test["specificity_per_class"],
# "tp_tn_fp_fn_per_class": metrics_test["tp_tn_fp_fn_per_class"],
"average_precision": metrics_test["average_precision"],
"average_recall": metrics_test["average_recall"],
"average_f1": metrics_test["average_f1"],
"average_specificity": metrics_test["average_specificity"],
"aucRocCurve": metrics_test.get("aucRocCurve", []),
#"average_auc":
metrics_test["aucRocCurve"].get("average_auc"),
"average_auc": average_auc_value,
"bestModel": True,
"modelId": modelId,
"hyperparameterTuning": hyperparameterTuning,
"modelCategory":modelCategory,
"modelName": modelName,

"metadata": {
"note": "Test report, best model"
},
# "_id": "test_id"
}
]
},
# "_id": "6717b977cdd6733dfbd91477",
"workflowId": workflowId,
# "trainRatio": trainRatio_per,
# "testRatio": testRatio_per,
"projectId": projectId,
# "userId": userId,
# "jobId": "650deef95c34b36d9e12d71f",
"versionId": "V1.0", # Hardcoded for now, will be made
dynamic when the need arises
# "dataSources": dataSources,
"keyMetrics": keyMetrics,
"targetColumn": targetColumn,
"createdAt": createdAt,
"updatedAt": updatedAt,
# "__v": 0
}
print("-------------------- Payload defined
Successfully-------------------------------")

## Save results to S3
save_results_to_s3(accuracy_result, modelName)
print("-------------------- 111111111111-------------------------------")

# Return metrics
return accuracy_result

def save_results_to_s3(results, modelName):


# Convert results to JSON
results_json = json.dumps(results)

# Define the S3 object key


object_key = f"model_evaluation.json"

# Upload to S3
s3.put_object(Bucket=S3_BUCKET_NAME, Key=object_key, Body=results_json)
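# Note (assumption): the object key is fixed, so every run overwrites the same
# "model_evaluation.json" object in the bucket. A hypothetical per-model key could
# look like:
#
#   object_key = f"results/{modelName}_evaluation.json"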
