Multi Classification - Py (For 1 Class TP, TN, FP, FN)
import pandas as pd
import numpy as np
import random
import re
import os
import io
import json
import logging
from logging.handlers import RotatingFileHandler

import boto3  # needed for the S3 client below; missing from the original import list
from botocore.exceptions import ClientError
from fastapi import HTTPException, Response
from fastapi.responses import JSONResponse

# added
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, label_binarize
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, roc_curve, roc_auc_score)
from scipy.special import softmax
import optuna  # implied by the `trial` / `study` usage further down
logging.basicConfig(level=logging.INFO)
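# RotatingFileHandler is imported above but never attached in this excerpt. A minimal sketch
# of wiring it up -- the file name, size limit and backup count are assumptions, not from the source:
log_handler = RotatingFileHandler('training.log', maxBytes=5 * 1024 * 1024, backupCount=3)
log_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
logging.getLogger().addHandler(log_handler)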
# S3 Config
AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY')
AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY')
AWS_REGION = os.getenv('AWS_REGION')
S3_BUCKET_NAME = os.getenv('S3_BUCKET_NAME')
# S3 Client setup
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
    region_name=AWS_REGION
)
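# The branch below reads `s3_object` and `file_extension`, neither of which is defined in this
# excerpt. A minimal sketch of how they are typically obtained; `file_key` is a hypothetical
# name used only for illustration:
file_key = "path/to/dataset.csv"  # hypothetical S3 object key
s3_object = s3.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
file_extension = os.path.splitext(file_key)[1].lower()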
if file_extension == '.csv':
    dataset = pd.read_csv(io.StringIO(s3_object['Body'].read().decode('utf-8')))
elif file_extension in ['.xls', '.xlsx']:
    dataset = pd.read_excel(io.BytesIO(s3_object['Body'].read()))
else:
    raise ValueError(f"Unsupported file type: {file_extension}")
# dataset = pd.read_csv(s3_object['Body'])
dataset = label_encode_data(dataset, target_column)
return dataset_imputed  # NOTE: dataset_imputed comes from an imputation step not shown in this excerpt
except ClientError as e:
    error_code = e.response['Error']['Code']
    if error_code == 'NoSuchKey':
        raise ValueError("The specified file does not exist in the S3 bucket.")
    elif error_code == 'NoSuchBucket':
        raise ValueError("The specified S3 bucket does not exist.")
    elif error_code == 'AccessDenied':
        raise ValueError("Access denied to the S3 bucket or object.")
    else:
        # Log full details for unexpected ClientError scenarios
        raise ValueError(f"An unexpected error occurred with S3: {e.response['Error']['Message']}")
except ValueError as e:
    # Re-raise ValueError with the same message
    raise ValueError(str(e))
except Exception as e:
    # Handle other unexpected errors
    raise ValueError(f"An unexpected error occurred: {str(e)}")
#adding
# Label encode target column and any non-numeric features
def label_encode_data(df, targetColumn):
    # label_encoders = {}
    # Convert column names to lowercase for consistency
    # df.columns = df.columns.str.strip().str.lower()
    targetColumn = targetColumn.strip()
    return df  # , label_encoders
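# The body of label_encode_data above is largely commented out. A minimal sketch of what the
# encoding step usually looks like, assuming every object/category column (including the target)
# should be integer-encoded with LabelEncoder; names here are illustrative only:
def label_encode_columns(df, targetColumn):
    df = df.copy()
    for col in df.select_dtypes(include=['object', 'category']).columns:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    return df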
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('encoder', OneHotEncoder(handle_unknown='ignore'))
])
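# The `preprocessor` used below is not assembled in this excerpt. A minimal sketch of a
# ColumnTransformer pairing a numeric pipeline with the categorical pipeline above; selecting
# columns by dtype is an assumption for illustration:
numeric_features = X_train.select_dtypes(include=np.number).columns
categorical_features = X_train.select_dtypes(exclude=np.number).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])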
# Fit on training data and transform both train and test data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
if type == "ratio":
    # adding
    # Determine if the target variable is binary or multi-class
    unique_values = df[targetColumn].nunique()
    target_type = 'binary' if unique_values == 2 else 'multi'
    logging.info(f"Target variable '{targetColumn}' identified as '{target_type}' classification "
                 f"with {unique_values} unique values.")
    # Handle random_seed
    # if randomSeed == 'True':
    if randomSeed == 'true':
        randomSeed = 42  # Set to 42 if True
    else:
        randomSeed = None  # Disable randomness
    # ... (the opening of the split call is truncated here; a reconstruction follows this block)
        random_state=randomSeed,
        shuffle=shuffleData)
    print("train points are", y_train.head(10))
    return X_train, X_test, y_train, y_test
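# The keyword arguments above belong to a call whose opening is truncated in this excerpt.
# A hedged reconstruction of the usual ratio-based split, assuming `trainSize` is the train
# fraction and X / y are the feature matrix and target:
def split_dataset(X, y, trainSize, randomSeed, shuffleData):
    # Plain ratio split; stratify=y could be added for class balance if desired.
    return train_test_split(X, y,
                            train_size=trainSize,
                            random_state=randomSeed,
                            shuffle=shuffleData)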
if parameters:
    for key, value in parameters.items():
        # Handle different parameter types
        if isinstance(value, list):
            param_dict[key] = {'list': value}
        elif isinstance(value, (int, float)):
            if key == 'C':
                param_dict[key] = {'range': (value * 0.1, value * 10)}
            else:
                param_dict[key] = {'value': value}
        else:
            param_dict[key] = {'value': value}
if not model_match:
    raise ValueError(f"Model name '{modelName}' not recognized.")
# Extract the core model name from the match (in case of a version or extra words)
modelName_core = model_match.group(0).strip()
random_state = param_dict.get("random_state")
if isinstance(random_state, dict) and random_state.get('value') == 'null':
    random_state = None
elif random_state is None:
    random_state = np.random.RandomState(seed=42)
# SVM
if modelCategory == "Classification" and modelName_core == 'Svm':
# Define C_values to include a range as a tuple
C_values = param_dict.get('C', {}).get('range', (0.1, 10.0))
kernel_values = param_dict.get('kernel', {}).get('list', ['linear', 'rbf'])
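# The names used in `hyperparams` below (C, kernel, degree, gamma, ...) are not bound in this
# excerpt; with Optuna they would normally come from `trial.suggest_*` calls. A minimal sketch
# for the two search spaces defined above -- the remaining parameters would follow the same
# pattern and are left out here:
C = trial.suggest_float('C', C_values[0], C_values[1], log=True)
kernel = trial.suggest_categorical('kernel', kernel_values)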
hyperparams = {
'C': C,
'kernel': kernel,
'degree': degree,
'gamma': gamma,
'coef0': coef0,
'shrinking': shrinking,
'probability': probability,
'tol': tol,
'cache_size': cache_size,
'class_weight': class_weight,
'verbose': verbose,
'max_iter': max_iter,
'decision_function_shape': decision_function_shape,
'break_ties': break_ties,
'random_state': random_state
}
# Decision Tree
if modelCategory == "Classification" and modelName_core == 'Decision Tree':
    # Criterion values with a default value
    criterion_values = param_dict.get('criterion', {}).get('list') or ['gini', 'entropy']
    # Splitter values with a default value  # added
    splitter_values = param_dict.get('splitter', {}).get('list') or ['best', 'random']
return hyperparams
# Predictions
y_pred = model.predict(X_test)
if objectiveMetric == 'accuracy':
    score = round(accuracy_score(y_test, y_pred), 2)
elif objectiveMetric == 'f1':
    score = round(f1_score(y_test, y_pred, average='weighted'), 2)
elif objectiveMetric == 'precision':
    score = round(precision_score(y_test, y_pred, average='weighted'), 2)
elif objectiveMetric == 'recall':
    score = round(recall_score(y_test, y_pred, average='weighted'), 2)
else:
    raise ValueError(f"Unsupported objective metric: {objectiveMetric}")
return score
else:
raise ValueError(f"Unsupported algorithm category: {modelCategory}")
# return SVC(**hyperparams)
#return SVC(**(hyperparams or {})),model
def objective(trial):
    # Get hyperparameters using the updated function
    params = get_hyperparams(modelCategory, modelName, trial, model_params)
    # (model construction, fitting and the prediction of y_pred are elided in this excerpt)
    if objective_metric == 'accuracy':
        score = round(accuracy_score(y_test, y_pred), 2)
    elif objective_metric == 'f1':
        score = round(f1_score(y_test, y_pred, average='weighted'), 2)
    elif objective_metric == 'precision':
        score = round(precision_score(y_test, y_pred, average='weighted'), 2)
    elif objective_metric == 'recall':
        score = round(recall_score(y_test, y_pred, average='weighted'), 2)
    else:
        score = round(accuracy_score(y_test, y_pred), 2)  # default to accuracy
    return score
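# The `study` whose best_params are returned below is not created in this excerpt. A minimal
# sketch, assuming the objective should be maximised and using a hypothetical budget of 20
# trials (these lines would sit inside the enclosing tuning function, which is not shown):
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)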
return study.best_params
for i in range(num_classes):
    tp = cm[i, i]                   # True Positives: diagonal element for class i
    fp = cm[:, i].sum() - tp        # False Positives: sum of column i - TP
    fn = cm[i, :].sum() - tp        # False Negatives: sum of row i - TP
    tn = cm.sum() - (tp + fp + fn)  # True Negatives: total sum - (TP + FP + FN)
    metrics[i]["TP"] = tp
    metrics[i]["FP"] = fp
    metrics[i]["FN"] = fn
    metrics[i]["TN"] = tn
return metrics
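# calculate_specificity is called further down but never defined in this excerpt. A minimal
# sketch that derives per-class specificity, TN / (TN + FP), from the confusion matrix:
def calculate_specificity(y_true, y_pred, num_classes):
    cm = confusion_matrix(y_true, y_pred)
    specificities = []
    for i in range(num_classes):
        tp = cm[i, i]
        fp = cm[:, i].sum() - tp
        fn = cm[i, :].sum() - tp
        tn = cm.sum() - (tp + fp + fn)
        specificities.append(round(float(tn / (tn + fp)), 3) if (tn + fp) > 0 else 0.0)
    return specificities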
# Training the model with the best set of parameters & calculating the evaluation metrics
def train_and_save_metrics(X_train, y_train, X_test, y_test, modelName,
                           modelCategory, best_params, df, targetColumn,
                           modelId, hyperparameterTuning, workflowId, trainSize,
                           projectId, versionId, updatedAt, createdAt, keyMetrics):
# def datatype(obj):
# if isinstance(obj, (np.int64, np.int32)):
# return int(obj)
# Extract TP, TN, FP, FN for each class in the test set
num_classes = len(set(y_test)) # Number of classes in the test set
metrics_test = calculate_tp_tn_fp_fn(cm_test, num_classes)
print("----------", metrics_test)
# Store the confusion matrix and metrics for test data
metrics_test["confusionMatrix"] = cm_test.tolist()
print("----------", metrics_test["confusionMatrix"])
# Extract TP, TN, FP, FN for each class in the train set
metrics_train = calculate_tp_tn_fp_fn(cm_train, num_classes)
#import pdb;pdb.set_trace()
# Store the confusion matrix and metrics for training data
metrics_train["confusionMatrix"] = cm_train.tolist()
#12-11-2024
# Build the model with the best hyperparameters
# model = build_model(modelName, best_params)
# # added
# is_binary = len(set(y_test)) == 2
num_classes = len(set(y_train))
class_labels = [f'class{i}' for i in range(num_classes)]
#added
# Metrics for test data
metrics_test.update({
    "accuracyScore": round(float(accuracy_score(y_test, y_pred_test)), 3),
    "f1_per_class": [round(f1, 3) for f1 in f1_score(y_test, y_pred_test, average=None)],
    "precision_per_class": [round(prec, 3) for prec in precision_score(y_test, y_pred_test, average=None)],
    "recall_per_class": [round(rec, 3) for rec in recall_score(y_test, y_pred_test, average=None)],
})
if is_binary:
    y_prob_test_binary = y_prob_test[:, 1]
    y_prob_train_binary = y_prob_train[:, 1]
    fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_prob_test_binary)
    valid_thresholds = []
    for threshold in thresholds_test:
        if np.isfinite(threshold):
            valid_thresholds.append(round(float(threshold), 2))
    metrics_test["aucRocCurve"] = {
        "auc": round(roc_auc_score(y_test, y_prob_test_binary), 2),
        "curve": [{"fpr": round(fpr, 2),
                   "tpr": round(tpr, 2),
                   "thresholds": threshold}
                  for fpr, tpr, threshold in zip(fpr_test, tpr_test, valid_thresholds)][:10]}
else:
    # Binarize labels for multi-class (label_binarize is imported at the top)
    classes = np.unique(np.concatenate([y_train, y_test]))
    y_test_binarized = label_binarize(y_test, classes=classes)
    auc_scores = []
    metrics_test["aucRocCurve"] = []  # one entry per class
    for i in range(y_prob_test.shape[1]):
        auc_test = roc_auc_score(y_test_binarized[:, i], y_prob_test[:, i])
        auc_scores.append(auc_test)
        # One-vs-rest ROC curve for class i (the original reused thresholds_train here,
        # which was a bug: the test-set curve for class i is what is being reported)
        fpr_test, tpr_test, thresholds_test = roc_curve(y_test_binarized[:, i], y_prob_test[:, i])
        valid_thresholds = []
        for threshold in thresholds_test:
            if np.isfinite(threshold):
                valid_thresholds.append(round(float(threshold), 2))
        metrics_test["aucRocCurve"].append({
            "className": f"class_{i}",  # Add className
            "auc": round(auc_test, 2),
            "data": [  # Rename "curve" to "data"
                {
                    "fpr": round(fpr, 2),
                    "tpr": round(tpr, 2),
                    "thresholds": threshold
                }
                for fpr, tpr, threshold in zip(fpr_test, tpr_test, valid_thresholds)
            ][:10]  # Limit to the first 10 points
        })
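# `average_auc_value` is used in the response payload below but never computed in this excerpt.
# A minimal sketch, assuming it should be the macro average of the per-class AUCs in the
# multi-class case, or simply the single AUC already stored above in the binary case:
if is_binary:
    average_auc_value = metrics_test["aucRocCurve"]["auc"]
else:
    average_auc_value = round(float(np.mean(auc_scores)), 2)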
# # Common Metrics
metrics_train["confusionMatrix"] = confusion_matrix(y_train, model.predict(X_train)).tolist()
metrics_train["accuracyScore"] = accuracy_score(y_train, model.predict(X_train))
metrics_train["precision_per_class"] = precision_score(y_train, model.predict(X_train), average=None)  # Per-class precision
metrics_train["f1_per_class"] = f1_score(y_train, model.predict(X_train), average=None)  # Per-class F1 score
metrics_train["recall_per_class"] = recall_score(y_train, model.predict(X_train), average=None)  # Per-class recall
metrics_train["specificity_per_class"] = calculate_specificity(y_train, model.predict(X_train), num_classes)  # Per-class specificity
# metrics_train["precision"] = precision_score(y_train, model.predict(X_train), average="weighted")
# metrics_train["f1Score"] = f1_score(y_train, model.predict(X_train), average="weighted")
# metrics_train["recall"] = recall_score(y_train, model.predict(X_train), average="weighted")
metrics_test["confusionMatrix"] = confusion_matrix(y_test, model.predict(X_test)).tolist()
metrics_test["accuracyScore"] = accuracy_score(y_test, model.predict(X_test))
metrics_test["precision_per_class"] = precision_score(y_test, model.predict(X_test), average=None)  # Per-class precision
metrics_test["f1_per_class"] = f1_score(y_test, model.predict(X_test), average=None)  # Per-class F1 score
metrics_test["recall_per_class"] = recall_score(y_test, model.predict(X_test), average=None)  # Per-class recall
metrics_test["specificity_per_class"] = calculate_specificity(y_test, y_pred_test, num_classes)
# Macro averages for the train split (these names are used below but were not computed in the original excerpt)
precision_avg_train = np.mean(metrics_train["precision_per_class"])
recall_avg_train = np.mean(metrics_train["recall_per_class"])
f1_avg_train = np.mean(metrics_train["f1_per_class"])
specificity_avg_train = np.mean(metrics_train["specificity_per_class"])
# Macro averages for the test split
precision_avg_test = np.mean(metrics_test["precision_per_class"])
recall_avg_test = np.mean(metrics_test["recall_per_class"])
f1_avg_test = np.mean(metrics_test["f1_per_class"])
specificity_avg_test = np.mean(metrics_test["specificity_per_class"])
metrics_train.update({
"average_precision": round(precision_avg_train, 3),
"average_recall": round(recall_avg_train, 3),
"average_f1": round(f1_avg_train, 3),
"average_specificity": round(specificity_avg_train, 3)
})
metrics_test.update({
"average_precision": round(precision_avg_test, 3),
"average_recall": round(recall_avg_test, 3),
"average_f1": round(f1_avg_test, 3),
"average_specificity": round(specificity_avg_test, 3)
})
# Update response metrics to include TP, TN, FP, FN per class for train and test.
# Note: this stores a self-reference, so the per-class entries (integer keys 0..num_classes-1)
# remain reachable under "tp_tn_fp_fn_per_class" as well.
metrics_test.update({
    # "tp_tn_fp_fn_per_class": tp_tn_fp_fn_per_class
    "tp_tn_fp_fn_per_class": metrics_test
})
metrics_train.update({
    "tp_tn_fp_fn_per_class": metrics_train
})
# metrics_train = datatype(metrics_train)
# metrics_test = datatype(metrics_test)
# print("--------------------metrics_train--------------", type(metrics_train["tp_tn_fp_fn_per_class"][0]['TP']))
# if isinstance(metrics_test, (np.int64, np.int32)):
#     return int(metrics_test)
# if isinstance(metrics_train, (np.int64, np.int32)):
#     return int(metrics_train)
# metrics_train["tp_tn_fp_fn_per_class"][0]['TP'] = int(metrics_train["tp_tn_fp_fn_per_class"][0]['TP'])
# metrics_train["tp_tn_fp_fn_per_class"][0]['TP'] = json.dumps(metrics_train["tp_tn_fp_fn_per_class"][0]['TP'])
# metrics_train["tp_tn_fp_fn_per_class"][0]['TN'] = int(metrics_train["tp_tn_fp_fn_per_class"][0]['TN'])
# metrics_train["tp_tn_fp_fn_per_class"][0]['TN'] = json.dumps(metrics_train["tp_tn_fp_fn_per_class"][0]['TN'])
# metrics_train["tp_tn_fp_fn_per_class"][0]['FP'] = int(metrics_train["tp_tn_fp_fn_per_class"][0]['FP'])
# metrics_train["tp_tn_fp_fn_per_class"][0]['FP'] = json.dumps(metrics_train["tp_tn_fp_fn_per_class"][0]['FP'])
# metrics_train["tp_tn_fp_fn_per_class"][0]['FN'] = int(metrics_train["tp_tn_fp_fn_per_class"][0]['FN'])
# metrics_train["tp_tn_fp_fn_per_class"][0]['FN'] = json.dumps(metrics_train["tp_tn_fp_fn_per_class"][0]['FN'])
# metrics_train["tp_tn_fp_fn_per_class"][0]['TP'] = int(metrics_train["tp_tn_fp_fn_per_class"][0]['TP'])
# metrics_train["tp_tn_fp_fn_per_class"][0]['TP'] = int(metrics_train["tp_tn_fp_fn_per_class"][0]['TN'])
# metrics_train["tp_tn_fp_fn_per_class"][0]['TP'] = int(metrics_train["tp_tn_fp_fn_per_class"][0]['FP'])
# metrics_train["tp_tn_fp_fn_per_class"][0]['TP'] = int(metrics_train["tp_tn_fp_fn_per_class"][0]['FN'])
# metrics_test["tp_tn_fp_fn_per_class"][0]['TP'] = int(metrics_test["tp_tn_fp_fn_per_class"][0]['TP'])
# metrics_test["tp_tn_fp_fn_per_class"][0]['TP'] = int(metrics_test["tp_tn_fp_fn_per_class"][0]['TN'])
# metrics_test["tp_tn_fp_fn_per_class"][0]['TP'] = int(metrics_test["tp_tn_fp_fn_per_class"][0]['FP'])
# metrics_test["tp_tn_fp_fn_per_class"][0]['TP'] = int(metrics_test["tp_tn_fp_fn_per_class"][0]['FN'])
# print("--------------------metrics_train--------------", type(metrics_train["tp_tn_fp_fn_per_class"][0]['TP']))
# print("--------------------metrics_train--------------", metrics_train["tp_tn_fp_fn_per_class"][0]['TP'])
# print("--------------------metrics_train--------------", metrics_train["tp_tn_fp_fn_per_class"][0]['TN'])
# print("--------------------metrics_train--------------", metrics_train["tp_tn_fp_fn_per_class"][0]['FP'])
# print("--------------------metrics_train--------------", metrics_train["tp_tn_fp_fn_per_class"][0]['FN'])
# train_classes_metrics = {
# f"tp_tn_fp_fn_per_class{i}": {
# "TP": int(metrics_train["tp_tn_fp_fn_per_class"][i]["TP"]),
# "TN": int(metrics_train["tp_tn_fp_fn_per_class"][i]["TN"]),
# "FP": int(metrics_train["tp_tn_fp_fn_per_class"][i]["FP"]),
# "FN": int(metrics_train["tp_tn_fp_fn_per_class"][i]["FN"])
# }
# for i in range(len(metrics_train["tp_tn_fp_fn_per_class"]))
# }
# test_classes_metrics = {
# f"tp_tn_fp_fn_per_class{i}": {
# "TP": int(metrics_test["tp_tn_fp_fn_per_class"][i]["TP"]),
# "TN": int(metrics_test["tp_tn_fp_fn_per_class"][i]["TN"]),
# "FP": int(metrics_test["tp_tn_fp_fn_per_class"][i]["FP"]),
# "FN": int(metrics_test["tp_tn_fp_fn_per_class"][i]["FN"])
# }
# for i in range(len(metrics_test["tp_tn_fp_fn_per_class"]))
# }
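# The commented-out lines above wrestle with numpy scalar types that json cannot serialise.
# A minimal sketch of a recursive converter as an alternative to the per-key int(...) casts used
# in the response below. Note: metrics_train / metrics_test contain a self-reference under
# "tp_tn_fp_fn_per_class", so apply this to sub-structures (e.g. the per-class dicts) rather than
# to the whole dict, or drop that key first:
def to_native(obj):
    if isinstance(obj, dict):
        return {k: to_native(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [to_native(v) for v in obj]
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj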
# Response format
accuracy_result = {
"trainReport": {
"metrics": [
{
"confusionMatrix": {
"matrix": metrics_train["confusionMatrix"]
},
"tp_tn_fp_fn_per_class0": {
"TP": int(metrics_train["tp_tn_fp_fn_per_class"][0]["TP"]),
"TN": int(metrics_train["tp_tn_fp_fn_per_class"][0]["TN"]),
"FP": int(metrics_train["tp_tn_fp_fn_per_class"][0]["FP"]),
"FN": int(metrics_train["tp_tn_fp_fn_per_class"][0]["FN"])
},
"accuracyScore": metrics_train["accuracyScore"],
# "precision": metrics_train["precision"],
# "f1Score": metrics_train["f1Score"],
# "recall": metrics_train["recall"],
"precision_per_class":
metrics_train["precision_per_class"].tolist(),
"recall_per_class": metrics_train["recall_per_class"].tolist(),
"f1_per_class": metrics_train["f1_per_class"].tolist(),
"specificity_per_class": metrics_train["specificity_per_class"],
#"tp_tn_fp_fn_per_class": metrics_train["tp_tn_fp_fn_per_class"],
"average_precision": metrics_train["average_precision"],
"average_recall": metrics_train["average_recall"],
"average_f1": metrics_train["average_f1"],
"average_specificity": metrics_train["average_specificity"],
"aucRocCurve": metrics_train.get("aucRocCurve", []), #
Only the first 10 values
"bestModel": False,
"modelId": modelId,
"hyperparameterTuning": hyperparameterTuning,
"modelCategory":modelCategory,
"modelName": modelName,
"metadata": {
"note": "Train report, first model"
},
# "_id": "train_id"
}
]
},
"testReport": {
"metrics": [
{
"confusionMatrix": {
"matrix": metrics_test["confusionMatrix"],
},
"tp_tn_fp_fn_per_class0": {
"TP": int(metrics_test["tp_tn_fp_fn_per_class"][0]["TP"]),
"TN": int(metrics_test["tp_tn_fp_fn_per_class"][0]["TN"]),
"FP": int(metrics_test["tp_tn_fp_fn_per_class"][0]["FP"]),
"FN": int(metrics_test["tp_tn_fp_fn_per_class"][0]["FN"])
},
"accuracyScore": metrics_test["accuracyScore"],
# "precision": metrics_test["precision"],
# "f1Score": metrics_test["f1Score"],
# "recall": metrics_test["recall"],
"precision_per_class":
metrics_test["precision_per_class"].tolist(),
"recall_per_class": metrics_test["recall_per_class"].tolist(),
"f1_per_class": metrics_test["f1_per_class"].tolist(),
"specificity_per_class": metrics_test["specificity_per_class"],
# "tp_tn_fp_fn_per_class": metrics_test["tp_tn_fp_fn_per_class"],
"average_precision": metrics_test["average_precision"],
"average_recall": metrics_test["average_recall"],
"average_f1": metrics_test["average_f1"],
"average_specificity": metrics_test["average_specificity"],
"aucRocCurve": metrics_test.get("aucRocCurve", []),
#"average_auc":
metrics_test["aucRocCurve"].get("average_auc"),
"average_auc": average_auc_value,
"bestModel": True,
"modelId": modelId,
"hyperparameterTuning": hyperparameterTuning,
"modelCategory":modelCategory,
"modelName": modelName,
"metadata": {
"note": "Test report, best model"
},
# "_id": "test_id"
}
]
},
# "_id": "6717b977cdd6733dfbd91477",
"workflowId": workflowId,
# "trainRatio": trainRatio_per,
# "testRatio": testRatio_per,
"projectId": projectId,
# "userId": userId,
# "jobId": "650deef95c34b36d9e12d71f",
"versionId": "V1.0", # Hardcoded for now, will be made
dynamic when the need arises
# "dataSources": dataSources,
"keyMetrics": keyMetrics,
"targetColumn": targetColumn,
"createdAt": createdAt,
"updatedAt": updatedAt,
# "__v": 0
}
print("-------------------- Payload defined
Successfully-------------------------------")
## Save results to S3
save_results_to_s3(accuracy_result, modelName)
print("-------------------- 111111111111-------------------------------")
# Return metrics
return accuracy_result
# Upload to S3
s3.put_object(Bucket=S3_BUCKET_NAME, Key=object_key, Body=results_json)
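# save_results_to_s3 is called above but only its final put_object line survives in this excerpt.
# A minimal sketch of the surrounding function; the key layout and the default=str fallback for
# non-JSON-serialisable values are assumptions for illustration:
def save_results_to_s3(results, model_name):
    object_key = f"results/{model_name}_metrics.json"  # hypothetical key layout
    results_json = json.dumps(results, default=str)
    s3.put_object(Bucket=S3_BUCKET_NAME, Key=object_key, Body=results_json)
    logging.info(f"Saved metrics for {model_name} to s3://{S3_BUCKET_NAME}/{object_key}")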