0% found this document useful (0 votes)

13 views35 pages

Experiment01 Baseline Models Accuracy

Uploaded by

sumitpatelreso

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

13 views35 pages

Experiment01 Baseline Models Accuracy

Uploaded by

sumitpatelreso

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 35

!python --version

Python 3.10.13

%load_ext autoreload
%autoreload 2

import os, time, datetime

import json, pickle
from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import tree based models

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# training/testing utils
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight,
compute_sample_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report,
roc_auc_score
from sklearn.utils.class_weight import compute_sample_weight,
compute_class_weight

# Symbolic regression
from gplearn.genetic import SymbolicClassifier

Load and prepare data

# Read the filtered diet dataset; the first CSV column becomes the row index.
data = pd.read_csv('../data/data_diet_filtered.csv', index_col=0)

# Replace the disease names by integer codes. To get back the original
# labels, do "enc_values[data.disease]".
disease_codes, enc_values = pd.factorize(data.disease)
data['disease'] = disease_codes

# Target and metadata columns that must not be used as features.
drop_col = ['index', 'disease', 'subject_id', 'gender',
            'country', 'age_category', 'diet']
X = data.drop(columns=drop_col)
y = data['disease'].to_numpy()

# coded label to names
class_map = dict(enumerate(list(enc_values.values)))
# number of samples per class name
class_counts = {name: len(y[y == code]) for code, name in class_map.items()}
assert sum(class_counts.values()) == len(X), \
    'total #samples not matching when summing for each class'

print(data.shape, X.shape, y.shape, class_map, class_counts)

(14560, 756) (14560, 749) (14560,) {0: 'healthy', 1: 'IBD', 2: 'CRC',

3: 'adenoma', 4: 'T2D'} {'healthy': 10761, 'IBD': 1736, 'CRC': 701,
'adenoma': 209, 'T2D': 1153}

/var/folders/qp/4w02sqhj6_d43815lhzjb7900000gn/T/
ipykernel_73514/129412594.py:1: DtypeWarning: Columns (756) have mixed
types. Specify dtype option on import or set low_memory=False.
data = pd.read_csv('../data/data_diet_filtered.csv', index_col=0)

# only take normalized rows!
# A row is kept when its feature values sum to more than 99
# (presumably percentages that should add up to ~100 -- TODO confirm).
normalized_idx = (X.sum(axis=1) > 99)

# Filter y with the boolean mask itself. The previous "y[X.index]" used the
# DataFrame's index *labels* as positions into the NumPy array, which is only
# correct when the labels happen to be 0..n-1 in row order.
y = y[normalized_idx.to_numpy()]
X = X[normalized_idx]

print(f"X.shape: {X.shape}, y.shape: {y.shape}")

X.shape: (14156, 749), y.shape: (14156,)

from srmb.utils import calculate_metrics

Healthy vs. CRC classification

For the subsequent analysis, we will only choose the healthy and CRC patients from the dataset.

from srmb.fitness_functions import customacc

from srmb.special_functions import (
presence, absence, add3, add10, ifelse, ifelseless,
presence2, absence2,
presence3, absence3
)

Error exception: Only one class present in y_true. ROC AUC score is
not defined in that case.

# Binary task: healthy (code 0) against a single disease class (code k).
k = 2
idxs = np.isin(y, (0, k))  # get healthy and that class' data

X1 = X.iloc[idxs]
# The selected labels are either 0 or k, so comparing with k relabels
# k --> 1 (the disease class) and leaves 0 --> healthy.
y1 = (y[idxs] == k).astype(y.dtype)

print(f'doing for class = {class_map[k]}, {X1.shape=}, {y1.shape=}, '
      f'#{class_map[k]} samples = {y1.sum()}')

doing for class = CRC, X1.shape=(11137, 749), y1.shape=(11137,), #CRC

samples = 664
1. Experiments with undersampling the healthy class
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

def store_result(key, acc, f1, auroc):
    """Append one trial's metrics to the global per-model histories.

    ``key`` selects the model entry; ``acc``, ``f1`` and ``auroc`` are
    appended to ``ACCURACIES[key]``, ``F1SCORES[key]`` and ``AUCROCS[key]``
    respectively.
    """
    for history, value in ((ACCURACIES, acc), (F1SCORES, f1), (AUCROCS, auroc)):
        history[key].append(value)

def convert_to_arrays(prec=4):
    """Turn every metric history into a NumPy array rounded to ``prec`` decimals.

    The global dicts ``ACCURACIES``, ``F1SCORES`` and ``AUCROCS`` are updated
    in place for each name in ``MODEL_NAMES``.
    """
    for history in (ACCURACIES, F1SCORES, AUCROCS):
        for key in MODEL_NAMES:
            history[key] = np.around(np.asarray(history[key]), prec)

# One seed per trial (20 trials in total).
RANDOM_SEEDS_FOR_UNDERSAMPLING = [
    42, 2024, 1234, 2405, 11, 9345, 858, 8590, 4754, 1959,
    707, 10524, 83946, 63297, 78035, 22664, 49283, 35253, 82273, 90378,
]
MODEL_NAMES = ['LR', 'DT', 'RF', 'XG', 'SR', 'SRf']

# Metric histories keyed by model name; each model gets its own fresh list.
ACCURACIES, F1SCORES, AUCROCS = (
    {name: [] for name in MODEL_NAMES} for _ in range(3)
)

# Fitted symbolic classifiers, one per trial (vanilla / with special functions).
SRmodels, SRfmodels = [], []

USE_BALANCED_SUBSAMPLE = True  # perform undersampling of healthy classes

# Settings that do not change between trials are built once, outside the loop.
vanilla_function_set = ['add', 'sub', 'mul', 'div', 'neg', 'max', 'min',
                        'sqrt', 'log']
special_functions = [presence, absence, presence2, absence2,
                     ifelse]  # , add3, add10]
special_function_set = (['add', 'sub', 'mul', 'div', 'neg', 'max', 'min']
                        + special_functions)

# Configuration shared by both symbolic classifiers; only the function set differs.
sr_common_kwargs = dict(
    population_size=6000,
    generations=20,
    tournament_size=25,
    init_depth=(2, 6),
    const_range=(0., 100.),
    # init_method="full",
    parsimony_coefficient=0.001,
    stopping_criteria=1.0,
    metric=customacc,  # use custom acc as fitness
    feature_names=X1.columns.to_list(),
    # verbose=True,
    random_state=42,
)

for random_state in RANDOM_SEEDS_FOR_UNDERSAMPLING:
    print(f'seed={random_state}')

    if not USE_BALANCED_SUBSAMPLE:
        # Use the full imbalanced data; the split itself varies with the seed.
        X_train, X_test, y_train, y_test = train_test_split(
            X1, y1,
            test_size=0.25,
            # train_size=0.5,  # if slow use this
            random_state=random_state, stratify=y1)
        class_weight = compute_class_weight(class_weight='balanced',
                                            classes=np.unique(y1), y=y1)
    else:
        # Randomly drop healthy samples; the undersampling is only needed on
        # this branch, so it is no longer computed when the flag is off.
        rus = RandomUnderSampler(sampling_strategy=0.85,  # this is another hyperparameter
                                 random_state=random_state)
        X1b, y1b = rus.fit_resample(X1, y1)
        # print(sorted(Counter(y1b).items()))

        # The split seed is fixed at 42 here, so only the undersampling draw
        # changes from trial to trial.
        X_train, X_test, y_train, y_test = train_test_split(
            X1b, y1b,
            test_size=0.25,
            # train_size=0.5,  # if slow use this
            random_state=42, stratify=y1b)
        class_weight = compute_class_weight(class_weight='balanced',
                                            classes=np.unique(y1b), y=y1b)

    # Labels are 0/1, so enumerating the weights yields the {label: weight} mapping.
    class_weight_map = dict(enumerate(class_weight))
    sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

    # logistic regression classifier
    # This step was missing from the cell although 'LR' is listed in
    # MODEL_NAMES and summarised afterwards; it mirrors the ablation cell.
    model_lr = LogisticRegression(max_iter=500, class_weight=class_weight_map)
    model_lr.fit(X_train, y_train)
    store_result('LR', *calculate_metrics(model_lr, X_train, y_train,
                                          X_test, y_test))

    # Decision tree classifier
    model_dt = DecisionTreeClassifier(max_depth=5, class_weight=class_weight_map)
    model_dt.fit(X_train, y_train)
    store_result('DT', *calculate_metrics(model_dt, X_train, y_train,
                                          X_test, y_test))

    # Random forest classifier
    model_rf = RandomForestClassifier(n_estimators=50, class_weight=class_weight_map)
    model_rf.fit(X_train, y_train)
    store_result('RF', *calculate_metrics(model_rf, X_train, y_train,
                                          X_test, y_test))

    # XGBoost classifier (binary objective); it is balanced through
    # per-sample weights instead of a class_weight argument.
    model_xg = XGBClassifier(n_estimators=50, max_depth=5,
                             learning_rate=0.1, objective='binary:logistic')
    model_xg.fit(X_train, y_train, sample_weight=sample_weights)
    store_result('XG', *calculate_metrics(model_xg, X_train, y_train,
                                          X_test, y_test))

    # Symbolic classifiers: vanilla function set first ('SR'), then the one
    # extended with the special functions ('SRf').
    for sr_key, function_set, fitted_models in (
            ('SR', vanilla_function_set, SRmodels),
            ('SRf', special_function_set, SRfmodels)):
        est = SymbolicClassifier(function_set=function_set, **sr_common_kwargs)

        t0 = time.time()
        est.fit(X_train, y_train)
        print('Time to fit symbolic classifier:', time.time() - t0, 'seconds')
        store_result(sr_key, *calculate_metrics(est, X_train, y_train,
                                                X_test, y_test))
        fitted_models.append(est)

seed=42

/Users/swagatam/miniconda3/envs/hiwi/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7265 Test AUROC: 0.7981 Test F1 score: 0.7009
DecisionTreeClassifier
Test accuracy: 0.7514 Test AUROC: 0.8186 Test F1 score: 0.7078
RandomForestClassifier
Test accuracy: 0.8011 Test AUROC: 0.9015 Test F1 score: 0.7647
XGBClassifier
Test accuracy: 0.8260 Test AUROC: 0.9131 Test F1 score: 0.8013
Time to fit symbolic classifier: 42.2771680355072 seconds
SymbolicClassifier
Test accuracy: 0.7431 Test AUROC: 0.7488 Test F1 score: 0.6847
Time to fit symbolic classifier: 41.15023899078369 seconds
SymbolicClassifier
Test accuracy: 0.7514 Test AUROC: 0.7579 Test F1 score: 0.6939
seed=2024

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7210 Test AUROC: 0.7726 Test F1 score: 0.6967
DecisionTreeClassifier
Test accuracy: 0.7182 Test AUROC: 0.7402 Test F1 score: 0.6982
RandomForestClassifier
Test accuracy: 0.8122 Test AUROC: 0.8961 Test F1 score: 0.7792
XGBClassifier
Test accuracy: 0.8177 Test AUROC: 0.9159 Test F1 score: 0.7911
Time to fit symbolic classifier: 41.51435089111328 seconds
SymbolicClassifier
Test accuracy: 0.7569 Test AUROC: 0.7640 Test F1 score: 0.6966
Time to fit symbolic classifier: 41.539920806884766 seconds
SymbolicClassifier
Test accuracy: 0.7541 Test AUROC: 0.7641 Test F1 score: 0.6962
seed=1234

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7155 Test AUROC: 0.7885 Test F1 score: 0.6997
DecisionTreeClassifier
Test accuracy: 0.7845 Test AUROC: 0.8185 Test F1 score: 0.7365
RandomForestClassifier
Test accuracy: 0.8287 Test AUROC: 0.9015 Test F1 score: 0.7947
XGBClassifier
Test accuracy: 0.8398 Test AUROC: 0.9181 Test F1 score: 0.8165
Time to fit symbolic classifier: 43.203129053115845 seconds
SymbolicClassifier
Test accuracy: 0.7403 Test AUROC: 0.7617 Test F1 score: 0.7006
Time to fit symbolic classifier: 44.14738321304321 seconds
SymbolicClassifier
Test accuracy: 0.7652 Test AUROC: 0.7692 Test F1 score: 0.7079
seed=2405
/Users/swagatam/miniconda3/envs/hiwi/lib/python3.10/site-packages/
sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs
failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7818 Test AUROC: 0.8474 Test F1 score: 0.7508
DecisionTreeClassifier
Test accuracy: 0.7348 Test AUROC: 0.7518 Test F1 score: 0.6667
RandomForestClassifier
Test accuracy: 0.8122 Test AUROC: 0.8982 Test F1 score: 0.7655
XGBClassifier
Test accuracy: 0.8177 Test AUROC: 0.9079 Test F1 score: 0.7925
Time to fit symbolic classifier: 43.750773906707764 seconds
SymbolicClassifier
Test accuracy: 0.7459 Test AUROC: 0.7500 Test F1 score: 0.6913
Time to fit symbolic classifier: 43.03351879119873 seconds
SymbolicClassifier
Test accuracy: 0.7486 Test AUROC: 0.7465 Test F1 score: 0.6851
seed=11

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7238 Test AUROC: 0.7970 Test F1 score: 0.7059
DecisionTreeClassifier
Test accuracy: 0.7597 Test AUROC: 0.7982 Test F1 score: 0.7307
RandomForestClassifier
Test accuracy: 0.7707 Test AUROC: 0.8785 Test F1 score: 0.7314
XGBClassifier
Test accuracy: 0.8232 Test AUROC: 0.9068 Test F1 score: 0.8012
Time to fit symbolic classifier: 43.38354301452637 seconds
SymbolicClassifier
Test accuracy: 0.7597 Test AUROC: 0.7643 Test F1 score: 0.7010
Time to fit symbolic classifier: 43.27070212364197 seconds
SymbolicClassifier
Test accuracy: 0.7155 Test AUROC: 0.7301 Test F1 score: 0.6532
seed=9345

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.6989 Test AUROC: 0.7772 Test F1 score: 0.6625
DecisionTreeClassifier
Test accuracy: 0.7155 Test AUROC: 0.7750 Test F1 score: 0.6485
RandomForestClassifier
Test accuracy: 0.8039 Test AUROC: 0.8891 Test F1 score: 0.7641
XGBClassifier
Test accuracy: 0.8370 Test AUROC: 0.9182 Test F1 score: 0.8103
Time to fit symbolic classifier: 43.272748947143555 seconds
SymbolicClassifier
Test accuracy: 0.7459 Test AUROC: 0.7491 Test F1 score: 0.6892
Time to fit symbolic classifier: 42.66357135772705 seconds
SymbolicClassifier
Test accuracy: 0.7486 Test AUROC: 0.7533 Test F1 score: 0.6915
seed=858

Increase the number of iterations (max_iter) or scale the data as

LogisticRegression
Test accuracy: 0.7624 Test AUROC: 0.8271 Test F1 score: 0.7346
DecisionTreeClassifier
Test accuracy: 0.7735 Test AUROC: 0.7956 Test F1 score: 0.7500
RandomForestClassifier
Test accuracy: 0.8315 Test AUROC: 0.9196 Test F1 score: 0.7973
XGBClassifier
Test accuracy: 0.8204 Test AUROC: 0.9280 Test F1 score: 0.7855
Time to fit symbolic classifier: 43.11080884933472 seconds
SymbolicClassifier
Test accuracy: 0.7569 Test AUROC: 0.7890 Test F1 score: 0.6986
Time to fit symbolic classifier: 43.20073890686035 seconds
SymbolicClassifier
Test accuracy: 0.7348 Test AUROC: 0.7411 Test F1 score: 0.6643
seed=8590

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7403 Test AUROC: 0.8040 Test F1 score: 0.7134
DecisionTreeClassifier
Test accuracy: 0.7514 Test AUROC: 0.7889 Test F1 score: 0.6897
RandomForestClassifier
Test accuracy: 0.8122 Test AUROC: 0.9011 Test F1 score: 0.7792
XGBClassifier
Test accuracy: 0.8232 Test AUROC: 0.9098 Test F1 score: 0.7987
Time to fit symbolic classifier: 42.94043493270874 seconds
SymbolicClassifier
Test accuracy: 0.7541 Test AUROC: 0.7544 Test F1 score: 0.6899
Time to fit symbolic classifier: 43.01768684387207 seconds
SymbolicClassifier
Test accuracy: 0.7486 Test AUROC: 0.7573 Test F1 score: 0.6915
seed=4754
/Users/swagatam/miniconda3/envs/hiwi/lib/python3.10/site-packages/
sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs
failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7762 Test AUROC: 0.8377 Test F1 score: 0.7461
DecisionTreeClassifier
Test accuracy: 0.7348 Test AUROC: 0.7938 Test F1 score: 0.6620
RandomForestClassifier
Test accuracy: 0.8260 Test AUROC: 0.9111 Test F1 score: 0.7921
XGBClassifier
Test accuracy: 0.8315 Test AUROC: 0.9308 Test F1 score: 0.8039
Time to fit symbolic classifier: 43.672722816467285 seconds
SymbolicClassifier
Test accuracy: 0.7762 Test AUROC: 0.7759 Test F1 score: 0.7138
Time to fit symbolic classifier: 43.33226490020752 seconds
SymbolicClassifier
Test accuracy: 0.7597 Test AUROC: 0.7694 Test F1 score: 0.7031
seed=1959

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7624 Test AUROC: 0.8320 Test F1 score: 0.7312
DecisionTreeClassifier
Test accuracy: 0.6878 Test AUROC: 0.7568 Test F1 score: 0.6744
RandomForestClassifier
Test accuracy: 0.8287 Test AUROC: 0.8998 Test F1 score: 0.8050
XGBClassifier
Test accuracy: 0.7928 Test AUROC: 0.8996 Test F1 score: 0.7734
Time to fit symbolic classifier: 42.98330283164978 seconds
SymbolicClassifier
Test accuracy: 0.7486 Test AUROC: 0.7580 Test F1 score: 0.6915
Time to fit symbolic classifier: 44.140511989593506 seconds
SymbolicClassifier
Test accuracy: 0.7652 Test AUROC: 0.7697 Test F1 score: 0.7157
seed=707

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7624 Test AUROC: 0.8203 Test F1 score: 0.7394
DecisionTreeClassifier
Test accuracy: 0.7569 Test AUROC: 0.8137 Test F1 score: 0.6667
RandomForestClassifier
Test accuracy: 0.8260 Test AUROC: 0.9097 Test F1 score: 0.7921
XGBClassifier
Test accuracy: 0.8232 Test AUROC: 0.9166 Test F1 score: 0.7949
Time to fit symbolic classifier: 43.830246925354004 seconds
SymbolicClassifier
Test accuracy: 0.7486 Test AUROC: 0.7530 Test F1 score: 0.7217
Time to fit symbolic classifier: 43.57986807823181 seconds
SymbolicClassifier
Test accuracy: 0.7790 Test AUROC: 0.8150 Test F1 score: 0.7143
seed=10524

Increase the number of iterations (max_iter) or scale the data as

LogisticRegression
Test accuracy: 0.7514 Test AUROC: 0.8078 Test F1 score: 0.7289
DecisionTreeClassifier
Test accuracy: 0.7265 Test AUROC: 0.7624 Test F1 score: 0.6991
RandomForestClassifier
Test accuracy: 0.8204 Test AUROC: 0.8905 Test F1 score: 0.7883
XGBClassifier
Test accuracy: 0.8260 Test AUROC: 0.8991 Test F1 score: 0.8037
Time to fit symbolic classifier: 43.50123906135559 seconds
SymbolicClassifier
Test accuracy: 0.7514 Test AUROC: 0.7622 Test F1 score: 0.7039
Time to fit symbolic classifier: 43.24294877052307 seconds
SymbolicClassifier
Test accuracy: 0.7486 Test AUROC: 0.7613 Test F1 score: 0.6997
seed=83946

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7403 Test AUROC: 0.7988 Test F1 score: 0.7081
DecisionTreeClassifier
Test accuracy: 0.7238 Test AUROC: 0.7863 Test F1 score: 0.6552
RandomForestClassifier
Test accuracy: 0.8204 Test AUROC: 0.8863 Test F1 score: 0.7855
XGBClassifier
Test accuracy: 0.8287 Test AUROC: 0.9158 Test F1 score: 0.8062
Time to fit symbolic classifier: 43.89677596092224 seconds
SymbolicClassifier
Test accuracy: 0.7459 Test AUROC: 0.7638 Test F1 score: 0.6954
Time to fit symbolic classifier: 42.90026021003723 seconds
SymbolicClassifier
Test accuracy: 0.7348 Test AUROC: 0.7510 Test F1 score: 0.6800
seed=63297
/Users/swagatam/miniconda3/envs/hiwi/lib/python3.10/site-packages/
sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs
failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7762 Test AUROC: 0.8216 Test F1 score: 0.7508
DecisionTreeClassifier
Test accuracy: 0.7845 Test AUROC: 0.8127 Test F1 score: 0.7665
RandomForestClassifier
Test accuracy: 0.8260 Test AUROC: 0.9118 Test F1 score: 0.7948
XGBClassifier
Test accuracy: 0.8564 Test AUROC: 0.9337 Test F1 score: 0.8354
Time to fit symbolic classifier: 43.374119997024536 seconds
SymbolicClassifier
Test accuracy: 0.7652 Test AUROC: 0.7754 Test F1 score: 0.7195
Time to fit symbolic classifier: 43.408135175704956 seconds
SymbolicClassifier
Test accuracy: 0.7652 Test AUROC: 0.7666 Test F1 score: 0.7099
seed=78035

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7735 Test AUROC: 0.8348 Test F1 score: 0.7405
DecisionTreeClassifier
Test accuracy: 0.7403 Test AUROC: 0.7901 Test F1 score: 0.6846
RandomForestClassifier
Test accuracy: 0.8370 Test AUROC: 0.9148 Test F1 score: 0.8115
XGBClassifier
Test accuracy: 0.8536 Test AUROC: 0.9395 Test F1 score: 0.8296
Time to fit symbolic classifier: 43.04957699775696 seconds
SymbolicClassifier
Test accuracy: 0.7541 Test AUROC: 0.7593 Test F1 score: 0.7023
Time to fit symbolic classifier: 42.84919810295105 seconds
SymbolicClassifier
Test accuracy: 0.7762 Test AUROC: 0.8103 Test F1 score: 0.7178
seed=22664

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7790 Test AUROC: 0.8231 Test F1 score: 0.7531
DecisionTreeClassifier
Test accuracy: 0.7155 Test AUROC: 0.8195 Test F1 score: 0.7099
RandomForestClassifier
Test accuracy: 0.8287 Test AUROC: 0.9080 Test F1 score: 0.7987
XGBClassifier
Test accuracy: 0.8453 Test AUROC: 0.9225 Test F1 score: 0.8228
Time to fit symbolic classifier: 43.30705690383911 seconds
SymbolicClassifier
Test accuracy: 0.7514 Test AUROC: 0.7704 Test F1 score: 0.7020
Time to fit symbolic classifier: 44.09287762641907 seconds
SymbolicClassifier
Test accuracy: 0.7431 Test AUROC: 0.7477 Test F1 score: 0.6714
seed=49283

Increase the number of iterations (max_iter) or scale the data as

LogisticRegression
Test accuracy: 0.7541 Test AUROC: 0.8084 Test F1 score: 0.7405
DecisionTreeClassifier
Test accuracy: 0.7293 Test AUROC: 0.7896 Test F1 score: 0.7168
RandomForestClassifier
Test accuracy: 0.8287 Test AUROC: 0.9166 Test F1 score: 0.7947
XGBClassifier
Test accuracy: 0.8398 Test AUROC: 0.9318 Test F1 score: 0.8129
Time to fit symbolic classifier: 42.77116394042969 seconds
SymbolicClassifier
Test accuracy: 0.7569 Test AUROC: 0.7585 Test F1 score: 0.7007
Time to fit symbolic classifier: 45.94197463989258 seconds
SymbolicClassifier
Test accuracy: 0.7597 Test AUROC: 0.7513 Test F1 score: 0.7010
seed=35253

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7569 Test AUROC: 0.8184 Test F1 score: 0.7317
DecisionTreeClassifier
Test accuracy: 0.7514 Test AUROC: 0.8107 Test F1 score: 0.7059
RandomForestClassifier
Test accuracy: 0.8039 Test AUROC: 0.8988 Test F1 score: 0.7717
XGBClassifier
Test accuracy: 0.8094 Test AUROC: 0.9091 Test F1 score: 0.7903
Time to fit symbolic classifier: 44.62361788749695 seconds
SymbolicClassifier
Test accuracy: 0.7210 Test AUROC: 0.7384 Test F1 score: 0.6731
Time to fit symbolic classifier: 44.03053903579712 seconds
SymbolicClassifier
Test accuracy: 0.7403 Test AUROC: 0.7603 Test F1 score: 0.6928
seed=82273
/Users/swagatam/miniconda3/envs/hiwi/lib/python3.10/site-packages/
sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs
failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7293 Test AUROC: 0.7904 Test F1 score: 0.7048
DecisionTreeClassifier
Test accuracy: 0.7403 Test AUROC: 0.7878 Test F1 score: 0.6908
RandomForestClassifier
Test accuracy: 0.8039 Test AUROC: 0.8927 Test F1 score: 0.7609
XGBClassifier
Test accuracy: 0.8425 Test AUROC: 0.9112 Test F1 score: 0.8190
Time to fit symbolic classifier: 45.13826107978821 seconds
SymbolicClassifier
Test accuracy: 0.7486 Test AUROC: 0.7551 Test F1 score: 0.6936
Time to fit symbolic classifier: 44.299291133880615 seconds
SymbolicClassifier
Test accuracy: 0.7486 Test AUROC: 0.7536 Test F1 score: 0.6915
seed=90378

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.7541 Test AUROC: 0.8200 Test F1 score: 0.7343
DecisionTreeClassifier
Test accuracy: 0.7541 Test AUROC: 0.8313 Test F1 score: 0.6877
RandomForestClassifier
Test accuracy: 0.8011 Test AUROC: 0.9094 Test F1 score: 0.7647
XGBClassifier
Test accuracy: 0.8287 Test AUROC: 0.9243 Test F1 score: 0.8025
Time to fit symbolic classifier: 44.41036295890808 seconds
SymbolicClassifier
Test accuracy: 0.7597 Test AUROC: 0.7663 Test F1 score: 0.7031
Time to fit symbolic classifier: 46.336766958236694 seconds
SymbolicClassifier
Test accuracy: 0.7597 Test AUROC: 0.7671 Test F1 score: 0.7031

Process results
# Program length of the final expression of every fitted symbolic classifier.
LENGTHS = {
    'SR': np.asarray([est._program.length_ for est in SRmodels]),
    'SRf': np.asarray([est._program.length_ for est in SRfmodels]),
}

# Turn the metric histories into rounded arrays so mean()/std() are available.
convert_to_arrays()

plusminus = pm = u"\u00B1"
# Report mean and standard deviation per model, first accuracy, then F1.
for metric_label, history in (('acc', ACCURACIES), ('F1', F1SCORES)):
    for model_name in MODEL_NAMES:
        scores = history[model_name]
        print(f"{model_name} {metric_label}: {scores.mean():.2f} {pm} {scores.std():.4f}")
    print('--------')

LR acc: 0.75 ± 0.0232

DT acc: 0.74 ± 0.0238
RF acc: 0.82 ± 0.0152
XG acc: 0.83 ± 0.0146
SR acc: 0.75 ± 0.0107
SRf acc: 0.75 ± 0.0145
--------
LR F1: 0.72 ± 0.0230
DT F1: 0.70 ± 0.0309
RF F1: 0.78 ± 0.0187
XG F1: 0.80 ± 0.0148
SR F1: 0.70 ± 0.0110
SRf F1: 0.69 ± 0.0166
--------
Compare average length of SR models with and without special functions
# Mean program length, vanilla models first, then those with special functions.
for sr_key in ('SR', 'SRf'):
    print(f"{sr_key} mean length: {LENGTHS[sr_key].mean()}")

SR mean length: 19.55

SRf mean length: 14.55

Save symbolic regression models for later use

from srmb.utils import save_sr_models, load_sr_models

# Store both families of fitted symbolic classifiers, each in its own folder.
for fitted_models, sr_key, target_dir in (
        (SRmodels, 'SR', '../results_srmb/sr_vanilla_models/'),
        (SRfmodels, 'SRf', '../results_srmb/sr_special_models/')):
    save_sr_models(fitted_models, key=sr_key, save_dir=target_dir)

Load the models

# Read the stored symbolic classifiers back, vanilla first, then special.
SRMODELS, SRFMODELS = (
    load_sr_models(sr_key, save_dir=source_dir)
    for sr_key, source_dir in (
        ('SR', '../results_srmb/sr_vanilla_models/'),
        ('SRf', '../results_srmb/sr_special_models/'),
    )
)

# SRMODELS

Ablation: what if we use the entire imbalanced dataset?
# One seed per trial (20 trials in total).
RANDOM_SEEDS_FOR_UNDERSAMPLING = [
    42, 2024, 1234, 2405, 11, 9345, 858, 8590, 4754, 1959,
    707, 10524, 83946, 63297, 78035, 22664, 49283, 35253, 82273, 90378,
]
MODEL_NAMES = ['LR', 'DT', 'RF', 'XG', 'SR', 'SRf']

# Start from empty metric histories; each model gets its own fresh list.
ACCURACIES, F1SCORES, AUCROCS = (
    {name: [] for name in MODEL_NAMES} for _ in range(3)
)

# Fitted symbolic classifiers, one per trial (vanilla / with special functions).
SRmodels, SRfmodels = [], []

USE_BALANCED_SUBSAMPLE = False  # perform undersampling of healthy classes

# Settings that do not change between trials are built once, outside the loop.
vanilla_function_set = ['add', 'sub', 'mul', 'div', 'neg', 'max', 'min',
                        'sqrt', 'log']
special_functions = [presence, absence, presence2, absence2,
                     ifelse]  # , add3, add10]
special_function_set = (['add', 'sub', 'mul', 'div', 'neg', 'max', 'min']
                        + special_functions)

# Configuration shared by both symbolic classifiers; only the function set differs.
sr_common_kwargs = dict(
    population_size=6000,
    generations=20,
    tournament_size=25,
    init_depth=(2, 6),
    const_range=(0., 100.),
    # init_method="full",
    parsimony_coefficient=0.001,
    stopping_criteria=1.0,
    metric=customacc,  # use custom acc as fitness
    feature_names=X1.columns.to_list(),
    # verbose=True,
    random_state=42,
)

for random_state in RANDOM_SEEDS_FOR_UNDERSAMPLING:
    print(f'seed={random_state}')

    if not USE_BALANCED_SUBSAMPLE:
        # Full imbalanced data. Stratification and class weights must come
        # from y1 (the labels actually being split): the cell previously used
        # y1b, a leftover of the undersampling experiment with a different
        # length. The split is seeded per trial; a constant seed would make
        # all 20 trials identical.
        X_train, X_test, y_train, y_test = train_test_split(
            X1, y1,
            test_size=0.25,
            # train_size=0.5,  # if slow use this
            random_state=random_state, stratify=y1)
        class_weight = compute_class_weight(class_weight='balanced',
                                            classes=np.unique(y1), y=y1)
    else:
        # Same undersampling path as the first experiment, so that flipping
        # the flag does not leave the train/test variables undefined.
        rus = RandomUnderSampler(sampling_strategy=0.85,  # this is another hyperparameter
                                 random_state=random_state)
        X1b, y1b = rus.fit_resample(X1, y1)
        X_train, X_test, y_train, y_test = train_test_split(
            X1b, y1b,
            test_size=0.25,
            # train_size=0.5,  # if slow use this
            random_state=42, stratify=y1b)
        class_weight = compute_class_weight(class_weight='balanced',
                                            classes=np.unique(y1b), y=y1b)

    # Labels are 0/1, so enumerating the weights yields the {label: weight} mapping.
    class_weight_map = dict(enumerate(class_weight))
    sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

    # logistic regression classifier
    model_lr = LogisticRegression(max_iter=500, class_weight=class_weight_map)
    model_lr.fit(X_train, y_train)
    store_result('LR', *calculate_metrics(model_lr, X_train, y_train,
                                          X_test, y_test))

    # Decision tree classifier
    model_dt = DecisionTreeClassifier(max_depth=5, class_weight=class_weight_map)
    model_dt.fit(X_train, y_train)
    store_result('DT', *calculate_metrics(model_dt, X_train, y_train,
                                          X_test, y_test))

    # Random forest classifier (restored; same settings as the first experiment)
    model_rf = RandomForestClassifier(n_estimators=50, class_weight=class_weight_map)
    model_rf.fit(X_train, y_train)
    store_result('RF', *calculate_metrics(model_rf, X_train, y_train,
                                          X_test, y_test))

    # XGBoost classifier with a binary objective (restored; same settings as
    # the first experiment), balanced through per-sample weights.
    model_xg = XGBClassifier(n_estimators=50, max_depth=5,
                             learning_rate=0.1, objective='binary:logistic')
    model_xg.fit(X_train, y_train, sample_weight=sample_weights)
    store_result('XG', *calculate_metrics(model_xg, X_train, y_train,
                                          X_test, y_test))

    # Symbolic classifiers: vanilla function set first ('SR'), then the one
    # extended with the special functions ('SRf'). They were previously only
    # constructed here, never fitted, scored or stored.
    for sr_key, function_set, fitted_models in (
            ('SR', vanilla_function_set, SRmodels),
            ('SRf', special_function_set, SRfmodels)):
        est = SymbolicClassifier(function_set=function_set, **sr_common_kwargs)

        t0 = time.time()
        est.fit(X_train, y_train)
        print('Time to fit symbolic classifier:', time.time() - t0, 'seconds')
        store_result(sr_key, *calculate_metrics(est, X_train, y_train,
                                                X_test, y_test))
        fitted_models.append(est)

seed=42

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8467 Test AUROC: 0.8193 Test F1 score: 0.3380
DecisionTreeClassifier
Test accuracy: 0.8700 Test AUROC: 0.8443 Test F1 score: 0.3926
RandomForestClassifier
Test accuracy: 0.9490 Test AUROC: 0.9315 Test F1 score: 0.2526
XGBClassifier
Test accuracy: 0.9346 Test AUROC: 0.9426 Test F1 score: 0.5806
Time to fit symbolic classifier: 67.33232116699219 seconds
SymbolicClassifier
Test accuracy: 0.9411 Test AUROC: 0.6109 Test F1 score: 0.3223
Time to fit symbolic classifier: 65.69597911834717 seconds
SymbolicClassifier
Test accuracy: 0.9447 Test AUROC: 0.5897 Test F1 score: 0.2870
seed=2024

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8492 Test AUROC: 0.8519 Test F1 score: 0.3558
DecisionTreeClassifier
Test accuracy: 0.7745 Test AUROC: 0.8317 Test F1 score: 0.2991
RandomForestClassifier
Test accuracy: 0.9504 Test AUROC: 0.9401 Test F1 score: 0.3030
XGBClassifier
Test accuracy: 0.9318 Test AUROC: 0.9497 Test F1 score: 0.5662
Time to fit symbolic classifier: 66.04439783096313 seconds
SymbolicClassifier
Test accuracy: 0.9479 Test AUROC: 0.6489 Test F1 score: 0.4130
Time to fit symbolic classifier: 67.61290097236633 seconds
SymbolicClassifier
Test accuracy: 0.9483 Test AUROC: 0.6368 Test F1 score: 0.3950
seed=1234
/Users/swagatam/miniconda3/envs/hiwi/lib/python3.10/site-packages/
sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs
failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8492 Test AUROC: 0.8427 Test F1 score: 0.3458
DecisionTreeClassifier
Test accuracy: 0.8205 Test AUROC: 0.8374 Test F1 score: 0.3225
RandomForestClassifier
Test accuracy: 0.9508 Test AUROC: 0.9278 Test F1 score: 0.3184
XGBClassifier
Test accuracy: 0.9307 Test AUROC: 0.9266 Test F1 score: 0.5522
Time to fit symbolic classifier: 64.78528308868408 seconds
SymbolicClassifier
Test accuracy: 0.9436 Test AUROC: 0.6266 Test F1 score: 0.3592
Time to fit symbolic classifier: 63.20682096481323 seconds
SymbolicClassifier
Test accuracy: 0.9436 Test AUROC: 0.6266 Test F1 score: 0.3592
seed=2405

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8478 Test AUROC: 0.8632 Test F1 score: 0.3653
DecisionTreeClassifier
Test accuracy: 0.8014 Test AUROC: 0.8131 Test F1 score: 0.3061
RandomForestClassifier
Test accuracy: 0.9501 Test AUROC: 0.9250 Test F1 score: 0.2798
XGBClassifier
Test accuracy: 0.9278 Test AUROC: 0.9367 Test F1 score: 0.5483
Time to fit symbolic classifier: 67.03891324996948 seconds
SymbolicClassifier
Test accuracy: 0.9479 Test AUROC: 0.6488 Test F1 score: 0.4130
Time to fit symbolic classifier: 64.79723119735718 seconds
SymbolicClassifier
Test accuracy: 0.9479 Test AUROC: 0.6488 Test F1 score: 0.4130
seed=11

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8442 Test AUROC: 0.8298 Test F1 score: 0.3344
DecisionTreeClassifier
Test accuracy: 0.7874 Test AUROC: 0.8179 Test F1 score: 0.2936
RandomForestClassifier
Test accuracy: 0.9519 Test AUROC: 0.9456 Test F1 score: 0.3431
XGBClassifier
Test accuracy: 0.9314 Test AUROC: 0.9412 Test F1 score: 0.5708
Time to fit symbolic classifier: 68.0005111694336 seconds
SymbolicClassifier
Test accuracy: 0.9458 Test AUROC: 0.5620 Test F1 score: 0.2176
Time to fit symbolic classifier: 65.93679213523865 seconds
SymbolicClassifier
Test accuracy: 0.9504 Test AUROC: 0.6351 Test F1 score: 0.4000
seed=9345

Increase the number of iterations (max_iter) or scale the data as

LogisticRegression
Test accuracy: 0.8560 Test AUROC: 0.8329 Test F1 score: 0.3543
DecisionTreeClassifier
Test accuracy: 0.8039 Test AUROC: 0.8499 Test F1 score: 0.3175
RandomForestClassifier
Test accuracy: 0.9476 Test AUROC: 0.9272 Test F1 score: 0.2316
XGBClassifier
Test accuracy: 0.9332 Test AUROC: 0.9536 Test F1 score: 0.5830
Time to fit symbolic classifier: 63.743890047073364 seconds
SymbolicClassifier
Test accuracy: 0.9469 Test AUROC: 0.5684 Test F1 score: 0.2371
Time to fit symbolic classifier: 67.9835159778595 seconds
SymbolicClassifier
Test accuracy: 0.9472 Test AUROC: 0.6024 Test F1 score: 0.3226
seed=858

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8474 Test AUROC: 0.8279 Test F1 score: 0.3472
DecisionTreeClassifier
Test accuracy: 0.8126 Test AUROC: 0.7509 Test F1 score: 0.2965
RandomForestClassifier
Test accuracy: 0.9508 Test AUROC: 0.9173 Test F1 score: 0.3046
XGBClassifier
Test accuracy: 0.9364 Test AUROC: 0.9368 Test F1 score: 0.5755
Time to fit symbolic classifier: 65.58412313461304 seconds
SymbolicClassifier
Test accuracy: 0.9422 Test AUROC: 0.6175 Test F1 score: 0.3374
Time to fit symbolic classifier: 67.61177802085876 seconds
SymbolicClassifier
Test accuracy: 0.9422 Test AUROC: 0.6175 Test F1 score: 0.3374
seed=8590
/Users/swagatam/miniconda3/envs/hiwi/lib/python3.10/site-packages/
sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs
failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8445 Test AUROC: 0.8175 Test F1 score: 0.3224
DecisionTreeClassifier
Test accuracy: 0.7318 Test AUROC: 0.8034 Test F1 score: 0.2537
RandomForestClassifier
Test accuracy: 0.9479 Test AUROC: 0.9159 Test F1 score: 0.2408
XGBClassifier
Test accuracy: 0.9264 Test AUROC: 0.9206 Test F1 score: 0.5060
Time to fit symbolic classifier: 69.19223213195801 seconds
SymbolicClassifier
Test accuracy: 0.9429 Test AUROC: 0.5949 Test F1 score: 0.2933
Time to fit symbolic classifier: 73.34092903137207 seconds
SymbolicClassifier
Test accuracy: 0.9451 Test AUROC: 0.5956 Test F1 score: 0.3014
seed=4754

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8571 Test AUROC: 0.8440 Test F1 score: 0.3539
DecisionTreeClassifier
Test accuracy: 0.8226 Test AUROC: 0.8046 Test F1 score: 0.3270
RandomForestClassifier
Test accuracy: 0.9483 Test AUROC: 0.9139 Test F1 score: 0.2500
XGBClassifier
Test accuracy: 0.9329 Test AUROC: 0.9465 Test F1 score: 0.5600
Time to fit symbolic classifier: 68.17389011383057 seconds
SymbolicClassifier
Test accuracy: 0.9454 Test AUROC: 0.6299 Test F1 score: 0.3719
Time to fit symbolic classifier: 68.23464798927307 seconds
SymbolicClassifier
Test accuracy: 0.9476 Test AUROC: 0.5997 Test F1 score: 0.3178
seed=1959

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8524 Test AUROC: 0.8342 Test F1 score: 0.3568
DecisionTreeClassifier
Test accuracy: 0.8779 Test AUROC: 0.8453 Test F1 score: 0.4178
RandomForestClassifier
Test accuracy: 0.9494 Test AUROC: 0.9401 Test F1 score: 0.2694
XGBClassifier
Test accuracy: 0.9411 Test AUROC: 0.9409 Test F1 score: 0.6186
Time to fit symbolic classifier: 73.17940306663513 seconds
SymbolicClassifier
Test accuracy: 0.9483 Test AUROC: 0.6459 Test F1 score: 0.4098
Time to fit symbolic classifier: 68.82048106193542 seconds
SymbolicClassifier
Test accuracy: 0.9490 Test AUROC: 0.6202 Test F1 score: 0.3661
seed=707

Increase the number of iterations (max_iter) or scale the data as

LogisticRegression
Test accuracy: 0.8607 Test AUROC: 0.8348 Test F1 score: 0.3762
DecisionTreeClassifier
Test accuracy: 0.7415 Test AUROC: 0.8041 Test F1 score: 0.2608
RandomForestClassifier
Test accuracy: 0.9504 Test AUROC: 0.9217 Test F1 score: 0.3100
XGBClassifier
Test accuracy: 0.9278 Test AUROC: 0.9294 Test F1 score: 0.5421
Time to fit symbolic classifier: 69.46331691741943 seconds
SymbolicClassifier
Test accuracy: 0.9465 Test AUROC: 0.6451 Test F1 score: 0.4016
Time to fit symbolic classifier: 69.6081268787384 seconds
SymbolicClassifier
Test accuracy: 0.9497 Test AUROC: 0.6460 Test F1 score: 0.4167
seed=10524

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8467 Test AUROC: 0.8328 Test F1 score: 0.3421
DecisionTreeClassifier
Test accuracy: 0.7910 Test AUROC: 0.8272 Test F1 score: 0.3038
RandomForestClassifier
Test accuracy: 0.9487 Test AUROC: 0.9240 Test F1 score: 0.2741
XGBClassifier
Test accuracy: 0.9278 Test AUROC: 0.9417 Test F1 score: 0.5543
Time to fit symbolic classifier: 69.89840602874756 seconds
SymbolicClassifier
Test accuracy: 0.9476 Test AUROC: 0.6452 Test F1 score: 0.4065
Time to fit symbolic classifier: 74.91070580482483 seconds
SymbolicClassifier
Test accuracy: 0.9483 Test AUROC: 0.6312 Test F1 score: 0.3846
seed=83946
/Users/swagatam/miniconda3/envs/hiwi/lib/python3.10/site-packages/
sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs
failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8560 Test AUROC: 0.8386 Test F1 score: 0.3665
DecisionTreeClassifier
Test accuracy: 0.8086 Test AUROC: 0.8499 Test F1 score: 0.3227
RandomForestClassifier
Test accuracy: 0.9540 Test AUROC: 0.9249 Test F1 score: 0.3786
XGBClassifier
Test accuracy: 0.9203 Test AUROC: 0.9442 Test F1 score: 0.5216
Time to fit symbolic classifier: 69.23418998718262 seconds
SymbolicClassifier
Test accuracy: 0.9447 Test AUROC: 0.6325 Test F1 score: 0.3740
Time to fit symbolic classifier: 68.76166296005249 seconds
SymbolicClassifier
Test accuracy: 0.9447 Test AUROC: 0.6325 Test F1 score: 0.3740
seed=63297

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8481 Test AUROC: 0.8317 Test F1 score: 0.3542
DecisionTreeClassifier
Test accuracy: 0.8061 Test AUROC: 0.8330 Test F1 score: 0.3095
RandomForestClassifier
Test accuracy: 0.9490 Test AUROC: 0.9349 Test F1 score: 0.2680
XGBClassifier
Test accuracy: 0.9336 Test AUROC: 0.9520 Test F1 score: 0.5708
Time to fit symbolic classifier: 68.85216999053955 seconds
SymbolicClassifier
Test accuracy: 0.9454 Test AUROC: 0.6216 Test F1 score: 0.3559
Time to fit symbolic classifier: 68.83684873580933 seconds
SymbolicClassifier
Test accuracy: 0.9479 Test AUROC: 0.6112 Test F1 score: 0.3439
seed=78035

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8370 Test AUROC: 0.8402 Test F1 score: 0.3477
DecisionTreeClassifier
Test accuracy: 0.8456 Test AUROC: 0.8096 Test F1 score: 0.3302
RandomForestClassifier
Test accuracy: 0.9522 Test AUROC: 0.9357 Test F1 score: 0.3448
XGBClassifier
Test accuracy: 0.9268 Test AUROC: 0.9450 Test F1 score: 0.5565
Time to fit symbolic classifier: 75.26121234893799 seconds
SymbolicClassifier
Test accuracy: 0.9479 Test AUROC: 0.6688 Test F1 score: 0.4444
Time to fit symbolic classifier: 69.09115386009216 seconds
SymbolicClassifier
Test accuracy: 0.9501 Test AUROC: 0.6265 Test F1 score: 0.3822
seed=22664

Increase the number of iterations (max_iter) or scale the data as

LogisticRegression
Test accuracy: 0.8445 Test AUROC: 0.8247 Test F1 score: 0.3389
DecisionTreeClassifier
Test accuracy: 0.7372 Test AUROC: 0.8404 Test F1 score: 0.2591
RandomForestClassifier
Test accuracy: 0.9497 Test AUROC: 0.9347 Test F1 score: 0.2857
XGBClassifier
Test accuracy: 0.9314 Test AUROC: 0.9377 Test F1 score: 0.5649
Time to fit symbolic classifier: 69.37272310256958 seconds
SymbolicClassifier
Test accuracy: 0.9422 Test AUROC: 0.6203 Test F1 score: 0.3429
Time to fit symbolic classifier: 69.08472180366516 seconds
SymbolicClassifier
Test accuracy: 0.9422 Test AUROC: 0.6203 Test F1 score: 0.3429
seed=49283

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8718 Test AUROC: 0.8807 Test F1 score: 0.4138
DecisionTreeClassifier
Test accuracy: 0.7540 Test AUROC: 0.8027 Test F1 score: 0.2736
RandomForestClassifier
Test accuracy: 0.9522 Test AUROC: 0.9192 Test F1 score: 0.3383
XGBClassifier
Test accuracy: 0.9239 Test AUROC: 0.9314 Test F1 score: 0.5330
Time to fit symbolic classifier: 69.41814804077148 seconds
SymbolicClassifier
Test accuracy: 0.9472 Test AUROC: 0.6570 Test F1 score: 0.4235
Time to fit symbolic classifier: 68.55552625656128 seconds
SymbolicClassifier
Test accuracy: 0.9479 Test AUROC: 0.6225 Test F1 score: 0.3668
seed=35253
/Users/swagatam/miniconda3/envs/hiwi/lib/python3.10/site-packages/
sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs
failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8348 Test AUROC: 0.8110 Test F1 score: 0.3195
DecisionTreeClassifier
Test accuracy: 0.7907 Test AUROC: 0.8045 Test F1 score: 0.2899
RandomForestClassifier
Test accuracy: 0.9483 Test AUROC: 0.9151 Test F1 score: 0.2500
XGBClassifier
Test accuracy: 0.9293 Test AUROC: 0.9249 Test F1 score: 0.5343
Time to fit symbolic classifier: 75.61543703079224 seconds
SymbolicClassifier
Test accuracy: 0.9451 Test AUROC: 0.5476 Test F1 score: 0.1730
Time to fit symbolic classifier: 68.48590612411499 seconds
SymbolicClassifier
Test accuracy: 0.9461 Test AUROC: 0.5877 Test F1 score: 0.2857
seed=82273

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8531 Test AUROC: 0.8229 Test F1 score: 0.3599
DecisionTreeClassifier
Test accuracy: 0.8456 Test AUROC: 0.8416 Test F1 score: 0.3364
RandomForestClassifier
Test accuracy: 0.9476 Test AUROC: 0.9164 Test F1 score: 0.2551
XGBClassifier
Test accuracy: 0.9268 Test AUROC: 0.9287 Test F1 score: 0.5256
Time to fit symbolic classifier: 68.64090394973755 seconds
SymbolicClassifier
Test accuracy: 0.9422 Test AUROC: 0.6260 Test F1 score: 0.3534
Time to fit symbolic classifier: 68.7185320854187 seconds
SymbolicClassifier
Test accuracy: 0.9436 Test AUROC: 0.6005 Test F1 score: 0.3084
seed=90378

Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression
Test accuracy: 0.8657 Test AUROC: 0.8305 Test F1 score: 0.3746
DecisionTreeClassifier
Test accuracy: 0.7899 Test AUROC: 0.7964 Test F1 score: 0.2875
RandomForestClassifier
Test accuracy: 0.9490 Test AUROC: 0.9247 Test F1 score: 0.2680
XGBClassifier
Test accuracy: 0.9311 Test AUROC: 0.9397 Test F1 score: 0.5450
Time to fit symbolic classifier: 68.76493072509766 seconds
SymbolicClassifier
Test accuracy: 0.9465 Test AUROC: 0.6394 Test F1 score: 0.3918
Time to fit symbolic classifier: 67.91902613639832 seconds
SymbolicClassifier
Test accuracy: 0.9487 Test AUROC: 0.6257 Test F1 score: 0.3755

# Program length of every fitted symbolic classifier, grouped by variant
# ('SR' = plain function set, 'SRf' = with special functions), stored as
# NumPy arrays so that .mean() can be called on them afterwards.
LENGTHS = {
    label: np.asarray([model._program.length_ for model in fitted_models])
    for label, fitted_models in (('SR', SRmodels), ('SRf', SRfmodels))
}

# convert_to_arrays() is defined elsewhere; the .mean()/.std() calls below
# rely on the per-model entries being arrays afterwards (presumably that is
# what it does — confirm against its definition).
convert_to_arrays()
plusminus = pm = "\u00B1"

# Print "<model> <metric>: mean ± std" for accuracy first, then F1, with a
# separator line after each group.
for metric_label, scores_by_model in (("acc", ACCURACIES), ("F1", F1SCORES)):
    for model_name in MODEL_NAMES:
        scores = scores_by_model[model_name]
        print(f"{model_name} {metric_label}: {scores.mean():.2f} {pm} {scores.std():.4f}")
    print('--------')

LR acc: 0.85 ± 0.0087

DT acc: 0.80 ± 0.0397
RF acc: 0.95 ± 0.0017
XG acc: 0.93 ± 0.0045
SR acc: 0.95 ± 0.0022
SRf acc: 0.95 ± 0.0025
--------
LR F1: 0.35 ± 0.0202
DT F1: 0.31 ± 0.0395
RF F1: 0.29 ± 0.0393
XG F1: 0.56 ± 0.0248
SR F1: 0.35 ± 0.0708
SRf F1: 0.35 ± 0.0393
--------

# LENGTHS['SR'].mean(), LENGTHS['SRf'].mean()

# Report the average program length of each symbolic-classifier variant.
for variant in ('SR', 'SRf'):
    print(f"{variant} mean length: {LENGTHS[variant].mean()}")

SR mean length: 1.6

SRf mean length: 4.0

Conclusion: The results above show that the symbolic regression models (SR and SRf) reach F1 scores of
only about 0.35, well below 0.5, i.e., their classification performance is very poor despite an accuracy
of about 0.95. Hence, for a fairer comparison among the classifiers, we do not use the imbalanced data in our experiments.

Aiml Ex 4-7
No ratings yet
Aiml Ex 4-7
8 pages
16BCB0126 VL2018195002535 Pe003
No ratings yet
16BCB0126 VL2018195002535 Pe003
40 pages
ML Lab
No ratings yet
ML Lab
7 pages
Slip
No ratings yet
Slip
5 pages
I Avaliação Parcial - 25.0 PTS - Gabarito
No ratings yet
I Avaliação Parcial - 25.0 PTS - Gabarito
9 pages
ML Lab Manual
No ratings yet
ML Lab Manual
12 pages
All in One
No ratings yet
All in One
13 pages
Prakhar - Week 5
No ratings yet
Prakhar - Week 5
8 pages
Practicalpgm ML
No ratings yet
Practicalpgm ML
33 pages
LAB-4 Report
No ratings yet
LAB-4 Report
21 pages
AI ML - Cycle 2 Programs
No ratings yet
AI ML - Cycle 2 Programs
15 pages
MlLabManualdocx 2024 09 04 22 02 58
No ratings yet
MlLabManualdocx 2024 09 04 22 02 58
19 pages
Ai Int-1
No ratings yet
Ai Int-1
6 pages
Bacdeaf 23032025 115708 Split 1
No ratings yet
Bacdeaf 23032025 115708 Split 1
37 pages
1
No ratings yet
1
13 pages
ML Lab Codes
No ratings yet
ML Lab Codes
14 pages
ML - LAB - 7 - Jupyter Notebook
100% (1)
ML - LAB - 7 - Jupyter Notebook
7 pages
Assignment 5 - SourceCode - Ipynb - Colab
No ratings yet
Assignment 5 - SourceCode - Ipynb - Colab
4 pages
ML PDF
No ratings yet
ML PDF
30 pages
EX - NO:3: Algorithm
No ratings yet
EX - NO:3: Algorithm
11 pages
Final ML Programs 075005
No ratings yet
Final ML Programs 075005
15 pages
St. John College of Engineering and Management, Palghar - Maharashtra
No ratings yet
St. John College of Engineering and Management, Palghar - Maharashtra
11 pages
ML Journal External
No ratings yet
ML Journal External
14 pages
Shobit Sharma (2124399) ML Lab File PDF
No ratings yet
Shobit Sharma (2124399) ML Lab File PDF
19 pages
Programs Lab Bca
No ratings yet
Programs Lab Bca
16 pages
Unit2 ML Programs
No ratings yet
Unit2 ML Programs
7 pages
Amll
No ratings yet
Amll
1 page
Aiml Practicals
No ratings yet
Aiml Practicals
22 pages
Mlda - Lab
No ratings yet
Mlda - Lab
35 pages
Machine Learning Algorithms From Scratch
No ratings yet
Machine Learning Algorithms From Scratch
9 pages
Machine
100% (1)
Machine
45 pages
Additional Program
No ratings yet
Additional Program
573 pages
Import As From Import From Import From Import From Import From Import From Import From Import From Import From Import From Import Import As
No ratings yet
Import As From Import From Import From Import From Import From Import From Import From Import From Import From Import From Import Import As
8 pages
Aiml Lab
No ratings yet
Aiml Lab
14 pages
SPPUML5
No ratings yet
SPPUML5
4 pages
ML Lab P-1
No ratings yet
ML Lab P-1
10 pages
Aiml 5-8
No ratings yet
Aiml 5-8
19 pages
1 KNN - Jupyter Notebook
No ratings yet
1 KNN - Jupyter Notebook
3 pages
Allcodesml 2
No ratings yet
Allcodesml 2
10 pages
Deep Learning Perceptron
No ratings yet
Deep Learning Perceptron
10 pages
Data Mining Assignment No. 1
No ratings yet
Data Mining Assignment No. 1
7 pages
Assignment #1: K Nearest Neighbor Classifier: Name: Srikanth Mujjiga (Roll No: 2015-50-831
No ratings yet
Assignment #1: K Nearest Neighbor Classifier: Name: Srikanth Mujjiga (Roll No: 2015-50-831
8 pages
Program
No ratings yet
Program
19 pages
Ann Experiential Learning
No ratings yet
Ann Experiential Learning
43 pages
ML Lab 01999676272
No ratings yet
ML Lab 01999676272
12 pages
ML
No ratings yet
ML
11 pages
Code
No ratings yet
Code
5 pages
Da Lab Mannual
No ratings yet
Da Lab Mannual
25 pages
DWDM Lab 3
No ratings yet
DWDM Lab 3
10 pages
ML Lab PT
No ratings yet
ML Lab PT
25 pages
Logistic Regression
No ratings yet
Logistic Regression
3 pages
Minor Lab
No ratings yet
Minor Lab
4 pages
Lab-5 Report
No ratings yet
Lab-5 Report
11 pages
Dsbda 10
No ratings yet
Dsbda 10
5 pages
Untitled Document
No ratings yet
Untitled Document
6 pages
Assignment 3
No ratings yet
Assignment 3
9 pages
Wa0003
No ratings yet
Wa0003
16 pages
Classification Review
No ratings yet
Classification Review
8 pages
Setup: This Notebook Contains All The Sample Code and Solutions To The Exercises in Chapter 3
No ratings yet
Setup: This Notebook Contains All The Sample Code and Solutions To The Exercises in Chapter 3
30 pages
Profound Python Data Science
From Everand
Profound Python Data Science
Onder Teker
No ratings yet
Unit2 Optimizer
No ratings yet
Unit2 Optimizer
18 pages
Numerical Optimization: Lecture Notes #18 Quasi-Newton Methods - The BFGS Method
No ratings yet
Numerical Optimization: Lecture Notes #18 Quasi-Newton Methods - The BFGS Method
24 pages
A Review of Closed-Loop Reservoir Management: Jian Hou Kang Zhou Xian-Song Zhang Xiao-Dong Kang Hai Xie
No ratings yet
A Review of Closed-Loop Reservoir Management: Jian Hou Kang Zhou Xian-Song Zhang Xiao-Dong Kang Hai Xie
15 pages
A Limited T,: Memory Algorithm For Bound Constrained T, T
No ratings yet
A Limited T,: Memory Algorithm For Bound Constrained T, T
19 pages
Shallow Parsing With Conditional Random Fields
No ratings yet
Shallow Parsing With Conditional Random Fields
8 pages
Adjoint Optimization: On State Constraint and Second Order Adjoint Computation
No ratings yet
Adjoint Optimization: On State Constraint and Second Order Adjoint Computation
50 pages
Inteligencia Preguntas
No ratings yet
Inteligencia Preguntas
7 pages
Greenplum Text Analytics
No ratings yet
Greenplum Text Analytics
5 pages
Wiki Lbfgs
No ratings yet
Wiki Lbfgs
6 pages
2019 Adversarial Examples in Modern Machine Learning - A Review
No ratings yet
2019 Adversarial Examples in Modern Machine Learning - A Review
97 pages
AuthorVersion PublishedTuningHyperparameters
No ratings yet
AuthorVersion PublishedTuningHyperparameters
30 pages
L-BFGS Algorithm
No ratings yet
L-BFGS Algorithm
4 pages
Cs294a 2011 Assignment
No ratings yet
Cs294a 2011 Assignment
5 pages
Bagging-Based Logistic Regression With Spark A Medical Data Mining Method
No ratings yet
Bagging-Based Logistic Regression With Spark A Medical Data Mining Method
7 pages
cs231n Training Neural Networks II
No ratings yet
cs231n Training Neural Networks II
99 pages
Modern Numerical Nonlinear Optimization (Neculai Andrei) PDF
No ratings yet
Modern Numerical Nonlinear Optimization (Neculai Andrei) PDF
824 pages
Lecture 3
No ratings yet
Lecture 3
105 pages
PolyCube Hex Meshing
No ratings yet
PolyCube Hex Meshing
14 pages
Soft Computing
No ratings yet
Soft Computing
39 pages
Energy Minimization
No ratings yet
Energy Minimization
2 pages
Deep Learning
No ratings yet
Deep Learning
4 pages
Unit 2 (Second Order Methods)
No ratings yet
Unit 2 (Second Order Methods)
9 pages
Linear Learning With Allreduce: John Langford (With Help From Many)
No ratings yet
Linear Learning With Allreduce: John Langford (With Help From Many)
33 pages
ML Week 3 Logistic Regression
60% (10)
ML Week 3 Logistic Regression
6 pages
TeraChem User Guide v1.0
No ratings yet
TeraChem User Guide v1.0
18 pages
Wolfram Mathematica Tutorial Collection
No ratings yet
Wolfram Mathematica Tutorial Collection
38 pages
Liu 1989
No ratings yet
Liu 1989
26 pages
Quasi Newton Methods
No ratings yet
Quasi Newton Methods
17 pages
Machine Learning - Home - Coursera Quiz PDF
100% (1)
Machine Learning - Home - Coursera Quiz PDF
5 pages
Survey TPAMI 2023 Preprint
No ratings yet
Survey TPAMI 2023 Preprint
20 pages