0% found this document useful (0 votes)
10 views4 pages

Notebook - Main Code

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
10 views4 pages

Notebook - Main Code

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 4

import os

import argparse
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

def _parse_args():
    """Parse the command-line arguments of the training script.

    ``--data`` and ``--registered_model_name`` are mandatory: both are
    passed to calls that cannot work with ``None`` (``pd.read_csv`` and
    ``os.path.join``), so a missing value is rejected up front instead of
    failing midway through the run.

    Returns:
        argparse.Namespace with ``data``, ``test_train_ratio``,
        ``criterion``, ``random_state`` and ``registered_model_name``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data", type=str, required=True, help="path to input data"
    )
    parser.add_argument("--test_train_ratio", type=float, default=0.25)
    parser.add_argument("--criterion", default="gini", type=str)
    parser.add_argument("--random_state", type=int)
    parser.add_argument(
        "--registered_model_name", type=str, required=True, help="model name"
    )
    return parser.parse_args()


def _clean_dataset(dataset):
    """Return a cleaned copy of the raw frame; the input is left untouched.

    Steps, in order:
      1. every column whose dtype is not ``int64`` is cast to ``str`` and
         has leading/trailing tab characters stripped;
      2. the value ``' yes'`` becomes ``'yes'`` everywhere, ``'no'`` becomes
         ``'notckd'`` in the ``class`` column, and ``'?'`` becomes NaN;
      3. each column is converted with ``pd.to_numeric`` when possible and
         kept unchanged otherwise.

    Args:
        dataset: DataFrame as read from the input CSV; must contain a
            ``class`` column.

    Returns:
        The cleaned DataFrame.
    """
    cleaned = dataset.copy()

    # Removing extra '\t' from the data. Columns are looked up by label so
    # the loop adapts to however many columns the file actually has.
    for column in cleaned.columns:
        if cleaned[column].dtype != "int64":
            cleaned[column] = cleaned[column].astype("str").str.strip("\t")

    # Some additional data cleaning.
    cleaned = cleaned.replace(" yes", "yes")
    # Assign the result back: an in-place replace on a column selection does
    # not propagate to the frame when pandas Copy-on-Write is active.
    cleaned["class"] = cleaned["class"].replace("no", "notckd")
    cleaned = cleaned.replace("?", np.nan)

    # Soft conversion: make a column numeric wherever possible, leaving
    # unconvertible columns unchanged.
    for column in cleaned.columns:
        try:
            cleaned[column] = pd.to_numeric(cleaned[column])
        except (ValueError, TypeError):
            # Column holds values that cannot be parsed as numbers.
            continue

    return cleaned


def _encode_features(features):
    """Label-encode every column of ``features`` and return the matrix.

    Args:
        features: DataFrame of input columns (label column removed).

    Returns:
        numpy array with the same shape as ``features`` in which each
        column has been replaced by its ``LabelEncoder`` codes.
    """
    encoder = preprocessing.LabelEncoder()
    # Work on an explicit copy so the column-wise writes below never hit a
    # view of the frame's own data.
    matrix = features.to_numpy(copy=True)
    for idx in range(matrix.shape[1]):
        # The encoder is refit for each column; only the codes are kept.
        matrix[:, idx] = encoder.fit_transform(matrix[:, idx])
    return matrix


def main():
    """Main function of the script.

    Reads the CSV given by ``--data``, cleans and label-encodes it, trains a
    ``DecisionTreeClassifier`` on a train/test split, prints a
    classification report, then logs, registers and saves the model through
    MLflow. Everything runs inside a single MLflow run.
    """
    # input and output arguments
    args = _parse_args()

    # Start Logging. The context manager ends the run even if a later step
    # raises, so a failed job does not leave the run open.
    with mlflow.start_run():
        # enable autologging
        mlflow.sklearn.autolog()

        ###################
        # <prepare the data>
        ###################
        print(" ".join(f"{k}={v}" for k, v in vars(args).items()))
        print("input data:", args.data)

        dataset = pd.read_csv(args.data)

        mlflow.log_metric("num_samples", dataset.shape[0])
        # One column is the label, hence the "- 1".
        mlflow.log_metric("num_features", dataset.shape[1] - 1)

        dataset = _clean_dataset(dataset)

        X = _encode_features(dataset.drop("class", axis="columns"))
        y = dataset["class"]

        # Split train and test datasets
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=args.test_train_ratio,
            random_state=args.random_state,
        )
        ####################
        # </prepare the data>
        ####################

        ##################
        # <train the model>
        ##################
        print(f"Training with data of shape {X_train.shape}")

        decision_tree = DecisionTreeClassifier(
            criterion=args.criterion, random_state=args.random_state
        )
        decision_tree.fit(X_train, y_train)

        y_pred = decision_tree.predict(X_test)

        print(classification_report(y_test, y_pred))
        ###################
        # </train the model>
        ###################

        ##########################
        # <save and register model>
        ##########################
        # Registering the model to the workspace
        print("Registering the model via MLFlow")
        mlflow.sklearn.log_model(
            sk_model=decision_tree,
            registered_model_name=args.registered_model_name,
            artifact_path=args.registered_model_name,
        )

        # Saving the model to a file
        mlflow.sklearn.save_model(
            sk_model=decision_tree,
            path=os.path.join(args.registered_model_name, "trained_model"),
        )
        ###########################
        # </save and register model>
        ###########################

# Run the training pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

You might also like