# Notebook - Main Code
import argparse
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
def _parse_args(argv=None):
    """Parse the command-line options read by ``main``.

    Args:
        argv: Optional list of argument strings. ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``data``, ``test_train_ratio``, ``criterion``,
        ``random_state`` and ``registered_model_name``.
    """
    # NOTE(review): the original script read ``args.criterion``,
    # ``args.random_state`` and ``args.registered_model_name`` without ever
    # building ``args``. ``--data`` and ``--test_train_ratio`` are assumed
    # option names -- confirm them against the job that launches this script.
    parser = argparse.ArgumentParser(
        description="Train a decision tree classifier and register it via MLflow."
    )
    parser.add_argument(
        "--data", type=str, required=True, help="path to the input CSV file"
    )
    parser.add_argument(
        "--test_train_ratio",
        type=float,
        default=0.25,
        help="fraction of the rows held out for evaluation",
    )
    parser.add_argument(
        "--criterion",
        type=str,
        default="gini",
        help="split criterion passed to DecisionTreeClassifier",
    )
    parser.add_argument(
        "--random_state",
        type=int,
        default=None,
        help="seed used for the train/test split and the classifier",
    )
    parser.add_argument(
        "--registered_model_name",
        type=str,
        required=True,
        help="name under which the model is logged and registered",
    )
    return parser.parse_args(argv)


def _strip_tabs(dataset):
    """Return a copy of *dataset* whose non-int64 columns are tab-stripped strings."""
    cleaned = dataset.copy()
    for column in cleaned.columns:
        # int64 columns are left untouched; every other column is converted to
        # str (so missing values become the text "nan") and loses leading and
        # trailing '\t' characters.
        if cleaned[column].dtype != "int64":
            cleaned[column] = cleaned[column].astype(str).str.strip("\t")
    return cleaned


def _encode_features(inputs):
    """Label-encode every column of *inputs* and return the values as an array."""
    features = inputs.to_numpy(copy=True)
    for position in range(features.shape[1]):
        # A fresh encoder per column: each feature gets its own integer codes.
        encoder = preprocessing.LabelEncoder()
        features[:, position] = encoder.fit_transform(features[:, position])
    return features


def main(argv=None):
    """Main function of the script.

    Parses the command-line options, loads the dataset, strips stray tab
    characters, label-encodes the feature columns, trains a
    ``DecisionTreeClassifier``, prints a classification report for the
    held-out rows and logs/registers the model through MLflow.

    Args:
        argv: Optional list of argument strings; ``None`` (the default) uses
            the process command line.

    Raises:
        ValueError: If the loaded dataset has no ``class`` column.
    """
    args = _parse_args(argv)

    # Start Logging. Used as a context manager so the run is ended even when a
    # later step raises.
    with mlflow.start_run():
        # enable autologging
        mlflow.sklearn.autolog()

        ###################
        # <prepare the data>
        ###################
        print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

        # NOTE(review): the original never loaded ``dataset``; a CSV file with
        # a header row is assumed here -- confirm the real input format.
        dataset = pd.read_csv(args.data)
        if "class" not in dataset.columns:
            raise ValueError(
                f"input data {args.data!r} has no 'class' target column"
            )

        mlflow.log_metric("num_samples", dataset.shape[0])
        mlflow.log_metric("num_features", dataset.shape[1] - 1)

        # removing extra '\t' from data
        dataset = _strip_tabs(dataset)
        inputs = dataset.drop("class", axis="columns")
        y = dataset["class"]

        # LabelEncoding
        X = _encode_features(inputs)

        # The original used X_train/X_test/y_train/y_test without creating
        # them; split the encoded data here.
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=args.test_train_ratio,
            random_state=args.random_state,
        )
        ####################
        # </prepare the data>
        ####################

        ##################
        # <train the model>
        ##################
        decision_tree = DecisionTreeClassifier(
            criterion=args.criterion, random_state=args.random_state
        )
        decision_tree.fit(X_train, y_train)

        y_pred = decision_tree.predict(X_test)
        print(classification_report(y_test, y_pred))
        ###################
        # </train the model>
        ###################

        ##########################
        # <save and register model>
        ##########################
        # Registering the model to the workspace
        print("Registering the model via MLFlow")
        mlflow.sklearn.log_model(
            sk_model=decision_tree,
            registered_model_name=args.registered_model_name,
            artifact_path=args.registered_model_name,
        )
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()