HIV Regression Source Code
HIV Regression Source Code
In [1]:
import numpy as np
import pandas as pd
import os
class import_data():
'use class.data to get data'
def __init__(self):
while True:
self.path = input("Please input the path file (EX:...HIV Classificat
checkPath = os.path.isfile(self.path)
if checkPath == True:
break
self.data = None
self.data_csv()
def data_csv(self):
self.data = pd.read_csv(self.path)
display(self.data.head(2))
while True:
try:
col_drop = input("Please input the columns you want to drop? (En
if len(col_drop.strip()) == 0:
break
self.data = self.data.drop([col_drop], axis = 1)
except:
print("Error columns")
display(self.data.head(2))
In [4]:
# /Users/macbook/Documents/CH2020/Database Regression/HIV regression/Database fu
df = import_data()
data = df.data
pChEMBL
Name Smiles nAcid ALogP
Value
pChEMBL
nAcid ALogP ALogp2 AMR naAromAtom nAromBond nAtom nHeavyAtom
Value
class Data_cleaned:
def Duplicate_data(self):
#Duplicate rows:
dup_rows = self.data.duplicated()
print(f"Total duplicated rows: {(dup_rows == True).sum()}")
#Duplicate collumns:
self.data = self.data.T
dup_cols = self.data.duplicated()
print(f"Total similar columns: {(dup_cols == True).sum()}")
print("Data befor drop duplicates:", self.data.shape[0])
self.data.drop_duplicates(inplace = True)
self.data = self.data.T
print("Data after drop duplicates:", self.data.shape[1])
def Variance_Threshold(self):
X = self.data.values[:, 1:]
y = self.data.values[:, 0]
print(X.shape, y.shape)
def Missing_value_cleaning(self):
(self.data.isnull().sum()).sum()
print("Total missing value", (self.data.isnull().sum()).sum())
null_data = self.data[self.data.isnull().any(axis=1)]
display(null_data)
print("Total row with missing value", null_data.shape[0])
def Activate_Data_Cleaned(self):
self.Duplicate_data()
self.Variance_Threshold()
self.Low_variance_cleaning()
self.Missing_value_cleaning()
class noise_control(Data_cleaned):
def __init__(self, data):
self.data_0 = data
self.data = self.data_0.copy()
def feature_noise(self):
self.cols_remove = []
cols = []
while True:
feature_doub_1 = input("Please input 1st feature duplicated")
feature_doub_2 = input("Please input 2nd feature duplicated")
if feature_doub_1 and feature_doub_2 in self.data.columns:
cols.append(feature_doub_1)
cols.append(feature_doub_2)
self.cols_remove.append(feature_doub_1)
if len(feature_doub_1.strip()) == 0:
break
self.data_noise = self.data[cols]
def check_noise(self):
self.data_dif = pd.DataFrame()
for i in range(0, self.data_noise.shape[1]-1):
self.data_dif[f"{i}"] = self.data_noise.iloc[:, i+1] - self.
self.data_dif = self.data_dif.iloc[:, [i for i in range(0,self.data_dif
def check_index_noise(self):
self.idx = []
for i in range(0, self.data_dif.shape[1]):
def Activate_noise_control(self):
self.feature_noise()
self.check_noise()
self.check_index_noise()
class train_test_prepare(noise_control):
def Data_split(self):
self.df = self.data.copy()
while True:
self.RoC = input("Do you want to make classification?(Y/N)").title
if self.RoC == 'Y' or self.RoC == 'N':
break
if self.RoC.title() == "Y":
while True:
try:
self.thresh = float(input("Please input the threshold"))
break
except:
print("Error value!")
self.df1 = self.target_bin(thresh = self.thresh)
y = self.df1.iloc[:, 0].values
self.stratify = y
y = self.df1.iloc[:, 0].values
else:
self.stratify = None
y = self.df.iloc[:, 0].values
X = self.df.iloc[:, 1:].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size
#index:
self.idx = self.df.T.index
#Train:
self.df_X_train = pd.DataFrame(X_train)
self.df_y_train = pd.DataFrame(y_train)
self.df_train = pd.concat([self.df_y_train, self.df_X_train], axis
self.df_a = self.df_train.T
self.df_a = self.df_a.reset_index(drop = True)
for i in range(0,self.idx.size):
self.df_a.rename(index ={i: self.idx[i]},inplace= True)
self.Data_train = self.df_a.T
#test
self.df_X_test = pd.DataFrame(X_test)
self.df_y_test = pd.DataFrame(y_test)
self.df_test = pd.concat([self.df_y_test, self.df_X_test], axis = 1
self.df_b = self.df_test.T
self.df_b = self.df_b.reset_index(drop = True)
for i in range(0,self.idx.size):
self.df_b.rename(index ={i: self.idx[i]},inplace= True)
self.Data_test = self.df_b.T
def Visualize_target(self):
if self.RoC.title() == "Y":
plt.figure(figsize = (16,5))
plt.subplot(1,2,1)
plt.hist(self.Data_train.iloc[:,0])
plt.title(f'Imbalanced ratio: {((self.Data_train.iloc[:,0].values
plt.subplot(1,2,2)
plt.hist(self.Data_test.iloc[:,0])
plt.title(f'Imbalanced ratio: {((self.Data_test.iloc[:,0].values
plt.show()
else:
plt.figure(figsize = (16,5))
plt.subplot(1,2,1)
plt.hist(self.Data_train.iloc[:,0])
plt.title(f'Train distribution')
plt.subplot(1,2,2)
plt.hist(self.Data_test.iloc[:,0])
plt.title(f'Test distribution')
plt.show()
def Nomial(self):
DF_1 = self.data_0.select_dtypes("int64")
DF_2 = DF_1.loc[:, (DF_1.nunique() <10).values & (DF_1.max() <10).values
idx2 = DF_2.T.index #select columns with int64
idx3 = self.Data_train.T.index #select all columns in data_train
idx4 = idx2.intersection(idx3) #idx4 are int64 cols in Data_train
self.Data_train[idx4]=self.Data_train[idx4].astype('int64') #set all id
self.Data_test[idx4]=self.Data_test[idx4].astype('int64')
if self.RoC == 'Y':
self.Data_train.iloc[:,0] = self.Data_train.iloc[:,0].astype('int64'
self.Data_test.iloc[:,0] = self.Data_test.iloc[:,0].astype('int64'
def Activate(self):
self.Activate_Data_Cleaned()
self.Activate_noise_control()
self.Data_split()
self.Visualize_target()
self.Nomial()
In [6]:
df=train_test_prepare(data)
df.Activate()
Error threshold!
Các cặp thông số mang ý nghĩa giống nhau nhưng không thể loại bỏ bằng
Duplicated_row! (Cần cập nhật để loại noise)
diameter : topoDiameter
radius : topoRadius
weinerPath : WPATH
weinerPol : WPOL
zagreb : Zagreb
In [7]:
Data_train = df.Data_train
Data_test = df.Data_test
def Check_quantity_features(self):
self.good = []
self.bad = []
self.df_train = self.data_train.copy()
self.df_test = self.data_test.copy()
for col_name in self.df_train.select_dtypes("float64").columns:
q1 = self.df_train[col_name].quantile(0.25)
q3 = self.df_train[col_name].quantile(0.75)
iqr = q3-q1
remove = self.data_train.shape[0] - (self.df_train[(self.df_train
if remove == 0:
self.good.append(col_name)
else:
self.bad.append(col_name)
print(f"Number of good features: {len(self.good)}")
print(f"Number of bad features with data remove > 0: {len(self.bad)
print("*"*75)
def Check_remove_outlier(self):
self.Check_remove_data()
self.Check_quantity_features()
def Outlier_Winsor(self):
print("Handling with Winsorization method")
self.df_train = self.data_train_0.copy()
self.df_test = self.data_test_0.copy()
for col_name in self.df_train.select_dtypes(include="float64").columns
q1 = self.df_train[col_name].quantile(0.25)
q3 = self.df_train[col_name].quantile(0.75)
iqr = q3-q1
self.df_train.loc[(self.df_train[col_name] <= (q1-1.5*iqr)), col_nam
self.df_train.loc[(self.df_train[col_name] >= (q3+1.5*iqr)), col_nam
#for test
self.df_test.loc[(self.df_test[col_name] <= (q1-1.5*iqr)), col_name
self.df_test.loc[(self.df_test[col_name] >= (q3+1.5*iqr)), col_name
self.data_train = self.df_train
self.data_test = self.df_test
self.Check_remove_outlier()
def Transformation(self):
self.df_train = self.data_train_0.copy()
self.df_test = self.data_test_0.copy()
#Train
while True:
try:
self.transformer = int(input("Please select type of transformati
break
except:
print("Error values! Input number!")
if self.transformer == 1:
self.scl =self.scl1
print("Handling with Transformation_Powertransformer method")
elif self.transformer == 2:
self.scl =self.scl2
print("Handling with Transformation_Gaussiantransformer method"
else:
self.scl =self.scl3
print("Handling with Transformation_Uniformtransformer method")
#Train
df_train_int = self.df_train.select_dtypes("int64")
df_train_int = df_train_int.reset_index(drop = True)
y_train = self.df_train.select_dtypes("float64").iloc[:,0].values
X_train = self.df_train.select_dtypes("float64").iloc[:,1:].values
self.scl.fit(X_train)
X_train_trans = self.scl.transform(X_train)
idx = self.df_train.select_dtypes("float64").T.index
df_X_train = pd.DataFrame(X_train_trans)
df_y_train = pd.DataFrame(y_train)
df_train = pd.concat([df_y_train, df_X_train], axis = 1)
df_a = df_train.T
df_a = df_a.reset_index(drop = True)
for i in range(0,idx.size):
df_a.rename(index ={i: idx[i]},inplace= True)
Data_train_float = df_a.T
self.data_train = pd.concat([Data_train_float , df_train_int], axis
#test
df_test_int = self.df_test.select_dtypes("int64")
df_test_int = df_test_int.reset_index(drop = True)
y_test = self.df_test.select_dtypes("float64").iloc[:,0].values
X_test = self.df_test.select_dtypes("float64").iloc[:,1:].values
X_test_trans = self.scl.transform(X_test)
idx = self.df_test.select_dtypes("float64").T.index
df_X_test = pd.DataFrame(X_test_trans)
df_y_test = pd.DataFrame(y_test)
df_test = pd.concat([df_y_test, df_X_test], axis = 1)
df_b = df_test.T
df_b = df_b.reset_index(drop = True)
for i in range(0,idx.size):
df_b.rename(index ={i: idx[i]},inplace= True)
Data_test_float = df_b.T
self.data_test = pd.concat([Data_test_float , df_test_int], axis =
self.Check_remove_outlier()
input_point = input("Do you want to use KBin method for this Transformat
point = input_point.title()
if point == "Y":
self.KBin()
else:
pass
self.data_train_clean = pd.concat([self.data_train_good,self.bad_new
self.data_train = self.data_train_clean
#test
self.data_test_int = self.data_test.select_dtypes('int64')
self.data_test_good = self.data_test[self.good]
self.data_test_bad = self.data_test[self.bad]
self.bad_new = pd.DataFrame(kst.transform(self.data_test_bad)).astype
self.data_test_clean = pd.concat([self.data_test_good,self.bad_new,
self.data_test = self.data_test_clean
self.Check_remove_outlier()
def Activate_Check(self):
print('remove by IQR without handling')
self.Check_remove_outlier()
self.Outlier_Winsor()
self.Transformation()
In [10]:
df1 = Check_Univariate_outlier(Data_train, Data_test)
df1.Check_remove_outlier()
df1.Activate_Check()
Total data remove on Train 1399
Total data remove on Test 351
Number of good features: 73
Number of bad features with data remove > 0: 673
***************************************************************************
remove by IQR without handling
Total data remove on Train 1399
Total data remove on Test 351
Number of good features: 73
Number of bad features with data remove > 0: 673
***************************************************************************
Handling with Winsorization method
Total data remove on Train 0
Total data remove on Test 0
Number of good features: 746
Number of bad features with data remove > 0: 0
***************************************************************************
In [11]:
Data_train = df1.data_train
Data_test = df1.data_test
In [13]:
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
def LOF(self):
self.data_train_LOF = self.data_train_0.copy()
self.data_test_LOF = self.data_test_0.copy()
while True:
try:
self.n_neighbors = int(input("Please input number of neighbors f
break
except:
print("Error values!")
LOF = LocalOutlierFactor(n_neighbors = self.n_neighbors)
LOF.fit(self.data_train_LOF)
self.Outlier_LOF = self.data_train_LOF[LOF.fit_predict(self.data_train_L
self.Data_train_LOF = self.data_train_LOF[LOF.fit_predict(self.data_trai
print(f"Total outlier remove by LOF:", self.Outlier_LOF.shape[0])
#Test
LOF = LocalOutlierFactor(n_neighbors = self.n_neighbors, novelty =
LOF.fit(self.data_train_LOF)
self.Data_test_LOF = self.data_test_LOF[LOF.predict(self.data_test_LOF
def Ist_for(self):
self.data_train_Ist_for = self.data_train_0.copy()
self.data_test_Ist_for = self.data_test_0.copy()
while True:
try:
self.n_estimators = int(input("Please input number of estimators
self.contamination = float(input("Please input number of contami
break
except:
print("Error values!")
Iso_for = IsolationForest(n_estimators=self.n_estimators, contamination
Iso_for.fit(self.data_train_Ist_for)
self.Outlier_iso = self.data_train_Ist_for[Iso_for.predict(self.data_tra
self.Data_train_iso = self.data_train_Ist_for[Iso_for.predict(self.
self.Data_test_iso = self.data_test_Ist_for[Iso_for.predict(self.data_te
print(f"Total outlier remove by Isolation forest:", self.Outlier_iso
def o_SVM(self):
self.data_train_o_SVM = self.data_train_0.copy()
self.data_test_o_SVM = self.data_test_0.copy()
o_SVM = OneClassSVM()
o_SVM.fit(self.data_train_o_SVM)
self.Outlier_osvm = self.data_train_o_SVM[o_SVM.predict(self.data_train_
self.Data_train_osvm = self.data_train_o_SVM[o_SVM.predict(self.data_tra
self.Data_test_osvm = self.data_test_o_SVM[o_SVM.predict(self.data_test_
print(f"Total outlier remove by One Class SVM:", self.Outlier_osvm.
def robust_cov(self):
self.data_train_r_cov = self.data_train_0.copy()
self.data_test_r_cov = self.data_test_0.copy()
while True:
try:
self.contamination = float(input("Please input number of contami
break
except:
print("Error values!")
robust_cov = EllipticEnvelope(contamination= self.contamination)
robust_cov.fit(self.data_train_r_cov)
self.Outlier_rcov = self.data_train_r_cov[robust_cov.predict(self.data_t
self.Data_train_rcov = self.data_train_r_cov[robust_cov.predict(self
self.Data_test_rcov = self.data_test_r_cov[robust_cov.predict(self.
print(f"Total outlier remove by Robust covariance:", self.Outlier_rcov
def emp_cov(self):
self.data_train_e_cov = self.data_train_0.copy()
self.data_test_e_cov = self.data_test_0.copy()
while True:
try:
self.contamination = float(input("Please input number of contami
self.support_fraction = float(input("Please input number of supp
break
except:
print("Error values!")
emp_cov = EllipticEnvelope(contamination= self.contamination, support_fr
emp_cov.fit(self.data_train_e_cov)
self.Outlier_ecov = self.data_train_e_cov[emp_cov.predict(self.data_trai
self.Data_train_ecov = self.data_train_e_cov[emp_cov.predict(self.data_t
self.Data_test_ecov = self.data_test_e_cov[emp_cov.predict(self.data_tes
print(f"Total outlier remove by Emperical covariance:", self.Outlier_eco
def Visualize_Outlier(self):
self.LOF()
self.Ist_for()
self.o_SVM()
self.robust_cov()
self.emp_cov()
Models = [('Local Outlier Factor', self.Outlier_LOF.shape[0]), ('Isolat
('One Class SVM', self.Outlier_osvm.shape[0]),('Robust covaria
for name, N_out in Models:
plt.rcParams["figure.figsize"] = (20,8)
plt.bar(name,N_out)
def Mutivariate_Outlier_Handling(self):
while True:
try:
algo = input("Please select algorithm for multivariate method:
break
except:
print("Wrong! Please input number from 1-5.")
if algo == 1:
self.LOF()
elif algo == 2:
self.Ist_for()
elif algo == 3:
self.o_SVM()
elif algo == 4:
self.robust_cov()
elif algo == 5:
self.emp_cov()
else:
self.Mutivariate_Outlier_Handling()
In [ ]:
df4= Mutivariate(Data_train, Data_test)
df4.Visualize_Outlier()
In [14]:
df4= Mutivariate(Data_train, Data_test)
df4.LOF()
In [15]:
Data_train = df4.Data_train_LOF
Data_test = df4.Data_test_LOF
Module 4: Rescale
In [16]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
class rescale(Mutivariate):
def __init__(self, data_train, data_test):
self.data_train_0 = data_train
self.data_test_0 = data_test
self.scl1 = MinMaxScaler()
self.scl2 = StandardScaler()
self.scl3 = RobustScaler()
def rescale_fit(self):
self.data_train = self.data_train_0.copy()
self.data_test = self.data_test_0.copy()
while True:
try:
self.transformer = int(input("Please select type of transformati
break
except:
print("Error value")
if self.transformer == 1:
self.scl =self.scl1
elif self.transformer == 2:
self.scl =self.scl2
else:
self.scl =self.scl3
#Train
df_train_int = self.data_train.select_dtypes("int64")
df_train_int = df_train_int.reset_index(drop = True)
y_train = self.data_train.select_dtypes("float64").iloc[:,0].values
X_train = self.data_train.select_dtypes("float64").iloc[:,1:].values
self.scl.fit(X_train)
X_train_trans = self.scl.transform(X_train)
idx = self.data_train.select_dtypes("float64").T.index
df_X_train = pd.DataFrame(X_train_trans)
df_y_train = pd.DataFrame(y_train)
df_train = pd.concat([df_y_train, df_X_train], axis = 1)
df_a = df_train.T
df_a = df_a.reset_index(drop = True)
for i in range(0,idx.size):
df_a.rename(index ={i: idx[i]},inplace= True)
Data_train_float = df_a.T
self.Data_train = pd.concat([Data_train_float , df_train_int], axis
#Test
df_test_int = self.data_test.select_dtypes("int64")
df_test_int = df_test_int.reset_index(drop = True)
y_test = self.data_test.select_dtypes("float64").iloc[:,0].values
X_test = self.data_test.select_dtypes("float64").iloc[:,1:].values
X_test_trans = self.scl.transform(X_test)
idx = self.data_test.select_dtypes("float64").T.index
df_X_test = pd.DataFrame(X_test_trans)
df_y_test = pd.DataFrame(y_test)
df_test = pd.concat([df_y_test, df_X_test], axis = 1)
df_b = df_test.T
df_b = df_b.reset_index(drop = True)
for i in range(0,idx.size):
df_b.rename(index ={i: idx[i]},inplace= True)
Data_test_float = df_b.T
self.Data_test = pd.concat([Data_test_float , df_test_int], axis =
In [17]:
df5 = rescale(Data_train, Data_test)
df5.rescale_fit()
In [18]:
df5.Data_train.head(2)
Out[18]: pChEMBL
ALogP ALogp2 AMR naAromAtom nAromBond nAtom nHeavyAtom
Value
In [19]:
Data_train = df5.Data_train
Data_test = df5.Data_test
In [20]:
X_train = Data_train.iloc[:,1:].values
y_train = Data_train.iloc[:,0].values
X_test = Data_test.iloc[:,1:].values
y_test = Data_test.iloc[:,0].values
In [28]:
import matplotlib.pyplot as plt
class feature_selection:
def __init__(self, data_train, data_test):
self.X_train = data_train.iloc[:,1:].values
self.y_train = data_train.iloc[:,0].values
self.X_test = data_test.iloc[:,1:].values
self.y_test = data_test.iloc[:,0].values
self.result = list()
self.name = list()
def random_forest(self):
forest = RandomForestRegressor(random_state=42)
forest.fit(self.X_train, self.y_train)
model_RF = SelectFromModel(forest, prefit=True)
self.X_train_new = model_RF.transform(self.X_train)
self.X_test_new = model_RF.transform(self.X_test)
self.name.append("Random Forest")
self.check_intenal_performance()
def extra_tree(self):
ext_tree = ExtraTreesRegressor(random_state=42)
ext_tree.fit(self.X_train, self.y_train)
model_ext_tree = SelectFromModel(ext_tree, prefit=True)
self.X_train_new = model_ext_tree.transform(self.X_train)
self.X_test_new = model_ext_tree.transform(self.X_test)
self.name.append("ExtraTree")
self.check_intenal_performance()
def ada(self):
ada = AdaBoostRegressor(random_state=42)
ada.fit(self.X_train, self.y_train)
model_ada = SelectFromModel(ada, prefit=True)
self.X_train_new = model_ada.transform(self.X_train)
self.X_test_new = model_ada.transform(self.X_test)
self.name.append("AdaBoost")
self.check_intenal_performance()
def grad(self):
grad = GradientBoostingRegressor(random_state=42)
grad.fit(self.X_train, self.y_train)
model_grad = SelectFromModel(grad, prefit=True)
self.X_train_new = model_grad.transform(self.X_train)
self.X_test_new = model_grad.transform(self.X_test)
self.name.append("GradientBoost")
self.check_intenal_performance()
def XGb(self):
XGb = XGBRegressor(random_state=42)
XGb.fit(self.X_train, self.y_train)
model_XGb = SelectFromModel(XGb, prefit=True)
self.X_train_new = model_XGb.transform(self.X_train)
self.X_test_new = model_XGb.transform(self.X_test)
self.name.append("XGBoost")
self.check_intenal_performance()
def Lasso(self):
lasso = LassoCV(random_state = 42)
lasso.fit(self.X_train, self.y_train)
model_lasso = SelectFromModel(lasso, prefit=True)
self.X_train_new = model_lasso.transform(self.X_train)
self.X_test_new = model_lasso.transform(self.X_test)
self.name.append("lasso")
self.check_intenal_performance()
def ELN(self):
ELN = ElasticNetCV(random_state = 42)
ELN.fit(self.X_train, self.y_train)
model_ELN = SelectFromModel(ELN, prefit=True)
self.X_train_new = model_ELN.transform(self.X_train)
self.X_test_new = model_ELN.transform(self.X_test)
self.name.append("ElasticNet")
self.check_intenal_performance()
def feature_importance(self):
model = RandomForestRegressor(random_state = 42)
model.fit(self.X_train, self.y_train)
importance = model.feature_importances_
while True:
threshold = float(input("Select features importances threshold"
print("The remain features = ", (importance > threshold).sum())
action = input("Do you want to check another threshold?(Y/N)")
if action.title() == 'N':
break
self.X_train_new=self.X_train[:,importance > threshold]
self.X_test_new= self.X_test[:,importance > threshold]
self.name.append("Feature Importance")
self.check_intenal_performance()
def check_performance(self):
forest_model = RandomForestRegressor(random_state=42)
forest_model.fit(self.X_train_new, self.y_train)
def check_intenal_performance(self):
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
in_model = RandomForestRegressor(random_state=42)
score_internal = cross_val_score(in_model, self.X_train_new, self.y_trai
print(score_internal.mean())
self.result.append(score_internal)
def model_feature_selection(self):
while True:
try:
models = int(input("Please select algorithm for feature selectio
break
except:
print("\nWrong values! Input number from 1-5!")
if models == 1:
self.random_forest()
elif models == 2:
self.extra_tree()
elif models ==3:
self.ada()
elif models == 4:
self.grad()
elif models == 5:
self.XGb()
elif models == 6:
self.feature_importance()
else:
self.model_feature_selection()
def compare_model(self):
fig =plt.figure(figsize = (20,8))
self.result = list()
self.name = list()
self.random_forest()
self.extra_tree()
self.ada()
self.grad()
self.XGb()
self.Lasso()
self.ELN()
self.feature_importance()
In [29]:
Descriptor_select = feature_selection(Data_train, Data_test)
Descriptor_select.compare_model()
-0.7358630025248843
-0.7281728504218296
-0.745356557436394
-0.7261001611653198
-0.7343649689156617
-0.7552536490535624
-0.7507194810054509
-0.7360665483457994
In [31]:
# Use Anova test to choose feature selection method
d = pd.DataFrame(Descriptor_select.result)
idx = Descriptor_select.name
for i in range(0,len(idx)):
d.rename(index ={i: idx[i]},inplace= True)
check_result = d.T
Trong đó có 3pp:
Extra Tree
XGboost
ElasticNet CV
15 fold đều cho kết quả không lệch quá nhiều Chọn KQ tốt nhất là Extra Tree
In [35]:
Descriptor_select = feature_selection(Data_train, Data_test)
Descriptor_select.extra_tree()
-0.7281728504218296
RFE METHOD # evaluate RFE for regression from numpy import mean from numpy import std from
sklearn.datasets import make_regression from sklearn.model_selection import cross_val_score from
sklearn.model_selection import RepeatedKFold from sklearn.feature_selection import RFECV from
sklearn.pipeline import Pipeline # create pipeline rfe =
RFECV(estimator=RandomForestRegressor(random_state=42)) model =
RandomForestRegressor(random_state=42) pipeline = Pipeline(steps=[('s',rfe),('m',model)]) # evaluate
model cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42) n_scores =
cross_val_score(pipeline, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1,
error_score='raise') # report performance print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
In [37]:
X_train = Descriptor_select.X_train_new
X_test = Descriptor_select.X_test_new
y_train = Descriptor_select.y_train
y_test = Descriptor_select.y_test
1. Auto Model
In [ ]:
from Auto_ML.Auto_ML_HHC import LabHHCRegressor
reg = LabHHCRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models
In [114…
models
Model
def model(self):
self.regressors = [('Linear Regression', self.lr),('Ridge Regression'
('Decision Tree', self.dt), ('Random Forest', self.rf), ('AdaBoos
('Gradient Boosting Regressor', self.gbr), ('XGBoost', self.xgb
def Report_metrics(self):
self.P_train =self.estimator.predict(self.X_train)
self.P_test =self.estimator.predict(self.X_test)
if self.create_df==True:
r2_train = r2_score(self.y_train,self.P_train)
r2_test = r2_score(self.y_test,self.P_test)
r_squared_train = (1 - (1-r2_train) * ((self.X_train.shape[0]-1
r_squared_test = (1 - (1-r2_test) * ((self.X_test.shape[0]-1) /
#train
self.metrics_df["Estimator Name"]= self.name
df_compared_train = self.metrics_df.drop(['Test', "Estimator Name"
df_compared_train = df_compared_train.rename(columns ={'Train':
self.df_compare_train = self.df_compare_train.append(df_compared_tra
#test
self.metrics_df["Estimator Name"]= self.name
df_compared_test = self.metrics_df.drop(['Train', "Estimator Name"
df_compared_test = df_compared_test.rename(columns ={'Test': self
self.df_compare_test = self.df_compare_test.append(df_compared_test
else:
self.metrics_df ="File not created"
def Visualize_report(self):
while True:
try:
metric = int(input("Which metric do you want to visualize?\n\t
break
except:
print("Wrong metric! Please input number!")
for self.name, self.regressor in self.regressors:
# Predict
y_pred = self.regressor.predict(X_test)
plt.rcParams["figure.figsize"] = (36,15)
ax = plt.bar(self.name,result)
plt.ylabel(name)
plt.xlabel("Algorithm")
plt.title(f"{name} compare", size = 20)
for p in ax.patches:
x = p.get_x()+ (p.get_width()/3)
y = p.get_height()+0.05
plt.text(x, y, round(result,3), fontsize=15)
In [112…
auto_models = Regression_report(X_train, X_test, y_train, y_test, create_df
auto_models.model()
In [113…
auto_models.Visualize_report()
In [115…
auto_models.df_compare_test
Partial Least Squares 0.59 0.36 0.81 0.64 0.52 0.11 347.00
Support vector
0.71 0.56 0.68 0.50 0.39 0.09 347.00
machine
Gradient Boosting
0.70 0.53 0.70 0.54 0.44 0.09 347.00
Regressor
Partial Least Squares 0.68 0.65 0.76 0.59 0.50 0.10 1376.00
Support vector
0.84 0.83 0.53 0.35 0.18 0.06 1376.00
machine
Gradient Boosting
0.84 0.83 0.53 0.41 0.33 0.07 1376.00
Regressor
Tunning SVM
In [117…
models = SVR()
In [141…
from sklearn.model_selection import GridSearchCV
cv = RepeatedKFold(5,3)
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel':
grid = GridSearchCV(models,param_grid,refit=True,verbose=1, cv = cv)
grid.fit(X_train,y_train)
In [143…
print(grid.best_estimator_.C)
print(grid.best_estimator_.gamma)
print(grid.best_estimator_.kernel)
1
0.01
rbf
In [144…
#Before
svr = SVR()
svr.fit(X_train, y_train)
RMSE = mean_squared_error(y_test, svr.predict(X_test), squared = False)
MAPE = mean_absolute_percentage_error(y_test, svr.predict(X_test))
MAPE*100
Out[144… 8.547101079121475
In [145…
#After tunning
model_tuning = SVR(kernel = 'rbf', gamma = 0.01, C = 1)
model_tuning.fit(X_train, y_train)
RMSE = mean_squared_error(y_test, model_tuning.predict(X_test), squared = False
MAPE = mean_absolute_percentage_error(y_test, model_tuning.predict(X_test))
MAPE*100
Out[145… 8.449803139071971
In [ ]: