New Text Document
New Text Document
# Number of trailing observations held out for the out-of-sample error check.
_HOLDOUT_OBS = 4


def _summary_stat(diag_table, label):
    """Return the value stored next to *label* in a summary2 diagnostics table."""
    # Column 2 holds the statistic names and column 3 the matching values.
    return diag_table.loc[diag_table[2] == label, 3].iloc[0]


def _model_diagnostics(results, x, diag_table):
    """Return the per-model diagnostics keyed by output column name (in order)."""
    return {
        "p_val_jarque_bera_test": _summary_stat(diag_table, 'Prob(JB):'),
        "p_val_breusch_pagan_test": sms.het_breuschpagan(
            results.resid, results.model.exog)[1],
        "test_stat_durbin_watson": _summary_stat(diag_table, 'Durbin-Watson:'),
        "p_val_adf_test": adfuller(results.resid)[1],
        "p_val_kpss_test": kpss(results.resid)[1],
        "p_val_rainbow_test": linear_rainbow(results)[1],
        # One VIF per design-matrix column, constant included.
        "vif": [variance_inflation_factor(x.values, i)
                for i in range(x.shape[1])],
        "adj_r_squared": results.rsquared_adj,
        "aic_is": results.aic,
        "bic_is": results.bic,
    }


def _passes_significance_and_sign(table, exp_sign_data, sig_lvl):
    """Check slope p-values against sig_lvl and slope signs against exp_sign_data."""
    slopes = table[table.index != 'const']
    if not all(slopes['p_value'] <= sig_lvl):
        return False
    sign_chk = pd.DataFrame({'act_sign': np.sign(slopes['coefficient'])})
    sign_chk.reset_index(inplace=True)
    # The text before the first underscore of a variable name is its MEV key.
    sign_chk["mev"] = [name.split("_")[0] for name in sign_chk['index']]
    sign_chk = pd.merge(sign_chk, exp_sign_data, how="left", on='mev')
    # A variable without an expected sign compares unequal and fails the check.
    return bool(all(sign_chk['act_sign'] == sign_chk['exp_sign']))


def _passes_diagnostics(stats, is_slope, sig_lvl, dw_thld, vif_thld,
                        adj_r_sq_thld):
    """Apply the normality, homoscedasticity, DW, rainbow, VIF and adj-R2 gates."""
    dw_stat = float(stats["test_stat_durbin_watson"])
    slope_vifs = [v for v, keep in zip(stats["vif"], is_slope) if keep]
    # ADF / KPSS p-values are reported but deliberately not used as a gate.
    # Comparisons are written so that a NaN statistic rejects the model.
    return bool(
        float(stats["p_val_jarque_bera_test"]) >= sig_lvl
        and stats["p_val_breusch_pagan_test"] >= sig_lvl
        and dw_thld[0] <= dw_stat <= dw_thld[1]
        and stats["p_val_rainbow_test"] >= sig_lvl
        and all(v <= vif_thld for v in slope_vifs)
        and stats["adj_r_squared"] > adj_r_sq_thld
    )


def _holdout_errors(y, x_var):
    """Fit on all but the last _HOLDOUT_OBS rows; return (MAPE, RMSE) on the rest."""
    split = len(y) - _HOLDOUT_OBS
    x_train = sm.add_constant(x_var.iloc[:split], has_constant='add')
    x_test = sm.add_constant(x_var.iloc[split:], has_constant='add')
    y_test = y.iloc[split:]
    y_hat = sm.OLS(y.iloc[:split], x_train).fit().predict(x_test)
    return (mean_absolute_percentage_error(y_test, y_hat),
            np.sqrt(metrics.mean_squared_error(y_test, y_hat)))


def _finalise_table(table, model_id, n_factors):
    """Move the variable names into a column and prepend the model id columns."""
    table = table.reset_index()
    table.rename(columns={'index': 'variable'}, inplace=True)
    table.insert(loc=0, column='model_id', value=model_id)
    table.insert(loc=1, column='no_of_factor', value=n_factors)
    return table


def exhaustive_search_reg(dep_data=odr_data,
                          inp_data=mev_output_data,
                          exp_sign_data=MEV_Additional_Information,
                          max_no_ind_var=2,
                          sig_lvl=.05,
                          dw_thld=(1.5, 2.5),
                          vif_thld=2,
                          adj_r_sq_thld=0.5,
                          no_of_kfold=2):
    """
    Fit an OLS model for every combination of up to ``max_no_ind_var``
    explanatory variables and report all models plus the candidate models.

    The dependent variable is the ``odr`` column transformed to log-odds.
    Each combination is fitted once; its coefficient table is extended with
    diagnostic statistics, in-sample errors and a k-fold score.

    Parameters
    ----------
    dep_data : dataframe, Mandatory
        Must contain the columns ``date`` and ``odr``.
        The default is odr_data.
    inp_data : dataframe, Mandatory
        Must contain ``date``; every other column that has no missing value
        after the inner join on ``date`` is used as an explanatory variable.
        The default is mev_output_data.
    exp_sign_data : dataframe, Mandatory
        Must contain the columns ``mev`` and ``exp_sign``; ``mev`` is matched
        against the text before the first underscore of each variable name.
        The default is MEV_Additional_Information.
    max_no_ind_var : int, optional
        Maximum number of factors in a model. The default is 2.
    sig_lvl : float, optional
        Significance level for coefficient p-values and for the
        Jarque-Bera, Breusch-Pagan and rainbow tests. The default is .05.
    dw_thld : sequence with two numeric elements, optional
        Inclusive lower and upper bound for the Durbin-Watson statistic.
        The default is (1.5, 2.5).
    vif_thld : int, optional
        Maximum VIF allowed for any non-constant variable. The default is 2.
    adj_r_sq_thld : float, optional
        Adjusted R-squared must exceed this value. The default is 0.5.
    no_of_kfold : int, optional
        Currently unused. The default is 2.
        NOTE(review): the cross-validation splitter is read from the
        module-level name ``kf``; presumably it should be built from this
        argument - confirm before changing.

    Returns
    -------
    tuple of two dataframes
        ``(final_reg_output_data_full, final_reg_output_data)``.
        The first holds one row per coefficient for every fitted model.
        The second holds the models whose slopes are all significant and
        carry the expected sign and, when the module-level flag
        ``flag_applycheck`` equals True, that also pass the diagnostic
        gates. It additionally carries ``dev_mape_os`` / ``dev_rmse_os``
        and is sorted by ``adj_r_squared`` then ``model_id``, descending.
        It is an empty dataframe when no model qualifies.
    """
    reg_data = pd.merge(dep_data, inp_data, how="inner",
                        on="date").dropna(axis=1)
    odr = reg_data['odr']
    ## transform dep var in log odd space
    y = np.log(odr / (1 - odr)).reset_index(drop=True)
    reg_ind_set = reg_data.drop(['date', 'odr'], axis=1)

    ## create combination of variable upto max_no_of factor
    # NOTE(review): an earlier revision computed a "one transformation per
    # MEV group" filter here and then immediately overrode it with the full
    # list; the filter stays disabled so every combination is evaluated.
    var_list1 = [combo
                 for size in range(1, max_no_ind_var + 1)
                 for combo in combinations(reg_ind_set.columns, size)]

    full_tables = []
    candidate_tables = []
    id_cnt = 1
    for id_cnt_full, sel_var in enumerate(var_list1, start=1):
        x_var = reg_ind_set.loc[:, list(sel_var)].reset_index(drop=True)
        ## add constant
        x = sm.add_constant(x_var, has_constant='add')
        ## fitting regression
        results = sm.OLS(y, x).fit()

        # summary2() is expensive, so build it once per model.
        summary = results.summary2()
        table = pd.DataFrame(summary.tables[1]).iloc[:, 0:4].copy()
        table.columns = ["coefficient", "standard_error", "t_value",
                         "p_value"]

        stats = _model_diagnostics(results, x, summary.tables[2])
        for column, value in stats.items():
            table[column] = value

        ## calculate insample rmse and mape of dev sample of the model
        y_pred = results.predict(x)
        table["dev_mape_is"] = mean_absolute_percentage_error(y, y_pred)
        table["dev_rmse_is"] = np.sqrt(metrics.mean_squared_error(y, y_pred))

        ## calculate model kfold avg score
        kfold_score = cross_val_score(
            SMWrapper(sm.OLS), x, y, cv=kf,
            scoring='neg_root_mean_squared_error').mean()

        ## full list of models: every combination is reported
        full_table = table.copy()
        full_table["kfold_score"] = kfold_score
        full_tables.append(
            _finalise_table(full_table, id_cnt_full, len(sel_var)))

        ## candidate models: significance and expected sign are always checked
        if not _passes_significance_and_sign(table, exp_sign_data, sig_lvl):
            continue
        ## remaining diagnostic gates only apply when the flag is switched on
        if flag_applycheck == True and not _passes_diagnostics(
                stats, table.index != 'const', sig_lvl, dw_thld, vif_thld,
                adj_r_sq_thld):
            continue

        ## out-of-sample errors on the held-out tail of the sample
        mape_os, rmse_os = _holdout_errors(y, x_var)
        table["dev_mape_os"] = mape_os
        table["dev_rmse_os"] = rmse_os
        table["kfold_score"] = kfold_score
        ## appending each model output to final data
        candidate_tables.append(_finalise_table(table, id_cnt, len(sel_var)))
        id_cnt += 1

    # pd.concat rejects an empty list, so fall back to an empty frame.
    final_reg_output_data_full = (pd.concat(full_tables) if full_tables
                                  else pd.DataFrame())
    final_reg_output_data = (pd.concat(candidate_tables) if candidate_tables
                             else pd.DataFrame())
    # Sorting an empty frame by these columns would raise KeyError.
    if not final_reg_output_data.empty:
        final_reg_output_data.sort_values(by=['adj_r_squared', 'model_id'],
                                          ascending=False, inplace=True)
    return (final_reg_output_data_full, final_reg_output_data)