0% found this document useful (0 votes)
33 views7 pages

New Text Document

Uploaded by

Gaurav Soni
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
33 views7 pages

New Text Document

Uploaded by

Gaurav Soni
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 7

##14.

Perform exhaustive search to get model statistics


#######################################################################
###################################################################################
############################################
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of *y_pred* vs *y_true*, in percent.

    NOTE(review): divides by ``y_true`` element-wise, so the result is
    undefined (inf/nan) if any true value is zero — callers must ensure
    non-zero actuals.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return 100 * np.mean(relative_errors)

def _coef_table(fit_result):
    """Coefficient / std-error / t / p table from a fitted OLS result."""
    table = pd.DataFrame(fit_result.summary2().tables[1]).iloc[:, 0:4]
    table.columns = ["coefficient", "standard_error", "t_value", "p_value"]
    return table


def _summary_stat(fit_result, label, pos):
    """Pull one diagnostic value (e.g. 'Prob(JB):', 'Durbin-Watson:') from
    the second table of ``summary2()``; *pos* is the row label of the value
    within the filtered column, matching the original lookups."""
    tbl = fit_result.summary2().tables[2]
    return tbl.loc[tbl[2] == label, 3][pos]


def _attach_performance(table, fit_result, x, x_var, y, no_of_kfold):
    """Attach in-sample AIC/BIC/MAPE/RMSE, last-4-observation holdout
    metrics, and the k-fold CV score to *table* (mutated in place).

    NOTE(review): the holdout is hard-wired to the final 4 observations
    (presumably 4 quarters) — confirm against the data frequency.
    """
    # In-sample fit statistics.
    table.loc[:, "aic_is"] = fit_result.aic
    table.loc[:, "bic_is"] = fit_result.bic
    y_pred = fit_result.predict(x)
    table.loc[:, "dev_mape_is"] = mean_absolute_percentage_error(y, y_pred)
    # FIX: RMSE columns previously stored raw MSE in most branches;
    # apply sqrt consistently so "rmse" really is RMSE.
    table.loc[:, "dev_rmse_is"] = np.sqrt(metrics.mean_squared_error(y, y_pred))

    # Out-of-sample: train on all but the last 4 rows, test on the last 4.
    n_obs = x_var.shape[0]
    x_var_train = x_var.loc[0:(n_obs - 4) - 1, :]
    x_var_test = x_var.loc[(n_obs - 4):n_obs, :]
    y_train = y[0:len(y) - 4]
    y_test = y[len(y) - 4:len(y)]
    x_train_c = sm.add_constant(x_var_train, has_constant='add')
    x_test_c = sm.add_constant(x_var_test, has_constant='add')
    holdout_fit = sm.OLS(y_train, x_train_c).fit()
    table.loc[:, "aic_os"] = holdout_fit.aic
    table.loc[:, "bic_os"] = holdout_fit.bic
    y_pred_os = holdout_fit.predict(x_test_c)
    table.loc[:, "dev_mape_os"] = mean_absolute_percentage_error(y_test, y_pred_os)
    table.loc[:, "dev_rmse_os"] = np.sqrt(metrics.mean_squared_error(y_test, y_pred_os))

    # k-fold cross-validated RMSE (negated by sklearn's scorer convention).
    kf = KFold(n_splits=no_of_kfold)
    score = cross_val_score(SMWrapper(sm.OLS), x, y, cv=kf,
                            scoring='neg_root_mean_squared_error')
    table.loc[:, "kfold_score"] = score.mean()


def _finalize(table, model_id, sel_var):
    """Turn the index into a 'variable' column and prepend model id /
    factor-count columns (mutated in place)."""
    table.reset_index(inplace=True)
    table.rename(columns={'index': 'variable'}, inplace=True)
    table.insert(loc=0, column='model_id', value=model_id)
    table.insert(loc=1, column='no_of_factor', value=len(sel_var))


def exhaustive_search_reg(dep_data=odr_data,
                          inp_data=mev_output_data,
                          exp_sign_data=MEV_Additional_Information,
                          max_no_ind_var=2,
                          sig_lvl=.05,
                          dw_thld=[1.5, 2.5],
                          vif_thld=2,
                          adj_r_sq_thld=0.5,
                          no_of_kfold=2):
    """Exhaustively fit OLS models on the log-odds of the ODR and collect
    model statistics.

    Pass 1 fits every combination of up to ``max_no_ind_var`` independent
    variables and records all diagnostics unconditionally. Pass 2 repeats
    the search, keeping only models that clear the significance,
    expected-sign and (when ``flag_applycheck`` is set) residual-diagnostic
    gates.

    Parameters
    ----------
    dep_data : DataFrame
        Dependent data with 'date' and 'odr' columns (default: the global
        ``odr_data``, bound at definition time).
    inp_data : DataFrame
        Candidate MEV transformations keyed by 'date' (default: the global
        ``mev_output_data``).
    exp_sign_data : DataFrame
        Expected coefficient sign per MEV; must contain 'mev' and
        'exp_sign' columns.
    max_no_ind_var : int
        Maximum number of factors per model.
    sig_lvl : float
        Significance level for the p-value gates.
    dw_thld : list of two floats
        Acceptable Durbin-Watson band [low, high].
    vif_thld : numeric
        Maximum allowed VIF for non-constant terms.
    adj_r_sq_thld : float
        Minimum adjusted R-squared for a candidate model.
    no_of_kfold : int
        Number of folds for cross-validation.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(all_models_stats, candidate_models)``; the candidate frame is
        sorted by adjusted R-squared then model id, descending.

    NOTE(review): relies on module-level names ``sm``, ``sms``, ``metrics``,
    ``KFold``, ``cross_val_score``, ``SMWrapper``, ``combinations``,
    ``adfuller``, ``kpss``, ``linear_rainbow``,
    ``variance_inflation_factor`` and the global ``flag_applycheck`` —
    all defined elsewhere in this module.
    """
    reg_data = pd.merge(dep_data, inp_data, how="inner", on="date").dropna(axis=1)
    y = reg_data['odr']
    # Logit transform: model the default rate in log-odds space.
    y = np.log(y / (1 - y))
    reg_ind_set = reg_data.drop(['date', 'odr'], axis=1)

    # All variable combinations of size 1..max_no_ind_var.
    var_list1 = [combo
                 for size in range(1, max_no_ind_var + 1)
                 for combo in combinations(reg_ind_set.columns, size)]

    # ---- Pass 1: full diagnostics for every candidate model -------------
    final_reg_output_data_full = pd.DataFrame()
    id_cnt_full = 1
    for sel_var in var_list1:
        x_var = reg_ind_set.loc[:, sel_var].reset_index(drop=True)
        y = y.reset_index(drop=True)
        x = sm.add_constant(x_var, has_constant='add')
        results1 = sm.OLS(y, x).fit()
        temp1 = _coef_table(results1)
        # Residual diagnostics: normality, homoscedasticity,
        # autocorrelation, stationarity, linearity, multicollinearity.
        temp1.loc[:, "p_val_jarque_bera_test"] = _summary_stat(results1, 'Prob(JB):', 2)
        temp1.loc[:, "p_val_breusch_pagan_test"] = sms.het_breuschpagan(
            results1.resid, results1.model.exog)[1]
        temp1.loc[:, "test_stat_durbin_watson"] = _summary_stat(results1, 'Durbin-Watson:', 0)
        temp1.loc[:, "p_val_adf_test"] = adfuller(results1.resid)[1]
        temp1.loc[:, "p_val_kpss_test"] = kpss(results1.resid)[1]
        temp1.loc[:, "p_val_rainbow_test"] = linear_rainbow(results1)[1]
        temp1.loc[:, "vif"] = [variance_inflation_factor(x.values, i)
                               for i in range(len(x.columns))]
        temp1.loc[:, "adj_r_squared"] = results1.rsquared_adj
        _attach_performance(temp1, results1, x, x_var, y, no_of_kfold)
        _finalize(temp1, id_cnt_full, sel_var)
        id_cnt_full += 1
        final_reg_output_data_full = pd.concat([final_reg_output_data_full, temp1])

    # ---- Group filter: one transform per MEV base (e.g. one GDP form) ---
    # FIX: 'var_list2' column previously held var_list1 by mistake; it now
    # holds the base-MEV tuples. Behavior is unchanged because the filter
    # result is deliberately overridden just below (preserved from the
    # original "bypass" change).
    base_mevs = [tuple(name.split("_")[0] for name in sel) for sel in var_list1]
    diffr = [len(set(sel)) - len(sel) for sel in base_mevs]
    var_data = pd.DataFrame({'var_list1': var_list1,
                             'var_list2': base_mevs,
                             'diff': diffr})
    var_list3 = list(var_data.loc[var_data['diff'] == 0, 'var_list1'])
    var_list3 = var_list1.copy()  # group filter bypassed — every combo kept

    # ---- Pass 2: gated candidate-model search ---------------------------
    final_reg_output_data = pd.DataFrame()
    id_cnt = 1
    for sel_var in var_list3:
        x_var = reg_ind_set.loc[:, sel_var].reset_index(drop=True)
        x = sm.add_constant(x_var)
        results = sm.OLS(y, x).fit()
        temp = _coef_table(results)

        # Gate 1: every non-intercept coefficient must be significant.
        if not all(temp[temp.index != 'const']['p_value'] <= sig_lvl):
            continue

        # Gate 2: actual coefficient signs must match the expected signs.
        sign_chk_data = pd.DataFrame({'act_sign': np.sign(temp.iloc[1:, 0])})
        sign_chk_data.reset_index(inplace=True)
        sign_chk_data["mev"] = [text.split("_")[0]
                                for text in sign_chk_data['index']]
        # FIX: previously merged against the global
        # MEV_Additional_Information, silently ignoring the exp_sign_data
        # parameter; the default value is that same global, so existing
        # callers are unaffected.
        sign_chk_data1 = pd.merge(sign_chk_data, exp_sign_data,
                                  how="left", on='mev')
        if not all(sign_chk_data1['act_sign'] == sign_chk_data1['exp_sign']):
            continue

        if flag_applycheck:
            # Residual-diagnostic gates, applied in sequence; the per-row
            # any()/all() mirror the original (each diagnostic is a single
            # value broadcast across rows).
            temp.loc[:, "p_val_jarque_bera_test"] = _summary_stat(results, 'Prob(JB):', 2)
            if not any(temp['p_val_jarque_bera_test'].astype(float) >= sig_lvl):
                continue  # residual normality rejected
            temp.loc[:, "p_val_breusch_pagan_test"] = sms.het_breuschpagan(
                results.resid, results.model.exog)[1]
            if not any(temp['p_val_breusch_pagan_test'].astype(float) >= sig_lvl):
                continue  # homoscedasticity rejected
            temp.loc[:, "test_stat_durbin_watson"] = _summary_stat(results, 'Durbin-Watson:', 0)
            dw = temp['test_stat_durbin_watson'].astype(float)
            # FIX: the original used "|" between the two band bounds, which
            # is a tautology (every DW value passed). The band check now
            # requires dw_thld[0] <= DW <= dw_thld[1].
            if not (any(dw >= dw_thld[0]) and any(dw <= dw_thld[1])):
                continue  # autocorrelation suspected
            temp.loc[:, "p_val_adf_test"] = adfuller(results.resid)[1]
            temp.loc[:, "p_val_kpss_test"] = kpss(results.resid)[1]
            # Stationarity gate was commented out in the original;
            # the p-values are recorded but not enforced.
            temp.loc[:, "p_val_rainbow_test"] = linear_rainbow(results)[1]
            if not any(temp['p_val_rainbow_test'].astype(float) >= sig_lvl):
                continue  # linearity rejected
            temp.loc[:, "vif"] = [variance_inflation_factor(x.values, i)
                                  for i in range(len(x.columns))]
            # VIF of the constant is excluded from the threshold check.
            if not all(temp['vif'][temp.index != 'const'].astype(float) <= vif_thld):
                continue  # multicollinearity
            temp.loc[:, "adj_r_squared"] = results.rsquared_adj
            if not any(temp['adj_r_squared'].astype(float) > adj_r_sq_thld):
                continue  # fit below threshold
        else:
            # Checks disabled: record every diagnostic and keep the model.
            temp.loc[:, "p_val_jarque_bera_test"] = _summary_stat(results, 'Prob(JB):', 2)
            temp.loc[:, "p_val_breusch_pagan_test"] = sms.het_breuschpagan(
                results.resid, results.model.exog)[1]
            temp.loc[:, "test_stat_durbin_watson"] = _summary_stat(results, 'Durbin-Watson:', 0)
            temp.loc[:, "p_val_adf_test"] = adfuller(results.resid)[1]
            temp.loc[:, "p_val_kpss_test"] = kpss(results.resid)[1]
            temp.loc[:, "p_val_rainbow_test"] = linear_rainbow(results)[1]
            temp.loc[:, "vif"] = [variance_inflation_factor(x.values, i)
                                  for i in range(len(x.columns))]
            temp.loc[:, "adj_r_squared"] = results.rsquared_adj

        # Survivors (or all models when checks are off) get performance
        # metrics and are appended to the candidate frame.
        _attach_performance(temp, results, x, x_var, y, no_of_kfold)
        _finalize(temp, id_cnt, sel_var)
        final_reg_output_data = pd.concat([final_reg_output_data, temp])
        id_cnt += 1

    # Sort candidates best-first by adjusted R-squared, then model id.
    if len(final_reg_output_data) != 0:
        final_reg_output_data.sort_values(by=['adj_r_squared', 'model_id'],
                                          ascending=False, inplace=True)
    return (final_reg_output_data_full, final_reg_output_data)

You might also like