"""Data Exploration and Regression in Python With HBAT Dataset."""
import pandas as pd
import numpy as np
import statsmodels.api as sm
# Load the HBAT data set and the table describing its variable names.
# NOTE(review): the original path literals were wrapped across two lines;
# the space in "Data Files" and the folder name "DARP2024-25" are
# reconstructed from that wrap -- confirm against the real directory.
data_dir = 'C:/Files/My Courses/MBA/Amrita/Coimbatore/DARP/DARP2024-25/Data Files'
data = pd.read_csv(data_dir + '/HBAT.csv', sep=',')
# Kept in its own name: the original chained assignment
# (``var_names=data = ...``) rebound ``data`` to this table as well, which
# threw away the HBAT observations loaded just above.
var_names = pd.read_csv(data_dir + '/variablenames.csv', sep=',')
# Data Exploration
# Data description.
# Results are printed explicitly: a bare expression such as ``data.shape``
# only echoes its value in an interactive console and is silently discarded
# when this file is run as a script.
print(data.shape)  # dimensions of the data set (rows, columns)
print(len(data))  # number of rows
data.info()  # data structure; info() writes its own report
print(data.columns)  # column names
print(data.head(10))  # first 10 rows
print(data.tail(10))  # last 10 rows
# Standardization
# Bare expression: echoes ``var_names`` only when run interactively.
var_names
from scipy import stats

# Add one z-score column ``z_x<k>`` for every column x6 ... x19, in that
# order, using scipy's default zscore settings.
for column_number in range(6, 20):
    source_column = 'x' + str(column_number)
    data['z_' + source_column] = stats.zscore(data[source_column])
# Boxplots of the standardized columns z_x6 ... z_x19.
import matplotlib.pyplot as plt

# Each variable gets its own figure. Calling plt.boxplot repeatedly without
# opening a new figure draws every box on the same axes at the same
# position, so the plots overlap and cannot be read individually.
#
# Observations recorded by the author from these plots:
#   - there are outliers in x7
#   - there are outliers in x12
#   - there are outliers in x16
#   - there is an outlier in x18
#
# NOTE(review): Matplotlib 3.9 renamed the ``labels`` argument to
# ``tick_labels``; ``labels`` is kept here so older versions keep working.
for column_number in range(6, 20):
    variable = 'x' + str(column_number)
    plt.figure()
    plt.boxplot(data['z_' + variable], labels=[variable])

# Needed for the figures to appear when the file is run as a script.
plt.show()
# Outlier detection on the standardized columns that the boxplots flagged
# (x7, x12, x16 and x18), followed by removal of the affected rows.


def _iqr_outlier_labels(series, q1, q3, whisker=1.5):
    """Return the row labels of values lying outside the IQR fences.

    The fences are ``q1 - whisker * IQR`` and ``q3 + whisker * IQR`` with
    ``IQR = q3 - q1``. With ``whisker=1.5`` this is the rule matplotlib's
    boxplot applies by default when it marks points as outliers.

    Parameters
    ----------
    series : pandas.Series
        Values to screen; its index supplies the returned labels.
    q1, q3 : float
        First and third quartile of ``series``.
    whisker : float, optional
        Fence width as a multiple of the IQR.

    Returns
    -------
    tuple of numpy.ndarray
        ``(upper_labels, lower_labels)``: index labels of the values strictly
        above the upper fence and strictly below the lower fence.
    """
    iqr = q3 - q1
    upper_fence = q3 + whisker * iqr
    lower_fence = q1 - whisker * iqr
    above = (series > upper_fence).to_numpy()
    below = (series < lower_fence).to_numpy()
    # Labels (not the values themselves) are returned because they are later
    # passed to DataFrame.drop(index=...).
    return series.index[above].to_numpy(), series.index[below].to_numpy()


# NOTE(review): the original script printed and concatenated the
# ``*_upper_array`` / ``*_lower_array`` names without ever defining them.
# The 1.5 * IQR rule used here is a reconstruction -- confirm it matches the
# intended outlier criterion.

# x7: first quartile, third quartile and median of z_x7
x7_q1 = np.quantile(data.z_x7, 0.25)
x7_q3 = np.quantile(data.z_x7, 0.75)
x7_med = np.median(data.z_x7)
x7_upper_array, x7_lower_array = _iqr_outlier_labels(data.z_x7, x7_q1, x7_q3)
print(x7_upper_array)
print(x7_lower_array)

# x12: first quartile, third quartile and median of z_x12
x12_q1 = np.quantile(data.z_x12, 0.25)
x12_q3 = np.quantile(data.z_x12, 0.75)
x12_med = np.median(data.z_x12)
x12_upper_array, x12_lower_array = _iqr_outlier_labels(data.z_x12, x12_q1, x12_q3)
print(x12_upper_array)
print(x12_lower_array)

# x16: first quartile, third quartile and median of z_x16
x16_q1 = np.quantile(data.z_x16, 0.25)
x16_q3 = np.quantile(data.z_x16, 0.75)
x16_med = np.median(data.z_x16)
x16_upper_array, x16_lower_array = _iqr_outlier_labels(data.z_x16, x16_q1, x16_q3)
print(x16_upper_array)
print(x16_lower_array)

# x18: first quartile, third quartile and median of z_x18
x18_q1 = np.quantile(data.z_x18, 0.25)
x18_q3 = np.quantile(data.z_x18, 0.75)
x18_med = np.median(data.z_x18)
x18_upper_array, x18_lower_array = _iqr_outlier_labels(data.z_x18, x18_q1, x18_q3)
print(x18_upper_array)
print(x18_lower_array)

# Collect the flagged row labels from all four variables.
outliers = np.concatenate((
    x7_upper_array, x7_lower_array,
    x12_upper_array, x12_lower_array,
    x16_upper_array, x16_lower_array,
    x18_upper_array, x18_lower_array,
))
print(outliers)
# Removing the duplicates: a row flagged by several variables is listed once.
out = list(np.unique(outliers))
print(out)
# Removing the outliers: data1 is data without the flagged rows.
data1 = data.drop(index=out)
# Regression analysis
# Ordinary least squares of z_x19 on z_x6 ... z_x18, fitted on data1 (the
# data with the outlier rows removed).
# The original statement was broken by a line wrap: ``x=`` stood alone and
# the 'z_x17' literal was split in two, so it did not parse.
predictor_columns = [
    'z_x6', 'z_x7', 'z_x8', 'z_x9', 'z_x10', 'z_x11', 'z_x12',
    'z_x13', 'z_x14', 'z_x15', 'z_x16', 'z_x17', 'z_x18',
]
x = pd.DataFrame(data1[predictor_columns])
y = pd.DataFrame(data1[['z_x19']])
# statsmodels' OLS does not add an intercept on its own.
x = sm.add_constant(x)
# Estimation
model01 = sm.OLS(y, x).fit()
# Results: printed so the summary table is visible when run as a script.
print(model01.summary())