0% found this document useful (0 votes)
11 views

Data Exploration and Regression in Python With HBAT Dataset

Uploaded by

mani
Copyright
© All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
11 views

Data Exploration and Regression in Python With HBAT Dataset

Uploaded by

mani
Copyright
© All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/4

#importing the libraries

import pandas as pd
import numpy as np
import statsmodels.api as sm
# Load the HBAT data set and the table of variable names.
# NOTE: the paths are absolute and machine-specific; adjust them to wherever
# the CSV files live on your system.
data = pd.read_csv(
    'C:/Files/My Courses/MBA/Amrita/Coimbatore/DARP/DARP2024-25/'
    'Data Files/HBAT.csv',
    sep=',',
)
# The variable-names table gets its own variable; it must not be assigned to
# `data` as well, otherwise the HBAT data loaded above would be overwritten.
var_names = pd.read_csv(
    'C:/Files/My Courses/MBA/Amrita/Coimbatore/DARP/DARP2024-25/'
    'Data Files/variablenames.csv',
    sep=',',
)
# Data exploration
# Basic description of the loaded data set. Results are printed explicitly so
# they are visible when the file is run as a script, not only in a REPL.
print(data.shape)     # dimensions of the data set (rows, columns)
print(len(data))      # number of rows
data.info()           # data structure; info() prints its report itself
print(data.columns)   # column names
print(data.head(10))  # first 10 rows
print(data.tail(10))  # last 10 rows
print(var_names)      # contents of the variable-names file
# standardization
# Add a z-scored copy of each of the columns x6 .. x19 to `data`; the new
# columns are named z_x6 .. z_x19.
from scipy import stats
for var_no in range(6, 20):
    raw_col = 'x' + str(var_no)
    data['z_' + raw_col] = stats.zscore(data[raw_col])

# Boxplots
import matplotlib.pyplot as plt

# Draw all standardized variables side by side in ONE figure. Calling
# plt.boxplot() once per variable without opening a new figure puts every box
# at the same x position on the same axes, so the boxes overlap.
box_vars = ['x' + str(var_no) for var_no in range(6, 20)]
plt.figure(figsize=(12, 5))
plt.boxplot([data['z_' + name].to_numpy() for name in box_vars])
# Boxes are placed at positions 1..N by default; label them via xticks, which
# works regardless of the Matplotlib version.
plt.xticks(range(1, len(box_vars) + 1), box_vars)
plt.ylabel('z-score')
plt.title('Boxplots of standardized variables x6 to x19')
plt.show()

# Reading of the boxplots recorded by the author:
# there are outliers in x7, x12, x16 (several each) and x18 (one);
# these four variables are examined in the outlier-detection steps below.

# x7 -- outlier detection on the standardized column z_x7 (1.5 * IQR rule)
# first and third quartile, and the median
x7_q1, x7_q3 = np.quantile(data['z_x7'], [0.25, 0.75])
x7_med = np.median(data['z_x7'])

# interquartile range
x7_iqr = x7_q3 - x7_q1

# upper and lower whiskers
x7_upper_bound = x7_q3 + 1.5 * x7_iqr
x7_lower_bound = x7_q1 - 1.5 * x7_iqr
print(x7_iqr, x7_upper_bound, x7_lower_bound)

# integer row positions of the values on or beyond each whisker
x7_upper_array = np.flatnonzero(data['z_x7'] >= x7_upper_bound)
x7_lower_array = np.flatnonzero(data['z_x7'] <= x7_lower_bound)

print(x7_upper_array, x7_lower_array, sep='\n')

# x12 -- outlier detection on the standardized column z_x12 (1.5 * IQR rule)
# first and third quartile, and the median
x12_q1, x12_q3 = np.quantile(data['z_x12'], [0.25, 0.75])
x12_med = np.median(data['z_x12'])

# interquartile range
x12_iqr = x12_q3 - x12_q1

# upper and lower whiskers
x12_upper_bound = x12_q3 + 1.5 * x12_iqr
x12_lower_bound = x12_q1 - 1.5 * x12_iqr
print(x12_iqr, x12_upper_bound, x12_lower_bound)

# integer row positions of the values on or beyond each whisker
x12_upper_array = np.flatnonzero(data['z_x12'] >= x12_upper_bound)
x12_lower_array = np.flatnonzero(data['z_x12'] <= x12_lower_bound)

print(x12_upper_array, x12_lower_array, sep='\n')

# x16 -- outlier detection on the standardized column z_x16 (1.5 * IQR rule)
# first and third quartile, and the median
x16_q1, x16_q3 = np.quantile(data['z_x16'], [0.25, 0.75])
x16_med = np.median(data['z_x16'])

# interquartile range
x16_iqr = x16_q3 - x16_q1

# upper and lower whiskers
x16_upper_bound = x16_q3 + 1.5 * x16_iqr
x16_lower_bound = x16_q1 - 1.5 * x16_iqr
print(x16_iqr, x16_upper_bound, x16_lower_bound)

# integer row positions of the values on or beyond each whisker
x16_upper_array = np.flatnonzero(data['z_x16'] >= x16_upper_bound)
x16_lower_array = np.flatnonzero(data['z_x16'] <= x16_lower_bound)

print(x16_upper_array, x16_lower_array, sep='\n')

# x18 -- outlier detection on the standardized column z_x18 (1.5 * IQR rule)
# first and third quartile, and the median
x18_q1, x18_q3 = np.quantile(data['z_x18'], [0.25, 0.75])
x18_med = np.median(data['z_x18'])

# interquartile range
x18_iqr = x18_q3 - x18_q1

# upper and lower whiskers
x18_upper_bound = x18_q3 + 1.5 * x18_iqr
x18_lower_bound = x18_q1 - 1.5 * x18_iqr
print(x18_iqr, x18_upper_bound, x18_lower_bound)

# integer row positions of the values on or beyond each whisker
x18_upper_array = np.flatnonzero(data['z_x18'] >= x18_upper_bound)
x18_lower_array = np.flatnonzero(data['z_x18'] <= x18_lower_bound)

print(x18_upper_array, x18_lower_array, sep='\n')
# Collect the outlier row positions found for x7, x12, x16 and x18.
outliers = np.concatenate((
    x7_upper_array, x7_lower_array,
    x12_upper_array, x12_lower_array,
    x16_upper_array, x16_lower_array,
    x18_upper_array, x18_lower_array,
))
print(outliers)
# removing the duplicates (np.unique also sorts the positions)
outlier_positions = np.unique(outliers)
out = outlier_positions.tolist()  # plain Python ints
print(out)
# removing the outliers
# The arrays above hold row POSITIONS, while DataFrame.drop(index=...) expects
# index LABELS. Map positions to labels through data.index so the right rows
# are dropped even if `data` does not have the default 0..n-1 index.
data1 = data.drop(index=data.index[outlier_positions])
# Regression analysis
# Ordinary least squares of the standardized x19 on the standardized
# x6 .. x18, fitted on the outlier-free data (data1).
predictor_cols = [
    'z_x6', 'z_x7', 'z_x8', 'z_x9', 'z_x10', 'z_x11', 'z_x12',
    'z_x13', 'z_x14', 'z_x15', 'z_x16', 'z_x17', 'z_x18',
]
x = data1[predictor_cols]
y = data1[['z_x19']]
# statsmodels does not add an intercept on its own; add the constant column.
x = sm.add_constant(x)
# Estimation
model01 = sm.OLS(y, x).fit()
# results (printed so the table is shown when run as a script)
print(model01.summary())

You might also like