0% found this document useful (0 votes)
11 views4 pages

Data Exploration and Regression in Python With HBAT Dataset

Uploaded by

mani
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
11 views4 pages

Data Exploration and Regression in Python With HBAT Dataset

Uploaded by

mani
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 4

#importing the libraries

import pandas as pd
import numpy as np
import statsmodels.api as sm
# Load the HBAT observations and, separately, the contents of
# variablenames.csv.  The two frames must live under different names:
# everything below indexes `data` by the x6 ... x19 columns of HBAT.csv.
data = pd.read_csv(
    'C:/Files/My Courses/MBA/Amrita/Coimbatore/DARP/DARP2024-25/'
    'Data Files/HBAT.csv',
    sep=',',
)
var_names = pd.read_csv(
    'C:/Files/My Courses/MBA/Amrita/Coimbatore/DARP/DARP2024-25/'
    'Data Files/variablenames.csv',
    sep=',',
)
# Data Exploration
# data description
# Results are printed explicitly so they also show up when the file is run
# as a script; a bare expression is only echoed in an interactive session.
print(data.shape)     # dimensions of the data set (rows, columns)
print(len(data))      # length of the data (number of rows)
data.info()           # data structure; info() prints its own report
print(data.columns)   # column names
print(data.head(10))  # first 10 rows
print(data.tail(10))  # last 10 rows
# standardization
var_names
from scipy import stats

# For every variable x6 ... x19, store its z-scores in a new column named
# z_<variable> (z_x6, z_x7, ..., z_x19).
for var_no in range(6, 20):
    raw_col = f'x{var_no}'
    data[f'z_{raw_col}'] = stats.zscore(data[raw_col])

#Boxplot
import matplotlib.pyplot as plt

# One box plot per standardized variable (z_x6 ... z_x19).
# Each plot is drawn in its own figure: consecutive plt.boxplot() calls
# otherwise all target the same current Axes and are drawn over each other.
# NOTE(review): Matplotlib 3.9 renamed boxplot's `labels` argument to
# `tick_labels`; switch if the installed version warns about `labels`.
for var_no in range(6, 20):
    var = f'x{var_no}'
    plt.figure()
    plt.boxplot(data[f'z_{var}'], labels=[var])
plt.show()

# Findings recorded from the plots:
#there are outliers in x7
#there are outliers in x12
#there are outliers in x16
#there is an outlier in x18

#there are outliers in x7,x12,x16,x18

#x7
# Outlier detection on z_x7 with the 1.5 * IQR rule
z_x7_values = data['z_x7']

# 1st quartile, 3rd quartile and median
x7_q1 = np.quantile(z_x7_values, 0.25)
x7_q3 = np.quantile(z_x7_values, 0.75)
x7_med = np.median(z_x7_values)

# interquartile range
x7_iqr = x7_q3 - x7_q1

# upper and lower whisker limits, 1.5 IQRs beyond the quartiles
x7_whisker = 1.5 * x7_iqr
x7_upper_bound = x7_q3 + x7_whisker
x7_lower_bound = x7_q1 - x7_whisker
print(x7_iqr, x7_upper_bound, x7_lower_bound)

# Integer row positions (not Boolean masks) of the values lying on or
# beyond each whisker limit
x7_upper_array = np.nonzero(np.asarray(z_x7_values >= x7_upper_bound))[0]
x7_lower_array = np.nonzero(np.asarray(z_x7_values <= x7_lower_bound))[0]

print(x7_upper_array)
print(x7_lower_array)

#x12
# Outlier detection on z_x12 with the 1.5 * IQR rule
z_x12_values = data['z_x12']

# 1st quartile, 3rd quartile and median
x12_q1 = np.quantile(z_x12_values, 0.25)
x12_q3 = np.quantile(z_x12_values, 0.75)
x12_med = np.median(z_x12_values)

# interquartile range
x12_iqr = x12_q3 - x12_q1

# upper and lower whisker limits, 1.5 IQRs beyond the quartiles
x12_whisker = 1.5 * x12_iqr
x12_upper_bound = x12_q3 + x12_whisker
x12_lower_bound = x12_q1 - x12_whisker
print(x12_iqr, x12_upper_bound, x12_lower_bound)

# Integer row positions (not Boolean masks) of the values lying on or
# beyond each whisker limit
x12_upper_array = np.nonzero(np.asarray(z_x12_values >= x12_upper_bound))[0]
x12_lower_array = np.nonzero(np.asarray(z_x12_values <= x12_lower_bound))[0]

print(x12_upper_array)
print(x12_lower_array)

#x16
# Outlier detection on z_x16 with the 1.5 * IQR rule
z_x16_values = data['z_x16']

# 1st quartile, 3rd quartile and median
x16_q1 = np.quantile(z_x16_values, 0.25)
x16_q3 = np.quantile(z_x16_values, 0.75)
x16_med = np.median(z_x16_values)

# interquartile range
x16_iqr = x16_q3 - x16_q1

# upper and lower whisker limits, 1.5 IQRs beyond the quartiles
x16_whisker = 1.5 * x16_iqr
x16_upper_bound = x16_q3 + x16_whisker
x16_lower_bound = x16_q1 - x16_whisker
print(x16_iqr, x16_upper_bound, x16_lower_bound)

# Integer row positions (not Boolean masks) of the values lying on or
# beyond each whisker limit
x16_upper_array = np.nonzero(np.asarray(z_x16_values >= x16_upper_bound))[0]
x16_lower_array = np.nonzero(np.asarray(z_x16_values <= x16_lower_bound))[0]

print(x16_upper_array)
print(x16_lower_array)

#x18
# Outlier detection on z_x18 with the 1.5 * IQR rule
z_x18_values = data['z_x18']

# 1st quartile, 3rd quartile and median
x18_q1 = np.quantile(z_x18_values, 0.25)
x18_q3 = np.quantile(z_x18_values, 0.75)
x18_med = np.median(z_x18_values)

# interquartile range
x18_iqr = x18_q3 - x18_q1

# upper and lower whisker limits, 1.5 IQRs beyond the quartiles
x18_whisker = 1.5 * x18_iqr
x18_upper_bound = x18_q3 + x18_whisker
x18_lower_bound = x18_q1 - x18_whisker
print(x18_iqr, x18_upper_bound, x18_lower_bound)

# Integer row positions (not Boolean masks) of the values lying on or
# beyond each whisker limit
x18_upper_array = np.nonzero(np.asarray(z_x18_values >= x18_upper_bound))[0]
x18_lower_array = np.nonzero(np.asarray(z_x18_values <= x18_lower_bound))[0]

print(x18_upper_array)
print(x18_lower_array)
# Pool the row positions flagged for x7, x12, x16 and x18.
outliers = np.concatenate((
    x7_upper_array, x7_lower_array,
    x12_upper_array, x12_lower_array,
    x16_upper_array, x16_lower_array,
    x18_upper_array, x18_lower_array,
))
print(outliers)
#removing the duplicates (a row may be flagged for more than one variable)
unique_positions = np.unique(outliers)
out = list(unique_positions)
print(out)
#removing the outliers
# The flagged values are integer row positions, while drop(index=...) takes
# index labels.  Translate positions to labels through data.index so the
# right rows are removed even if the index is not the default 0..n-1 range.
data1 = data.drop(index=data.index[unique_positions])
# Regression analysis
# Predictors: the standardized variables z_x6 ... z_x18.
# Response: the standardized variable z_x19.
predictors = [f'z_x{var_no}' for var_no in range(6, 19)]
x = data1[predictors]
y = data1[['z_x19']]
# sm.OLS does not include an intercept on its own; add_constant adds the
# constant column to the design matrix.
x = sm.add_constant(x)
#Estimation
model01 = sm.OLS(y, x).fit()
# results
# Printed explicitly so the table also appears when run as a script.
print(model01.summary())

You might also like