0% found this document useful (0 votes)
16 views

Data Analysis Report With EDA

EDA

Uploaded by

pramatosh ray
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
16 views

Data Analysis Report With EDA

EDA

Uploaded by

pramatosh ray
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 71

Data Analysis Report with EDA

August 6, 2023

[1]: import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

[2]: import warnings


warnings.filterwarnings("ignore")

[3]: %matplotlib inline

def getDescribeOfNumericCols(df):
    """Summary statistics for df's numeric columns, one row per column.

    Values are formatted as 3-decimal strings; the 'count' statistic is
    dropped (row counts are reported elsewhere in the quality report).
    """
    formatted = df.describe().apply(lambda col: col.apply('{0:.3f}'.format))
    summary = formatted.T.reset_index()
    first_col = summary.columns[0]
    summary = summary.rename(columns={first_col: 'Columns'})
    return summary.drop('count', axis=1)

def getNumberOfUnique(df):
    """Distinct-value count per column, as a two-column DataFrame."""
    records = [[col, df[col].nunique()] for col in df.columns]
    return pd.DataFrame(records, columns=['Columns', 'No_Unique'])

def getDataTypes(df):
    """One row per column with its dtype plus df's total row count."""
    dtypes_df = pd.DataFrame(df.dtypes).reset_index()
    dtypes_df.columns = ['Columns', 'DataTypes']
    dtypes_df['Count'] = len(df)
    return dtypes_df

def getNullReport(df):
    """Null count and null percentage (2 decimals) for every column."""
    n_rows = df.shape[0]
    rows = []
    for col in df.columns:
        n_null = df[col].isnull().sum()
        rows.append([col, n_null, round(n_null * 100 / n_rows, 2)])
    return pd.DataFrame(rows, columns=['Columns', 'Null_Count', 'Null_Percent'])

def getModeValues(df):
    """Mode of each column with its absolute frequency and frequency percent.

    Returns a DataFrame with columns ['Columns', 'Mode', 'Frequency',
    'Frequency_%'], one row per column of df.

    Fix: the original read the reset_index() output through an 'index'
    column, which only exists on pandas < 2.0 (pandas >= 2.0 names the
    columns after the Series instead, raising KeyError). This version
    reads the value_counts Series directly, which works on both, and
    also tolerates an all-NaN column (no mode).
    """
    rows = []
    for var in df.columns:
        counts = df[var].value_counts()
        if counts.empty:
            # All values NaN (value_counts drops NaN): no mode to report.
            rows.append([var, np.nan, 0, 0.0])
            continue
        pct = df[var].value_counts(normalize=True) * 100
        rows.append([var, counts.index[0], counts.iloc[0], pct.iloc[0]])
    return pd.DataFrame(rows, columns=['Columns', 'Mode', 'Frequency', 'Frequency_%'])

def getDataQualityReport(df):
    """Column-level data-quality table for df.

    Left-joins the dtype, uniqueness, null, describe and mode reports on
    the shared 'Columns' key into a single DataFrame.
    """
    report = getDataTypes(df)
    for part in (getNumberOfUnique(df),
                 getNullReport(df),
                 getDescribeOfNumericCols(df),
                 getModeValues(df)):
        report = report.merge(part, on='Columns', how='left')
    return report

[178]: data = pd.read_csv("Customer-Churn-Records.csv")

[11]: data_quality = getDataQualityReport(data)


data_quality

[11]: Columns DataTypes Count No_Unique Null_Count Null_Percent \


0 RowNumber int64 10000 10000 0 0.0
1 CustomerId int64 10000 10000 0 0.0
2 Surname object 10000 2932 0 0.0
3 CreditScore int64 10000 460 0 0.0
4 Geography object 10000 3 0 0.0
5 Gender object 10000 2 0 0.0
6 Age int64 10000 70 0 0.0
7 Tenure int64 10000 11 0 0.0
8 Balance float64 10000 6382 0 0.0
9 NumOfProducts int64 10000 4 0 0.0
10 HasCrCard int64 10000 2 0 0.0

2
11 IsActiveMember int64 10000 2 0 0.0
12 EstimatedSalary float64 10000 9999 0 0.0
13 Exited int64 10000 2 0 0.0
14 Complain int64 10000 2 0 0.0
15 Satisfaction Score int64 10000 5 0 0.0
16 Card Type object 10000 4 0 0.0
17 Point Earned int64 10000 785 0 0.0

mean std min 25% 50% \


0 5000.500 2886.896 1.000 2500.750 5000.500
1 15690940.569 71936.186 15565701.000 15628528.250 15690738.000
2 NaN NaN NaN NaN NaN
3 650.529 96.653 350.000 584.000 652.000
4 NaN NaN NaN NaN NaN
5 NaN NaN NaN NaN NaN
6 38.922 10.488 18.000 32.000 37.000
7 5.013 2.892 0.000 3.000 5.000
8 76485.889 62397.405 0.000 0.000 97198.540
9 1.530 0.582 1.000 1.000 1.000
10 0.706 0.456 0.000 0.000 1.000
11 0.515 0.500 0.000 0.000 1.000
12 100090.240 57510.493 11.580 51002.110 100193.915
13 0.204 0.403 0.000 0.000 0.000
14 0.204 0.403 0.000 0.000 0.000
15 3.014 1.406 1.000 2.000 3.000
16 NaN NaN NaN NaN NaN
17 606.515 225.925 119.000 410.000 605.000

75% max Mode Frequency Frequency_%


0 7500.250 10000.000 1 1 0.01
1 15753233.750 15815690.000 15634602 1 0.01
2 NaN NaN Smith 32 0.32
3 718.000 850.000 850 233 2.33
4 NaN NaN France 5014 50.14
5 NaN NaN Male 5457 54.57
6 44.000 92.000 37 478 4.78
7 7.000 10.000 2 1048 10.48
8 127644.240 250898.090 0.0 3617 36.17
9 2.000 4.000 1 5084 50.84
10 1.000 1.000 1 7055 70.55
11 1.000 1.000 1 5151 51.51
12 149388.247 199992.480 24924.92 2 0.02
13 0.000 1.000 0 7962 79.62
14 0.000 1.000 0 7956 79.56
15 4.000 5.000 3 2042 20.42
16 NaN NaN DIAMOND 2507 25.07
17 801.000 1000.000 408 26 0.26

3
[55]: data_quality.to_csv('Data_Quality_Report.csv',index=False)

0.1 EDA
import os

# Output folders (relative to the working directory) for saved figures
# and exported Excel crosstabs; exist_ok makes re-runs idempotent.
root_dir = "."
image_path = os.path.join(root_dir, "images")
os.makedirs(image_path, exist_ok=True)
excel_path = os.path.join(root_dir, "excel")
os.makedirs(excel_path, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under image_path as <fig_id>.<ext>.

    Relies on the module-level ``image_path`` created earlier.
    """
    target = os.path.join(image_path, f"{fig_id}.{fig_extension}")
    print("Saving Figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

0.1.1 Plots Functions


def getvariableListByType(df):
    """Partition df's columns by dtype into three name lists:
    numeric, categorical (object/bool), and everything else (e.g. datetimes).
    """
    numeric_dtypes = {'float8', 'float16', 'float32', 'float64',
                      'int8', 'int16', 'int32', 'int64',
                      'uint8', 'uint16', 'uint32', 'uint64'}
    numerical_variables = []
    categorical_variables = []
    unknown_variables = []
    for col in df.columns:
        dtype_name = str(df[col].dtype)
        if dtype_name in numeric_dtypes:
            numerical_variables.append(col)
        elif dtype_name in ('object', 'bool'):
            categorical_variables.append(col)
        else:
            unknown_variables.append(col)
    return numerical_variables, categorical_variables, unknown_variables

def getCrosstab(var1, var2, df):
    """Print count, row-% and column-% crosstabs of df[var1] vs df[var2].

    NOTE: the tables include margins ('All'), so every row/column sums to
    twice the true total; multiplying by 200 instead of 100 therefore
    yields correct percentages including in the margin itself.
    """
    ct = pd.crosstab(df[var1], df[var2], margins=True)
    sep = "-------------------------------"
    print(f"Crosstab for {var1} vs {var2}")
    print(sep)
    print("Crosstab for Counts")
    print(sep)
    print(ct)
    print(sep)
    print("Crosstab for Row wise Percentage")
    print(sep)
    print(ct.apply(lambda r: r * 200 / r.sum(), axis=1))
    print(sep)
    print("Crosstab for Column wise Percentage")
    print(sep)
    print(ct.apply(lambda c: c * 200 / c.sum(), axis=0))

def getValueCounts(df, variableList, minSamples=30):
    """Print value_counts for each listed column that has fewer than
    minSamples distinct values, with a running serial number."""
    serial = 0
    divider = "----------------------"
    for col in variableList:
        if df[col].nunique() < minSamples:
            serial += 1
            print(f"| {serial} | {col} |")
            print(divider)
            print(df[col].value_counts())
            print(divider)

def createHistogramPlots(df, numerical_variables, minSamples=30):
    """Histogram (25 bins, with KDE) for each numeric column having at
    least minSamples distinct values; each figure is saved then shown."""
    eligible = [v for v in numerical_variables if df[v].nunique() >= minSamples]
    for col in eligible:
        sns.histplot(data=df, x=col, bins=25, kde=True, color="red")
        plt.title(f"Histogram of {col}")
        save_fig(f"Histogram of {col}")
        plt.show()

def createHistogramPlotsWithHue(df, numerical_variables, variabl_hue, minSamples=30):
    """Hue-split histogram (25 bins, KDE) for each numeric column with at
    least minSamples distinct values; saved via save_fig, then shown."""
    for col in numerical_variables:
        if df[col].nunique() < minSamples:
            continue
        sns.histplot(data=df, x=col, hue=variabl_hue, bins=25, kde=True, color="red")
        title = f"Histogram of {col} with Hue {variabl_hue}"
        plt.title(title)
        save_fig(title)
        plt.show()

def createKDEPlots(df, numerical_variables, minSamples=30, target_var='Survived'):
    """Overlaid density plots of each qualifying numeric column, split by a
    binary 0/1 target column.

    Parameters: df is the source frame; numerical_variables the columns to
    plot; minSamples the minimum distinct-value count for a column to
    qualify; target_var the 0/1 column used for the split (backward-
    compatible default keeps the original 'Survived').

    Fixes vs original: it plotted from the global ``data`` instead of the
    ``df`` argument, with the split column hard-coded to 'Survived'
    (copied from a Titanic notebook) — which crashes on any dataset
    without that column.
    """
    for var in numerical_variables:
        if df[var].nunique() >= minSamples:
            # shade= is deprecated in newer seaborn (use fill=); kept for
            # compatibility with the seaborn version this notebook ran on.
            sns.kdeplot(df.loc[df[target_var] == 0, var], shade=True,
                        color="r", label=f'Not {target_var}', alpha=.7)
            sns.kdeplot(df.loc[df[target_var] == 1, var], shade=True,
                        color="c", label=target_var, alpha=.6)
            plt.title(f"Density plot of {var}")
            save_fig(f"Density plot of {var}")
            plt.show()

def createBoxPlots(df, numerical_variables, minSamples=30):
    """Vertical boxplot for each numeric column that has at least
    minSamples distinct values; figures are saved and then displayed."""
    for col in numerical_variables:
        if df[col].nunique() < minSamples:
            continue
        sns.boxplot(data=df, y=col)
        plt.title(f"Boxplot of {col}")
        save_fig(f"Boxplot of {col}")
        plt.show()

def createBoxPlotsWithHue(df, numerical_variables, variabl_hue, minSamples=30):
    """Boxplot of each qualifying numeric column (y-axis), grouped on the
    x-axis by variabl_hue; only columns with >= minSamples distinct values."""
    eligible = (v for v in numerical_variables if df[v].nunique() >= minSamples)
    for col in eligible:
        sns.boxplot(data=df, y=col, x=variabl_hue)
        title = f"Boxplot of {col} with Hue {variabl_hue}"
        plt.title(title)
        save_fig(title)
        plt.show()

def createBarChart(df, varibleList, maxSamples=30, threshold=10):
    """Bar chart of value counts for every column with fewer than
    maxSamples distinct values, printed with a running serial number.

    ``threshold`` is kept for interface compatibility; the original's
    bar-labelling code that would have used it was commented out.
    """
    divider = "----------------------"
    serial = 0
    for col in varibleList:
        if df[col].nunique() >= maxSamples:
            continue
        serial += 1
        print(f"| {serial} | {col} |")
        print(divider)
        df[col].value_counts().plot(kind="bar")
        plt.title(f'Count Plot of {col}')
        plt.xticks(rotation=90)
        save_fig(f'Count Plot of {col}')
        plt.show()
        print(divider)

def createBarChartWithHue(df, varibleList, variable_hue, maxSamples=30, threshold=10):
    """Hue-split count plot for every column with fewer than maxSamples
    distinct values; the legend is placed just outside the axes.

    ``threshold`` is unused but kept for interface compatibility with
    createBarChart.
    """
    divider = "----------------------"
    serial = 0
    for col in varibleList:
        if df[col].nunique() >= maxSamples:
            continue
        serial += 1
        print(f"| {serial} | {col} |")
        print(divider)
        ax = sns.countplot(x=col, data=df, hue=variable_hue)
        ax.legend(title=variable_hue, bbox_to_anchor=(1, 1.02), loc='upper left')
        plt.title(f'Count Plot of {col} with Hue {variable_hue}')
        plt.xticks(rotation=90)
        save_fig(f'Count Plot of {col} with Hue {variable_hue}')
        plt.show()
        print(divider)

def custom_autocpt(val):
    """Pie-slice label: one-decimal percent, or empty for slices under 5%."""
    return "" if val < 5 else f'{val:.1f}%'

def createPieChart(df, varibleList, size=(8, 6), maxSamples=30):
    """Pie chart of value counts for every column with fewer than
    maxSamples distinct values; slice labels come from custom_autocpt and
    the legend sits just outside the axes."""
    divider = "----------------------"
    serial = 0
    for col in varibleList:
        if df[col].nunique() >= maxSamples:
            continue
        serial += 1
        print(f"| {serial} | {col} |")
        print(divider)
        plt.figure(figsize=size)
        df[col].value_counts().plot(kind="pie", startangle=0,
                                    autopct=custom_autocpt, ylabel='',
                                    labeldistance=None, fontsize=10)
        plt.title(f'Pie Plot of {col}')
        plt.xticks(rotation=90)
        plt.legend(bbox_to_anchor=(1, 1.02), loc='upper left')
        save_fig(f'Pie Plot of {col}')
        plt.show()
        print(divider)

7
def createViolinPlotwithHue(df, numerical_variables, cross_var, variable_hue, minSamples=30):
    """Split violin plot of each qualifying numeric column (y) against
    cross_var (x), colored by variable_hue, saved then shown.

    Fix: x/y are passed as keyword arguments — the positional form
    ``sns.violinplot(cross_var, var, ...)`` was deprecated in seaborn
    0.12 and removed in 0.13.
    """
    for var in numerical_variables:
        if df[var].nunique() >= minSamples:
            sns.violinplot(x=cross_var, y=var, hue=variable_hue, data=df, split=True)
            plt.title(f"Violin Plot of {var} & {cross_var} with Hue {variable_hue}")
            save_fig(f"Violin Plot of {var} & {cross_var} with Hue {variable_hue}")
            plt.show()

def createScatterPlt(df, var_x, var_y, var_hue=None):
    """Scatter plot of var_y (y-axis) against var_x (x-axis); points may
    optionally be colored by a var_hue column."""
    sns.scatterplot(data=df, x=var_x, y=var_y, color='b', hue=var_hue)
    plt.title(f"Scatter Plot of {var_y} vs {var_x}")
    plt.xlabel(str(var_x))
    plt.ylabel(str(var_y))
    plt.show()

def createCrosstabBarChart(var1, var2, df, figsize=(6, 4), threshold=10):
    """Stacked bar chart of the row-wise percentage crosstab of df[var1]
    (x-axis) vs df[var2] (stack segments), annotating each segment whose
    height exceeds ``threshold`` percent.

    Fixes vs original: the figsize/dpi figure was created separately and
    left empty because DataFrame.plot opened its own figure (the stray
    "<Figure ... with 0 Axes>" in the transcript); the plot now draws on
    the sized axes. Also fixes the misspelled saved-figure name ('Cgart').
    """
    crosstab_df = pd.crosstab(df[var1], df[var2]).apply(
        lambda r: r * 100 / r.sum(), axis=1)

    fig, ax = plt.subplots(figsize=figsize, dpi=300)
    crosstab_df.plot(kind="bar", stacked=True, rot=0, ax=ax)

    ax.legend(title=var2, bbox_to_anchor=(1, 1.02), loc='upper left')
    ax.set_xlabel(var1)
    ax.tick_params(axis='x', rotation=90)
    ax.set_ylabel("Count")
    ax.set_title("Stacked Bar Chart: Crosstab for Row Wise %")

    # Label each stacked segment tall enough to hold readable text.
    for container in ax.containers:
        for bar in container:
            height = bar.get_height()
            if height > threshold:
                width = bar.get_width()
                x = bar.get_x()
                y = bar.get_y()
                ax.annotate(f"{height:.2f}",
                            xy=(x + width / 2, y + height / 2),
                            xytext=(0, 3), textcoords="offset points",
                            ha="center", va="center", rotation=90)

    save_fig(f"Stacked Bar Chart of Crosstab for row wise % for {var1} vs {var2}")
    plt.show()

8
0.1.2 Excel Crosstab
from openpyxl import Workbook

def getCrosstabInExcel(var1, var2, df, cmap='YlOrRd'):
    """Write three color-graded crosstabs of df[var1] vs df[var2] —
    counts, row-wise % and column-wise % — stacked with headings in one
    sheet of an .xlsx file under the 'excel' output directory.

    Fixes vs original: writes into the lower-case 'excel' directory
    actually created earlier (the hard-coded 'Excel/' prefix breaks on
    case-sensitive filesystems), and uses ExcelWriter as a context
    manager instead of ``writer.save()``, which was removed in pandas 2.0.
    """
    base = pd.crosstab(df[var1], df[var2])
    row_pct = base.apply(lambda r: round(r * 100 / r.sum(), 2), axis=1)
    col_pct = base.apply(lambda c: round(c * 100 / c.sum(), 2), axis=0)

    ct1 = base.style.background_gradient(cmap=cmap, axis=None)
    ct2 = row_pct.style.background_gradient(cmap=cmap, axis=None)
    ct3 = col_pct.style.background_gradient(cmap=cmap, axis=None)

    out_path = os.path.join("excel", f"Crosstab for {var1} vs {var2}.xlsx")
    with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
        start_row = 0
        pd.DataFrame({"Text": [f"Crosstab for {var1} vs {var2}"]}).to_excel(
            writer, sheet_name="Sheet", index=False, header=False, startrow=start_row)
        start_row = 1
        ct1.to_excel(writer, sheet_name="Sheet", index=True, startrow=start_row)

        # Three blank rows between tables; Styler keeps the frame in .data.
        start_row = start_row + len(ct1.data) + 3
        pd.DataFrame({"Text": ["Crosstab for Row wise Percentage"]}).to_excel(
            writer, sheet_name="Sheet", index=False, header=False, startrow=start_row)
        start_row += 1
        ct2.to_excel(writer, sheet_name="Sheet", index=True, startrow=start_row)

        start_row = start_row + len(ct2.data) + 3
        pd.DataFrame({"Text": ["Crosstab for Column wise Percentage"]}).to_excel(
            writer, sheet_name="Sheet", index=False, header=False, startrow=start_row)
        start_row += 1
        ct3.to_excel(writer, sheet_name="Sheet", index=True, startrow=start_row)

0.1.3 Test on Data


[30]: numerical_variables, categorical_variabls, unknown_variables =␣
↪getvariableListByType(data)

[31]: numerical_variables

9
[31]: ['RowNumber',
'CustomerId',
'CreditScore',
'Age',
'Tenure',
'Balance',
'NumOfProducts',
'HasCrCard',
'IsActiveMember',
'EstimatedSalary',
'Exited',
'Complain',
'Satisfaction Score',
'Point Earned']

[32]: categorical_variabls

[32]: ['Surname', 'Geography', 'Gender', 'Card Type']

[33]: unknown_variables

[33]: []

[36]: getValueCounts(data,numerical_variables)

| 1 | Tenure |
----------------------
2 1048
1 1035
7 1028
8 1025
5 1012
3 1009
4 989
9 984
6 967
10 490
0 413
Name: Tenure, dtype: int64
----------------------
| 2 | NumOfProducts |
----------------------
1 5084
2 4590
3 266
4 60
Name: NumOfProducts, dtype: int64
----------------------

10
| 3 | HasCrCard |
----------------------
1 7055
0 2945
Name: HasCrCard, dtype: int64
----------------------
| 4 | IsActiveMember |
----------------------
1 5151
0 4849
Name: IsActiveMember, dtype: int64
----------------------
| 5 | Exited |
----------------------
0 7962
1 2038
Name: Exited, dtype: int64
----------------------
| 6 | Complain |
----------------------
0 7956
1 2044
Name: Complain, dtype: int64
----------------------
| 7 | Satisfaction Score |
----------------------
3 2042
2 2014
4 2008
5 2004
1 1932
Name: Satisfaction Score, dtype: int64
----------------------

[37]: getValueCounts(data,categorical_variabls)

| 1 | Geography |
----------------------
France 5014
Germany 2509
Spain 2477
Name: Geography, dtype: int64
----------------------
| 2 | Gender |
----------------------
Male 5457
Female 4543
Name: Gender, dtype: int64

11
----------------------
| 3 | Card Type |
----------------------
DIAMOND 2507
GOLD 2502
SILVER 2496
PLATINUM 2495
Name: Card Type, dtype: int64
----------------------

[34]: createHistogramPlots(data,numerical_variables)

Saving Figure Histogram of RowNumber

Saving Figure Histogram of CustomerId

12
Saving Figure Histogram of CreditScore

13
Saving Figure Histogram of Age

14
Saving Figure Histogram of Balance

15
Saving Figure Histogram of EstimatedSalary

16
Saving Figure Histogram of Point Earned

17
[35]: createBoxPlots(data,numerical_variables)

Saving Figure Boxplot of RowNumber

18
Saving Figure Boxplot of CustomerId

19
Saving Figure Boxplot of CreditScore

20
Saving Figure Boxplot of Age

21
Saving Figure Boxplot of Balance

22
Saving Figure Boxplot of EstimatedSalary

23
Saving Figure Boxplot of Point Earned

24
[38]: createBarChart(data,numerical_variables)

| 1 | Tenure |
----------------------
Saving Figure Count Plot of Tenure

25
----------------------
| 2 | NumOfProducts |
----------------------
Saving Figure Count Plot of NumOfProducts

26
----------------------
| 3 | HasCrCard |
----------------------
Saving Figure Count Plot of HasCrCard

27
----------------------
| 4 | IsActiveMember |
----------------------
Saving Figure Count Plot of IsActiveMember

28
----------------------
| 5 | Exited |
----------------------
Saving Figure Count Plot of Exited

29
----------------------
| 6 | Complain |
----------------------
Saving Figure Count Plot of Complain

30
----------------------
| 7 | Satisfaction Score |
----------------------
Saving Figure Count Plot of Satisfaction Score

31
----------------------

[39]: createBarChart(data,categorical_variabls)

| 1 | Geography |
----------------------
Saving Figure Count Plot of Geography

32
----------------------
| 2 | Gender |
----------------------
Saving Figure Count Plot of Gender

33
----------------------
| 3 | Card Type |
----------------------
Saving Figure Count Plot of Card Type

34
----------------------

[40]: createPieChart(data,numerical_variables)

| 1 | Tenure |
----------------------
Saving Figure Pie Plot of Tenure

35
----------------------
| 2 | NumOfProducts |
----------------------
Saving Figure Pie Plot of NumOfProducts

36
----------------------
| 3 | HasCrCard |
----------------------
Saving Figure Pie Plot of HasCrCard

37
----------------------
| 4 | IsActiveMember |
----------------------
Saving Figure Pie Plot of IsActiveMember

38
----------------------
| 5 | Exited |
----------------------
Saving Figure Pie Plot of Exited

39
----------------------
| 6 | Complain |
----------------------
Saving Figure Pie Plot of Complain

40
----------------------
| 7 | Satisfaction Score |
----------------------
Saving Figure Pie Plot of Satisfaction Score

41
----------------------

[41]: createPieChart(data,categorical_variabls)

| 1 | Geography |
----------------------
Saving Figure Pie Plot of Geography

42
----------------------
| 2 | Gender |
----------------------
Saving Figure Pie Plot of Gender

43
----------------------
| 3 | Card Type |
----------------------
Saving Figure Pie Plot of Card Type

44
----------------------

[42]: getCrosstab(var1='Gender',var2='Geography',df=data)

Crosstab for Gender vs Geography


-------------------------------
Crosstab for Counts
-------------------------------
Geography France Germany Spain All
Gender
Female 2261 1193 1089 4543
Male 2753 1316 1388 5457
All 5014 2509 2477 10000
-------------------------------
Crosstab for Row wise Percentage
-------------------------------
Geography France Germany Spain All
Gender
Female 49.768875 26.260180 23.970944 100.0

45
Male 50.448965 24.115815 25.435221 100.0
All 50.140000 25.090000 24.770000 100.0
-------------------------------
Crosstab for Column wise Percentage
-------------------------------
Geography France Germany Spain All
Gender
Female 45.093738 47.548824 43.964473 45.43
Male 54.906262 52.451176 56.035527 54.57
All 100.000000 100.000000 100.000000 100.00

[43]: createCrosstabBarChart(var1="Gender", var2="Geography",␣


↪df=data,figsize=(10,10),threshold=10)

Saving Figure Stacked Bar Cgart of Crosstab for row wise % for Gender vs
Geography
<Figure size 3000x3000 with 0 Axes>

[54]: createViolinPlotwithHue(data,numerical_variables,cross_var="Geography",variable_hue="Gender",minSamples=30)

Saving Figure Violin Plot of RowNumber & Geography with Hue Gender

46
Saving Figure Violin Plot of CustomerId & Geography with Hue Gender

47
Saving Figure Violin Plot of CreditScore & Geography with Hue Gender

48
Saving Figure Violin Plot of Age & Geography with Hue Gender

49
Saving Figure Violin Plot of Balance & Geography with Hue Gender

50
Saving Figure Violin Plot of EstimatedSalary & Geography with Hue Gender

51
Saving Figure Violin Plot of Point Earned & Geography with Hue Gender

52
[46]: getCrosstabInExcel(var1="Gender",var2="Geography",df=data)

0.2 Distribution Check and Testing


[47]: from scipy.stats import chi2_contingency

[111]: from scipy import stats

[98]: def testOfIndependanceChisq(data,variable1,variable2,alpha=0.05):

contingency_table = pd.crosstab(data[variable1],data[variable2])
contingency_table_2 = pd.crosstab(data[variable1],data[variable2]).
↪apply(lambda r: round(r*100/data.shape[0],2),axis=0)

print("===============================================================")
print(f"Chi Sq Test of Independance for {variable1} vs {variable2} :␣
↪Results")

print("===============================================================")
print()

53
if (contingency_table<5).sum().sum()>0:
print("Chi-Square Test can't be conducted, since one of the proportion␣
↪< 5% \n")

print("Observed Contigency Table: \n",contingency_table)

else:
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print("> Chi Square Statistics:", round(chi2,3))
print("> p-value:",round(p_value,3))
print("> Degree of freedom:",dof)
print()
print("---------------------------------------------------------------")

if p_value<=alpha:
print(f"p value - {round(p_value,3)} is less than alpha - {alpha} ")
print(f"Conclusion: The Variables {variable1} and {variable2} are␣
↪Dependent: Reject H0")

else:
print(f"p value - {round(p_value,3)} is greater than alpha -␣
↪{alpha} ")

print(f"Conclusion: The Variables {variable1} and {variable2} are␣


↪Independent: Unable t reject H0")

print("---------------------------------------------------------------")

def getChisqReport(data, categorical_variables, cat_var):
    """Run the chi-square independence test of cat_var against every
    column in categorical_variables, printing a serially numbered report."""
    for sl_no, var in enumerate(categorical_variables):
        print()
        print("============")
        print("Sl No : ", sl_no)
        testOfIndependanceChisq(data, variable1=cat_var, variable2=var, alpha=0.05)
        print()

[91]: testOfIndependanceChisq(data,variable1='Gender',variable2='Geography',alpha=0.
↪05)

===============================================================
Chi Sq Test of Independance for Gender vs Geography : Results
===============================================================

> Chi Square Statistics: 6.918


> p-value: 0.031
> Degree of freedom: 2

===============================================================
p value - 0.031 is less than alpha - 0.05

54
Conclusion: The Variables Gender and Geography are Dependednt: Reject H0
===============================================================

[107]: getChisqReport(data,categorical_variabls,cat_var="Gender")

============
Sl No : 0
===============================================================
Chi Sq Test of Independance for Gender vs Surname : Results
===============================================================

Chi-Square Test can't be conducted, since one of the proportion < 5%

Observed Contigency Table:


Surname Abazu Abbie Abbott Abdullah Abdulov Abel Abernathy Abramov \
Gender
Female 1 1 2 1 2 0 2 0
Male 1 0 2 0 0 1 0 1

Surname Abramova Abramovich … Zinachukwudi Zito Zotov Zotova Zox \


Gender …
Female 1 2 … 0 2 0 0 1
Male 1 3 … 1 4 1 1 0

Surname Zubarev Zubareva Zuev Zuyev Zuyeva


Gender
Female 3 0 0 0 1
Male 0 1 1 2 1

[2 rows x 2932 columns]

============
Sl No : 1
===============================================================
Chi Sq Test of Independance for Gender vs Geography : Results
===============================================================

> Chi Square Statistics: 6.918


> p-value: 0.031
> Degree of freedom: 2

---------------------------------------------------------------
p value - 0.031 is less than alpha - 0.05
Conclusion: The Variables Gender and Geography are Dependent: Reject H0
---------------------------------------------------------------

55
============
Sl No : 2
===============================================================
Chi Sq Test of Independance for Gender vs Gender : Results
===============================================================

Chi-Square Test can't be conducted, since one of the proportion < 5%

Observed Contigency Table:


Gender Female Male
Gender
Female 4543 0
Male 0 5457

============
Sl No : 3
===============================================================
Chi Sq Test of Independance for Gender vs Card Type : Results
===============================================================

> Chi Square Statistics: 12.02


> p-value: 0.007
> Degree of freedom: 3

---------------------------------------------------------------
p value - 0.007 is less than alpha - 0.05
Conclusion: The Variables Gender and Card Type are Dependent: Reject H0
---------------------------------------------------------------

[108]: getChisqReport(data,categorical_variabls,cat_var="Geography")

============
Sl No : 0
===============================================================
Chi Sq Test of Independance for Geography vs Surname : Results
===============================================================

Chi-Square Test can't be conducted, since one of the proportion < 5%

Observed Contigency Table:


Surname Abazu Abbie Abbott Abdullah Abdulov Abel Abernathy Abramov \
Geography
France 0 1 4 1 0 1 1 1
Germany 0 0 0 0 1 0 0 0

56
Spain 2 0 0 0 1 0 1 0

Surname Abramova Abramovich … Zinachukwudi Zito Zotov Zotova Zox \


Geography …
France 0 2 … 1 1 1 1 0
Germany 1 0 … 0 3 0 0 1
Spain 1 3 … 0 2 0 0 0

Surname Zubarev Zubareva Zuev Zuyev Zuyeva


Geography
France 3 1 1 0 0
Germany 0 0 0 1 1
Spain 0 0 0 1 1

[3 rows x 2932 columns]

============
Sl No : 1
===============================================================
Chi Sq Test of Independance for Geography vs Geography : Results
===============================================================

Chi-Square Test can't be conducted, since one of the proportion < 5%

Observed Contigency Table:


Geography France Germany Spain
Geography
France 5014 0 0
Germany 0 2509 0
Spain 0 0 2477

============
Sl No : 2
===============================================================
Chi Sq Test of Independance for Geography vs Gender : Results
===============================================================

> Chi Square Statistics: 6.918


> p-value: 0.031
> Degree of freedom: 2

---------------------------------------------------------------
p value - 0.031 is less than alpha - 0.05
Conclusion: The Variables Geography and Gender are Dependent: Reject H0
---------------------------------------------------------------

57
============
Sl No : 3
===============================================================
Chi Sq Test of Independance for Geography vs Card Type : Results
===============================================================

> Chi Square Statistics: 5.404


> p-value: 0.493
> Degree of freedom: 6

---------------------------------------------------------------
p value - 0.493 is greater than alpha - 0.05
Conclusion: The Variables Geography and Card Type are Independent: Unable t
reject H0
---------------------------------------------------------------

def test_normality(data, variable, alpha=0.05):
    """Run Shapiro-Wilk, Kolmogorov-Smirnov and Anderson-Darling normality
    tests on data[variable], printing each statistic and a conclusion.

    Fixes vs original: the Anderson-Darling "follows normal" branch
    wrongly printed "is greater than" even when the statistic was below
    the critical value; the A-D comparison value is now correctly called
    a critical value rather than "alpha".

    NOTE(review): kstest(x, 'norm') compares against a *standard* normal
    without standardizing the sample first, so for unstandardized data it
    effectively tests location/scale too — kept as-is for compatibility.
    """
    x = data[variable]

    print("=======================")
    print(f"Normality Test for {variable}")
    print("=======================")
    print()

    # ---- Shapiro-Wilk ----
    shapiro_stat, shapiro_pvalue = stats.shapiro(x)
    print("=======================")
    print("Shapiro-wilk Test: ")
    print("=======================")
    print()
    print("> Test Statistic: ", round(shapiro_stat, 3))
    print("> p-value:", round(shapiro_pvalue, 3))
    print()
    print("Conclusion:")
    print("-------------")
    if shapiro_pvalue > alpha:
        print(f"p value - {round(shapiro_pvalue, 3)} is greater than alpha - {alpha} ")
        print(f"The Variable {variable} follows the Normal Distribution")
    else:
        print(f"p value - {round(shapiro_pvalue, 3)} is less than alpha - {alpha} ")
        print(f"The Variable {variable} does not follow the Normal Distribution")
    print()

    # ---- Kolmogorov-Smirnov against a standard normal ----
    print("=======================")
    print("Kolmogorov-Smirnov Test: ")
    print("=======================")
    print()
    ks_stat, ks_pvalue = stats.kstest(x, 'norm')
    print("Test Statistic:", round(ks_stat, 3))
    print("p-value:", round(ks_pvalue, 3))
    print()
    print("Conclusion:")
    print("-------------")
    if ks_pvalue > alpha:
        print(f"p value - {round(ks_pvalue, 3)} is greater than alpha - {alpha} ")
        print(f"The Variable {variable} follows the Normal Distribution")
    else:
        print(f"p value - {round(ks_pvalue, 3)} is less than alpha - {alpha} ")
        print(f"The Variable {variable} does not follow the Normal Distribution")
    print()

    # ---- Anderson-Darling ----
    anderson_stat, anderson_critical_values, anderson_significance_levels = \
        stats.anderson(x, dist="norm")
    print()
    print("=======================")
    print("Anderson-Darling Test: ")
    print("=======================")
    print("Test Statistics:", round(anderson_stat, 3))
    print("Critical Values: ", anderson_critical_values)
    print("Significance Levels: ", anderson_significance_levels / 100)
    print()
    print("Conclusion:")
    print("-------------")
    # Index 2 corresponds to the 5% significance level in scipy's output.
    crit = anderson_critical_values[2]
    level = anderson_significance_levels[2] / 100
    if anderson_stat > crit:
        print(f"AD Statistics - {round(anderson_stat, 3)} is greater than critical value - {crit} at {level} significance level")
        print(f"The Variable {variable} does not follow the Normal Distribution")
    else:
        print(f"AD Statistics - {round(anderson_stat, 3)} is less than critical value - {crit} at {level} significance level")
        print(f"The Variable {variable} follows the Normal Distribution")
    print()

[142]: test_normality(data,variable="CreditScore",alpha=0.05)

=======================

59
Normality Test for CreditScore
=======================

=======================
Shapiro-wilk Test:
=======================

> Test Statistic: 0.994


> p-value: 0.0

Conclusion:
-------------
p value - 0.0 is less than alpha - 0.05
The Variable CreditScore does not follow the Normal Distribution

=======================
Kolmogorov-Smirnov Test:
=======================

Test Statistic: 1.0


p-value: 0.0

Conclusion:
-------------
p value - 0.0 is less than alpha - 0.05
The Variable CreditScore does not follow the Normal Distribution

=======================
Anderson-Darling Test:
=======================
Test Statistics: 5.458
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [0.15 0.1 0.05 0.025 0.01 ]

Conclusion:
-------------
AD Statistics - 5.458 is greater than alpha - 0.787 at 0.05 significance level
The Variable CreditScore does not follow the Normal Distribution

[162]: import statsmodels.graphics.gofplots as sm

# Candidate distributions compared by the goodness-of-fit routine below.
dist_names = ['beta', 'expon', 'gamma', 'lognorm', 'norm', 'pearson3',
              'triang', 'uniform', 'weibull_min', 'weibull_max']

def createQQPlotforNormal(data, var):
    """Q-Q plot of data[var] against the theoretical normal distribution.

    Fix: the original drew through the undefined name ``pylab`` (never
    imported in this notebook), raising NameError; it now uses the
    module-level matplotlib.pyplot alias ``plt``.
    """
    measurements = data[var]
    stats.probplot(measurements, dist="norm", plot=plt)
    plt.show()

[237]: createQQPlotforNormal(data,var="CreditScore")

import statsmodels.api as sm

# Q-Q plot of CreditScore against a normal distribution fitted to the
# sample (fit=True standardizes); line='45' draws the y = x reference.
# NOTE: this rebinds ``sm`` (previously statsmodels.graphics.gofplots).
sm.qqplot(data["CreditScore"],line='45',fit=True,dist=stats.norm)
plt.show()

61
# imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sc
import statsmodels.graphics.gofplots as sm

# define distributions
# Reference samples illustrating how histogram shape maps to Q-Q plot
# shape: standard normal, wider (scale=2) normal, and right/left-skewed
# draws from the skew-normal family.
sample_size = 10000
standard_norm = np.random.normal(size=sample_size)
heavy_tailed_norm = np.random.normal(loc=0, scale=2, size=sample_size)
skewed_norm = sc.skewnorm.rvs(a=5, size=sample_size)
skew_left_norm = sc.skewnorm.rvs(a=-5, size=sample_size)

# plots for standard distribution
# Each figure pairs a histogram (left) with a Q-Q plot against the
# standardized normal line (line='s', right).
fig, ax = plt.subplots(1, 2, figsize=(12, 7))
sns.histplot(standard_norm,kde=True, color ='blue',ax=ax[0])
sm.ProbPlot(standard_norm).qqplot(line='s', ax=ax[1])

# plot for right-tailed distribution
fig, ax = plt.subplots(1, 2, figsize=(12, 7))
sm.ProbPlot(skewed_norm).qqplot(line='s', ax=ax[1])
sns.histplot(skewed_norm,kde=True, color ='blue',ax=ax[0])

# plot for left-tailed distribution
fig, ax = plt.subplots(1, 2, figsize=(12, 7))
sm.ProbPlot(skew_left_norm).qqplot(line='s',color='red', ax=ax[1])
sns.histplot(skew_left_norm,kde=True, color ='red',ax=ax[0])

# plot for heavy tailed distribution
# Overlays the standard-normal histogram (red) for visual comparison.
fig, ax = plt.subplots(1, 2, figsize=(12, 7))
sm.ProbPlot(heavy_tailed_norm).qqplot(line='s',color ='green', ax=ax[1])
sns.histplot(heavy_tailed_norm,kde=True, color ='green',ax=ax[0])
sns.histplot(standard_norm,kde=True, color ='red',ax=ax[0])

[150]: <AxesSubplot:ylabel='Count'>

63
64
[218]: #=====================================
# Fitting for distribution
#=====================================
import scipy
from sklearn.preprocessing import StandardScaler
import scipy.stats
import warnings
warnings.filterwarnings("ignore")

def fitDistributions(data, var, dist_names, nbins=50, alpha=0.05):
    """
    Fit each candidate scipy.stats distribution to ``data[var]`` and rank fits.

    The column is standardized (zero mean, unit variance) first; then for every
    name in ``dist_names`` the distribution is fitted by MLE, a Kolmogorov-
    Smirnov p-value is computed, and a chi-square statistic is accumulated over
    ``nbins`` equal-probability (percentile) bins.

    :param data: pandas DataFrame holding the column to analyse
    :param var: name of the numeric column to fit
    :param dist_names: iterable of scipy.stats distribution names
    :param nbins: number of percentile bins for the chi-square statistic
    :param alpha: significance level for the KS-based conclusion
    :return: DataFrame with columns (Distribution, chi_square, p_value, alpha,
             conclusion), sorted with the best chi-square fit first
    """
    y = np.asarray(data[var], dtype=float)
    size = len(y)

    # Standardize in-line (mean 0, population std, ddof=0). This is exactly
    # what sklearn's StandardScaler did in the original, without the extra
    # dependency.
    y_std = (y - y.mean()) / y.std()

    # Set up empty lists to store results.
    chi_square = []
    p_values = []

    # nbins equal-probability bins need nbins + 1 percentile cutoffs.
    # (Bug fix: np.linspace(0, 100, nbins) produced only nbins - 1 bins.)
    percentile_bins = np.linspace(0, 100, nbins + 1)
    percentile_cutoffs = np.percentile(y_std, percentile_bins)
    # Observed data is approximately evenly distributed across all bins.
    observed_frequency, _ = np.histogram(y_std, bins=percentile_cutoffs)
    cum_observed_frequency = np.cumsum(observed_frequency)

    # Loop through candidate distributions.
    for distribution in dist_names:
        # Fit the candidate distribution (MLE) to the standardized sample.
        dist = getattr(scipy.stats, distribution)
        param = dist.fit(y_std)

        # KS test p-value against the fitted distribution, rounded to 5 dp.
        p = scipy.stats.kstest(y_std, distribution, args=param)[1]
        p_values.append(np.around(p, 5))

        # Expected counts per percentile bin from the fitted CDF.
        cdf_fitted = dist.cdf(percentile_cutoffs, *param[:-2],
                              loc=param[-2], scale=param[-1])
        expected_frequency = np.diff(cdf_fitted) * size

        # Chi-square computed on cumulative frequencies (as in the original
        # approach): penalizes systematic departures across the whole range.
        cum_expected_frequency = np.cumsum(expected_frequency)
        ss = np.sum(((cum_expected_frequency - cum_observed_frequency) ** 2)
                    / cum_observed_frequency)
        chi_square.append(ss)

    # Collate results and sort by goodness of fit (best at top).
    results = pd.DataFrame({'Distribution': list(dist_names),
                            'chi_square': chi_square,
                            'p_value': p_values})
    results['alpha'] = alpha
    results['conclusion'] = results['p_value'].apply(
        lambda x: "Does Not Follow" if x < alpha else "Follows")
    results.sort_values(['chi_square'], inplace=True)
    return results.reset_index(drop=True)

66
[235]: fitDistributions(data, var="CreditScore", dist_names=dist_names, nbins=50, alpha=0.05)

[235]: Distribution chi_square p_value alpha conclusion


0 weibull_min 18.440621 0.01984 0.05 Does Not Follow
1 weibull_max 27.721445 0.03535 0.05 Does Not Follow
2 beta 50.066628 0.00261 0.05 Does Not Follow
3 pearson3 78.997425 0.00735 0.05 Does Not Follow
4 norm 85.628738 0.00097 0.05 Does Not Follow
5 lognorm 102.801133 0.00057 0.05 Does Not Follow
6 gamma 108.952496 0.00028 0.05 Does Not Follow
7 triang 5551.148172 0.00000 0.05 Does Not Follow
8 uniform 84810.244809 0.00000 0.05 Does Not Follow
9 expon 165161.908490 0.00000 0.05 Does Not Follow

import numpy as np

from scipy.stats import ks_2samp


def are_samples_from_same_distribution(sample1, sample2, alpha=0.05):
    """
    Two-sample Kolmogorov-Smirnov test for a common underlying distribution.

    Parameters:
        sample1 (array-like): First sample data.
        sample2 (array-like): Second sample data.
        alpha (float): Significance level for the test (default 0.05).

    Returns:
        (bool, float): True when the test cannot reject a common distribution
        (p-value > alpha), together with the p-value itself.
    """
    _, p_value = ks_2samp(sample1, sample2)
    return p_value > alpha, p_value


# Example usage:
sample1 = np.random.normal(loc=0, scale=1, size=1000)
sample2 = np.random.normal(loc=0, scale=1, size=1000)

result, p_value = are_samples_from_same_distribution(sample1, sample2)

message = ("The two samples come from the same distribution."
           if result
           else "The two samples do not come from the same distribution.")
print(message)
print(f"P-value: {p_value}")

The two samples come from the same distribution.


P-value: 0.9357699014782725

[287]: def␣
↪test_continuous_categorical_ANOVA(data,continuous_variable,categorical_variable,alpha=0.

↪05):

groups = [group for _,group in data.groupby(categorical_variable)]


statistics, p_value = stats.f_oneway(*[group[continuous_variable] for group␣
↪in groups])

print("---------------------------------------------------------------")
print(f"One-Way ANOVA Test of between {continuous_variable} vs␣
↪{categorical_variable} : Results")

print("---------------------------------------------------------------")
# print(groups)
print("> F Statistics:", round(statistics,3))
print("> p-value:",round(p_value,3))
print()

if p_value<=alpha:
print(f"Conclusion: The Variables {continuous_variable} and␣
↪{categorical_variable} are Dependent: Reject H0")

else:
print(f"Conclusion: The Variables {continuous_variable} and␣
↪{categorical_variable} are Independent: Unable to reject H0")

[288]: test_continuous_categorical_ANOVA(data, continuous_variable='Balance', categorical_variable='Gender', alpha=0.05)

---------------------------------------------------------------
One-Way ANOVA Test of between Balance vs Gender : Results
---------------------------------------------------------------
> F Statistics: 1.461
> p-value: 0.227

Conclusion: The Variables Balance and Gender are Independent: Unable to reject
H0

[289]: def test_continuous_relationship(data,variable1,variable2,method='pearson'):


x=data[variable1]
y=data[variable2]

if method=='pearson':
correlation_coef,p_value = stats.pearsonr(x,y)
elif method=='spearman':
correlation_coef,p_value = stats.spearmanr(x,y)

68
else:
raise ValueError("invalid correlation method specified. Choose␣
↪'pearson' or 'spearman'")

print("correlation Coefficient: ", round(correlation_coef,3))


print("p-value:", round(p_value,3))

[297]: test_continuous_relationship(data, variable1="CreditScore", variable2="Point Earned", method='pearson')

correlation Coefficient: 0.0


p-value: 0.994

0.3 Outlier Detection


def find_iqr(x):
    """Interquartile range (Q3 - Q1) of *x*, ignoring NaN values."""
    q3, q1 = np.nanquantile(x, [0.75, 0.25])
    return q3 - q1

[335]: # def outlier_cap_upper(x,upper):


# if x>upper:
# return upper
# else:
# return x

def is_outlier_present(max_value, min_value, upper, lower):
    """Classify a column as containing outliers when its observed range
    strictly exceeds the given IQR fences (upper/lower)."""
    outside_fences = max_value > upper or min_value < lower
    return 'Outlier Present' if outside_fences else 'No Outlier'

def getOutlierInformation(data, numerical_variables, minSamples=30):
    """Tukey-fence outlier summary for the requested numeric columns.

    Columns with at most ``minSamples`` distinct values are skipped (treated
    as discrete / categorical-like). Fences are Q1 - 1.5*IQR and
    Q3 + 1.5*IQR, rounded to 3 decimals.
    """
    rows = []
    for var in numerical_variables:
        # Guard clause: skip low-cardinality columns.
        if data[var].nunique() <= minSamples:
            continue

        first_quantile, third_quantile = np.nanquantile(data[var], [0.25, 0.75])
        iqr = third_quantile - first_quantile
        upper = round(third_quantile + 1.5 * iqr, 3)
        lower = round(first_quantile - 1.5 * iqr, 3)

        max_value = data[var].max()
        min_value = data[var].min()

        # Any observation strictly beyond a fence flags the column.
        conclusion = ('Outlier Present'
                      if max_value > upper or min_value < lower
                      else 'No Outlier')
        rows.append([var, upper, lower, max_value, min_value, conclusion])

    return pd.DataFrame(rows, columns=['Variable', 'Upper Limit', 'Lower Limit',
                                       'Max Value', 'Min Value', 'Conclusion'])

[338]: getOutlierInformation(data,numerical_variables,minSamples=30)

[338]: Variable Upper Limit Lower Limit Max Value Min Value \
0 RowNumber 1.499950e+04 -4.998500e+03 10000.00 1.00
1 CustomerId 1.594029e+07 1.544147e+07 15815690.00 15565701.00
2 CreditScore 9.190000e+02 3.830000e+02 850.00 350.00
3 Age 6.200000e+01 1.400000e+01 92.00 18.00
4 Balance 3.191106e+05 -1.914664e+05 250898.09 0.00
5 EstimatedSalary 2.969675e+05 -9.657710e+04 199992.48 11.58
6 Point Earned 1.387500e+03 -1.765000e+02 1000.00 119.00

Conclusion
0 No Outlier
1 No Outlier
2 Outlier Present
3 Outlier Present
4 No Outlier
5 No Outlier
6 No Outlier

0.4 Logging Method


[56]: import os
import sys
import datetime
import logging

'''
This file configures logging method
'''

# All log files are collected under ./logs (created on first run if missing).
root_dir = "."
log_file_path = os.path.join(root_dir, 'logs')
os.makedirs(log_file_path, exist_ok=True)

# One log file per run, suffixed with the start timestamp to avoid clobbering.
timeStamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
date_of_script_run = datetime.datetime.today().strftime("%d/%m/%y")
log_file_name = log_file_path + '/' + f'log_customer_segmentation_{timeStamp}'

def genLogger(log_file_name, gen_console_log=False, gen_file_log=True):
    """
    Configure and return the root logger.

    :param log_file_name: log file path WITHOUT the ".log" extension
    :param gen_console_log: attach a stdout handler when True
    :param gen_file_log: attach a file handler (append mode) when True
    :return: the configured root logger
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    # BUG FIX: the original format string was "(asctime)s ..." — missing the
    # leading '%', so every record started with the literal text "(asctime)s"
    # instead of a timestamp.
    log_formatter = logging.Formatter("%(asctime)s [%(levelname)-5.5s] %(message)s")

    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(log_formatter)

    file_handler = logging.FileHandler(f"{log_file_name}.log", mode="a")
    file_handler.setFormatter(log_formatter)

    # Attach handlers only once; repeated calls would otherwise duplicate output.
    if not logger.hasHandlers():
        if gen_console_log:
            logger.addHandler(console_handler)
        if gen_file_log:
            logger.addHandler(file_handler)
    return logger

def closeLogger(logger):
    """
    Release all of *logger*'s handlers: close each one, then detach it.

    :param logger: logger whose handlers should be closed and removed
    :return: None
    """
    attached = list(logger.handlers)
    for handler in attached:
        handler.close()
        logger.removeHandler(handler)

# logger = genLogger(log_file_name, gen_console_log=True, gen_file_log=True)


# closeLogger(logger)

71

You might also like