Data Analysis Report With EDA
Data Analysis Report With EDA
August 6, 2023
[1]: import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
output['Count']=df.shape[0]
return output
1
output.append([var,df[var].isnull().sum(),round(df[var].isnull().
↪ sum()*100/df.shape[0],2)])
return pd.DataFrame(output,columns=['Columns','Null_Count','Null_Percent'])
temp_2=pd.DataFrame(df[var].value_counts().head(1).reset_index())
temp['Columns']=var
temp['Mode']=temp['index']
temp['Frequency']=temp_2[var]
temp['Frequency_%']=temp[var]
temp=temp.drop(['index',var],axis=1)
output=pd.concat([output,temp],axis=0)
output=output.reset_index()
output=output.drop('index',axis=1)
return output
data_quality = data_quality.merge(getNullReport(df),on='Columns',how='left')
data_quality = data_quality.
↪merge(getDescribeOfNumericCols(df),on='Columns',how='left')
data_quality = data_quality.merge(getModeValues(df),on='Columns',how='left')
return data_quality
2
11 IsActiveMember int64 10000 2 0 0.0
12 EstimatedSalary float64 10000 9999 0 0.0
13 Exited int64 10000 2 0 0.0
14 Complain int64 10000 2 0 0.0
15 Satisfaction Score int64 10000 5 0 0.0
16 Card Type object 10000 4 0 0.0
17 Point Earned int64 10000 785 0 0.0
3
[55]: data_quality.to_csv('Data_Quality_Report.csv',index=False)
0.1 EDA
[12]: import os
# Base directory that the output folders are created under.
root_dir = "."

# "images" sub-folder of root_dir; created up front, and exist_ok keeps
# re-running this cell from raising when the folder is already there.
image_path = os.path.join(root_dir, "images")
os.makedirs(name=image_path, exist_ok=True)
[13]: import os
# Base directory that the output folders are created under.
root_dir = "."

# "excel" sub-folder of root_dir; created up front, and exist_ok keeps
# re-running this cell from raising when the folder is already there.
excel_path = os.path.join(root_dir, "excel")
os.makedirs(name=excel_path, exist_ok=True)
4
print("Crosstab for Row wise Percentage")
print("-------------------------------")
print(pd.crosstab(df[var1],df[var2],margins=True).apply(lambda r: r*200/r.
↪sum(),axis=1))
print("-------------------------------")
print("Crosstab for Column wise Percentage")
print("-------------------------------")
print(pd.crosstab(df[var1],df[var2],margins=True).apply(lambda r: r*200/r.
↪sum(),axis=0))
[19]: def␣
↪createHistogramPlotsWithHue(df,numerical_variables,variabl_hue,minSamples=30):
sns.kdeplot(data.loc[data['Survived']==1,var], shade=True,␣
↪color="c", label='Survived', alpha=.6)
5
plt.title(f"Density plot of {var}")
save_fig(f"Density plot of {var}")
plt.show()
# textcoords="offset points",␣
ha='center',va='center', rotation=90)
↪
6
save_fig(f'Count Plot of {var}')
plt.show()
print("----------------------")
[24]: def␣
↪createBarChartWithHue(df,varibleList,variable_hue,maxSamples=30,threshold=10):
sl_no=0
for var in varibleList:
if df[var].nunique()<maxSamples:
sl_no+=1
print(f"| {sl_no} | {var} |")
print("----------------------")
ax = sns.countplot(x=var, data=df,hue=variable_hue)
ax.legend(title=variable_hue, bbox_to_anchor=(1,1.02),loc='upper␣
↪left')
df[var].value_counts().plot(kind="pie",startangle=0,
autopct=custom_autocpt,ylabel='',
labeldistance=None,fontsize=10)
7
[27]: def␣
↪createViolinPlotwithHue(df,numerical_variables,cross_var,variable_hue,minSamples=30):
plt.show()
textcoords="offset points",␣
↪ha="center",va="center",rotation=90)
save_fig(f"Stacked Bar Cgart of Crosstab for row wise % for {var1} vs␣
↪{var2}")
plt.show()
8
0.1.2 Excel Crosstab
[45]: from openpyxl import Workbook
def getCrosstabInExcel(var1,var2,df,cmap='YlOrRd'):
ct1 = pd.crosstab(df[var1],df[var2]).style.
↪background_gradient(cmap=cmap,axis=None)
start_row=1
ct1.to_excel(writer,sheet_name="Sheet",index=True,startrow=start_row)
start_row= start_row+len(ct1.data)+3
text2 = pd.DataFrame({"Text":["Crosstab for Row wise Percentage"]})
text2.
↪to_excel(writer,sheet_name="Sheet",index=False,header=False,startrow=start_row)
start_row=start_row+1
ct2.to_excel(writer,sheet_name="Sheet",index=True,startrow=start_row)
start_row= start_row+len(ct2.data)+3
text3 = pd.DataFrame({"Text":["Crosstab for Column wise Percentage"]})
text3.
↪to_excel(writer,sheet_name="Sheet",index=False,header=False,startrow=start_row)
start_row=start_row+1
ct3.to_excel(writer,sheet_name="Sheet",index=True,startrow=start_row)
writer.save()
[31]: numerical_variables
9
[31]: ['RowNumber',
'CustomerId',
'CreditScore',
'Age',
'Tenure',
'Balance',
'NumOfProducts',
'HasCrCard',
'IsActiveMember',
'EstimatedSalary',
'Exited',
'Complain',
'Satisfaction Score',
'Point Earned']
[32]: categorical_variabls
[33]: unknown_variables
[33]: []
[36]: getValueCounts(data,numerical_variables)
| 1 | Tenure |
----------------------
2 1048
1 1035
7 1028
8 1025
5 1012
3 1009
4 989
9 984
6 967
10 490
0 413
Name: Tenure, dtype: int64
----------------------
| 2 | NumOfProducts |
----------------------
1 5084
2 4590
3 266
4 60
Name: NumOfProducts, dtype: int64
----------------------
10
| 3 | HasCrCard |
----------------------
1 7055
0 2945
Name: HasCrCard, dtype: int64
----------------------
| 4 | IsActiveMember |
----------------------
1 5151
0 4849
Name: IsActiveMember, dtype: int64
----------------------
| 5 | Exited |
----------------------
0 7962
1 2038
Name: Exited, dtype: int64
----------------------
| 6 | Complain |
----------------------
0 7956
1 2044
Name: Complain, dtype: int64
----------------------
| 7 | Satisfaction Score |
----------------------
3 2042
2 2014
4 2008
5 2004
1 1932
Name: Satisfaction Score, dtype: int64
----------------------
[37]: getValueCounts(data,categorical_variabls)
| 1 | Geography |
----------------------
France 5014
Germany 2509
Spain 2477
Name: Geography, dtype: int64
----------------------
| 2 | Gender |
----------------------
Male 5457
Female 4543
Name: Gender, dtype: int64
11
----------------------
| 3 | Card Type |
----------------------
DIAMOND 2507
GOLD 2502
SILVER 2496
PLATINUM 2495
Name: Card Type, dtype: int64
----------------------
[34]: createHistogramPlots(data,numerical_variables)
12
Saving Figure Histogram of CreditScore
13
Saving Figure Histogram of Age
14
Saving Figure Histogram of Balance
15
Saving Figure Histogram of EstimatedSalary
16
Saving Figure Histogram of Point Earned
17
[35]: createBoxPlots(data,numerical_variables)
18
Saving Figure Boxplot of CustomerId
19
Saving Figure Boxplot of CreditScore
20
Saving Figure Boxplot of Age
21
Saving Figure Boxplot of Balance
22
Saving Figure Boxplot of EstimatedSalary
23
Saving Figure Boxplot of Point Earned
24
[38]: createBarChart(data,numerical_variables)
| 1 | Tenure |
----------------------
Saving Figure Count Plot of Tenure
25
----------------------
| 2 | NumOfProducts |
----------------------
Saving Figure Count Plot of NumOfProducts
26
----------------------
| 3 | HasCrCard |
----------------------
Saving Figure Count Plot of HasCrCard
27
----------------------
| 4 | IsActiveMember |
----------------------
Saving Figure Count Plot of IsActiveMember
28
----------------------
| 5 | Exited |
----------------------
Saving Figure Count Plot of Exited
29
----------------------
| 6 | Complain |
----------------------
Saving Figure Count Plot of Complain
30
----------------------
| 7 | Satisfaction Score |
----------------------
Saving Figure Count Plot of Satisfaction Score
31
----------------------
[39]: createBarChart(data,categorical_variabls)
| 1 | Geography |
----------------------
Saving Figure Count Plot of Geography
32
----------------------
| 2 | Gender |
----------------------
Saving Figure Count Plot of Gender
33
----------------------
| 3 | Card Type |
----------------------
Saving Figure Count Plot of Card Type
34
----------------------
[40]: createPieChart(data,numerical_variables)
| 1 | Tenure |
----------------------
Saving Figure Pie Plot of Tenure
35
----------------------
| 2 | NumOfProducts |
----------------------
Saving Figure Pie Plot of NumOfProducts
36
----------------------
| 3 | HasCrCard |
----------------------
Saving Figure Pie Plot of HasCrCard
37
----------------------
| 4 | IsActiveMember |
----------------------
Saving Figure Pie Plot of IsActiveMember
38
----------------------
| 5 | Exited |
----------------------
Saving Figure Pie Plot of Exited
39
----------------------
| 6 | Complain |
----------------------
Saving Figure Pie Plot of Complain
40
----------------------
| 7 | Satisfaction Score |
----------------------
Saving Figure Pie Plot of Satisfaction Score
41
----------------------
[41]: createPieChart(data,categorical_variabls)
| 1 | Geography |
----------------------
Saving Figure Pie Plot of Geography
42
----------------------
| 2 | Gender |
----------------------
Saving Figure Pie Plot of Gender
43
----------------------
| 3 | Card Type |
----------------------
Saving Figure Pie Plot of Card Type
44
----------------------
[42]: getCrosstab(var1='Gender',var2='Geography',df=data)
45
Male 50.448965 24.115815 25.435221 100.0
All 50.140000 25.090000 24.770000 100.0
-------------------------------
Crosstab for Column wise Percentage
-------------------------------
Geography France Germany Spain All
Gender
Female 45.093738 47.548824 43.964473 45.43
Male 54.906262 52.451176 56.035527 54.57
All 100.000000 100.000000 100.000000 100.00
Saving Figure Stacked Bar Cgart of Crosstab for row wise % for Gender vs
Geography
<Figure size 3000x3000 with 0 Axes>
[54]: createViolinPlotwithHue(data,numerical_variables,cross_var="Geography",variable_hue="Gender",m
Saving Figure Violin Plot of RowNumber & Geography with Hue Gender
46
Saving Figure Violin Plot of CustomerId & Geography with Hue Gender
47
Saving Figure Violin Plot of CreditScore & Geography with Hue Gender
48
Saving Figure Violin Plot of Age & Geography with Hue Gender
49
Saving Figure Violin Plot of Balance & Geography with Hue Gender
50
Saving Figure Violin Plot of EstimatedSalary & Geography with Hue Gender
51
Saving Figure Violin Plot of Point Earned & Geography with Hue Gender
52
[46]: getCrosstabInExcel(var1="Gender",var2="Geography",df=data)
contingency_table = pd.crosstab(data[variable1],data[variable2])
contingency_table_2 = pd.crosstab(data[variable1],data[variable2]).
↪apply(lambda r: round(r*100/data.shape[0],2),axis=0)
print("===============================================================")
print(f"Chi Sq Test of Independance for {variable1} vs {variable2} :␣
↪Results")
print("===============================================================")
print()
53
if (contingency_table<5).sum().sum()>0:
print("Chi-Square Test can't be conducted, since one of the proportion␣
↪< 5% \n")
else:
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print("> Chi Square Statistics:", round(chi2,3))
print("> p-value:",round(p_value,3))
print("> Degree of freedom:",dof)
print()
print("---------------------------------------------------------------")
if p_value<=alpha:
print(f"p value - {round(p_value,3)} is less than alpha - {alpha} ")
print(f"Conclusion: The Variables {variable1} and {variable2} are␣
↪Dependent: Reject H0")
else:
print(f"p value - {round(p_value,3)} is greater than alpha -␣
↪{alpha} ")
print("---------------------------------------------------------------")
[91]: testOfIndependanceChisq(data,variable1='Gender',variable2='Geography',alpha=0.
↪05)
===============================================================
Chi Sq Test of Independance for Gender vs Geography : Results
===============================================================
===============================================================
p value - 0.031 is less than alpha - 0.05
54
Conclusion: The Variables Gender and Geography are Dependent: Reject H0
===============================================================
[107]: getChisqReport(data,categorical_variabls,cat_var="Gender")
============
Sl No : 0
===============================================================
Chi Sq Test of Independance for Gender vs Surname : Results
===============================================================
============
Sl No : 1
===============================================================
Chi Sq Test of Independance for Gender vs Geography : Results
===============================================================
---------------------------------------------------------------
p value - 0.031 is less than alpha - 0.05
Conclusion: The Variables Gender and Geography are Dependent: Reject H0
---------------------------------------------------------------
55
============
Sl No : 2
===============================================================
Chi Sq Test of Independance for Gender vs Gender : Results
===============================================================
============
Sl No : 3
===============================================================
Chi Sq Test of Independance for Gender vs Card Type : Results
===============================================================
---------------------------------------------------------------
p value - 0.007 is less than alpha - 0.05
Conclusion: The Variables Gender and Card Type are Dependent: Reject H0
---------------------------------------------------------------
[108]: getChisqReport(data,categorical_variabls,cat_var="Geography")
============
Sl No : 0
===============================================================
Chi Sq Test of Independance for Geography vs Surname : Results
===============================================================
56
Spain 2 0 0 0 1 0 1 0
============
Sl No : 1
===============================================================
Chi Sq Test of Independance for Geography vs Geography : Results
===============================================================
============
Sl No : 2
===============================================================
Chi Sq Test of Independance for Geography vs Gender : Results
===============================================================
---------------------------------------------------------------
p value - 0.031 is less than alpha - 0.05
Conclusion: The Variables Geography and Gender are Dependent: Reject H0
---------------------------------------------------------------
57
============
Sl No : 3
===============================================================
Chi Sq Test of Independance for Geography vs Card Type : Results
===============================================================
---------------------------------------------------------------
p value - 0.493 is greater than alpha - 0.05
Conclusion: The Variables Geography and Card Type are Independent: Unable to
reject H0
---------------------------------------------------------------
shapiro_stat,shapiro_pvalue = stats.shapiro(x)
print("=======================")
print(f"Normality Test for {variable}")
print("=======================")
print()
print("=======================")
print("Shapiro-wilk Test: ")
print("=======================")
print()
print("> Test Statistic: ",round(shapiro_stat,3))
print("> p-value:", round(shapiro_pvalue,3))
print()
print("Conclusion:")
print("-------------")
if shapiro_pvalue>alpha:
print(f"p value - {round(shapiro_pvalue,3)} is greater than alpha -␣
↪{alpha} ")
print()
print("=======================")
58
print("Kolmogorov-Smirnov Test: ")
print("=======================")
print()
ks_stat, ks_pvalue = stats.kstest(x,'norm')
print("Test Statistic:", round(ks_stat,3))
print("p-value:",round(ks_pvalue,3))
print()
print("Conclusion:")
print("-------------")
if ks_pvalue>alpha:
print(f"p value - {round(ks_pvalue,3)} is greater than alpha - {alpha}␣
↪")
print()
print()
print("=======================")
print("Anderson-Darling Test: ")
print("=======================")
print("Test Statistics:", round(anderson_stat,3))
print("Critical Values: ", anderson_critical_values)
print("Significance Levels: ", anderson_significance_levels/100)
print()
print("Conclusion:")
print("-------------")
# Compare the Anderson-Darling statistic with the critical value stored at
# index 2 (the matching significance level is reported alongside it).
# NOTE(review): the messages call the critical value "alpha"; the wording is
# kept so the output stays comparable with earlier runs.
if anderson_stat>anderson_critical_values[2]:
    print(f"AD Statistics - {round(anderson_stat,3)} is greater than alpha - {anderson_critical_values[2]} at {anderson_significance_levels[2]/100} significance level")
else:
    # The statistic did NOT exceed the critical value here, so the message
    # must not repeat "is greater than" from the branch above.
    print(f"AD Statistics - {round(anderson_stat,3)} is not greater than alpha - {anderson_critical_values[2]} at {anderson_significance_levels[2]/100} significance level")
[142]: test_normality(data,variable="CreditScore",alpha=0.05)
=======================
59
Normality Test for CreditScore
=======================
=======================
Shapiro-wilk Test:
=======================
Conclusion:
-------------
p value - 0.0 is less than alpha - 0.05
The Variable CreditScore does not follow the Normal Distribution
=======================
Kolmogorov-Smirnov Test:
=======================
Conclusion:
-------------
p value - 0.0 is less than alpha - 0.05
The Variable CreditScore does not follow the Normal Distribution
=======================
Anderson-Darling Test:
=======================
Test Statistics: 5.458
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [0.15 0.1 0.05 0.025 0.01 ]
Conclusion:
-------------
AD Statistics - 5.458 is greater than alpha - 0.787 at 0.05 significance level
The Variable CreditScore does not follow the Normal Distribution
60
'norm',
'pearson3',
'triang',
'uniform',
'weibull_min',
'weibull_max']
[237]: createQQPlotforNormal(data,var="CreditScore")
61
[150]: # imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sc
import statsmodels.graphics.gofplots as sm
# define distributions
# Number of random draws generated for each synthetic sample below.
sample_size = 10000
# Normal sample using NumPy's default location and scale.
standard_norm = np.random.normal(size=sample_size)
# NOTE(review): despite the name, this is still a Gaussian sample (loc=0,
# scale=2), i.e. wider but not heavy-tailed - confirm the intent.
heavy_tailed_norm = np.random.normal(loc=0, scale=2, size=sample_size)
# Skew-normal sample with shape parameter a=5 (`sc` is scipy.stats here).
skewed_norm = sc.skewnorm.rvs(a=5, size=sample_size)
# Skew-normal sample with the opposite-sign shape parameter a=-5.
skew_left_norm = sc.skewnorm.rvs(a=-5, size=sample_size)
62
# One figure with two side-by-side axes: ax[0] for the histogram, ax[1] for
# the Q-Q plot.
fig, ax = plt.subplots(1, 2, figsize=(12, 7))
# Q-Q plot of the skewed sample on the right-hand axis (`sm` is
# statsmodels.graphics.gofplots here); line='s' adds a reference line.
sm.ProbPlot(skewed_norm).qqplot(line='s', ax=ax[1])
# Histogram with a KDE overlay of the same sample on the left-hand axis.
# Kept as the last expression so the notebook echoes the returned Axes.
sns.histplot(skewed_norm,kde=True, color ='blue',ax=ax[0])
[150]: <AxesSubplot:ylabel='Count'>
63
64
[218]: #=====================================
# Fitting for distribution
#=====================================
import scipy
from sklearn.preprocessing import StandardScaler
import scipy.stats
import warnings
warnings.filterwarnings("ignore")
sc=StandardScaler()
yy = y.reshape (-1,1)
sc.fit(yy)
y_std =sc.transform(yy)
y_std = y_std.flatten()
65
# Observed data will be approximately evenly distributed across all bins
percentile_bins = np.linspace(0,100,nbins)
percentile_cutoffs = np.percentile(y_std, percentile_bins)
observed_frequency, bins = (np.histogram(y_std, bins=percentile_cutoffs))
cum_observed_frequency = np.cumsum(observed_frequency)
# calculate chi-squared
expected_frequency = np.array(expected_frequency) * size
cum_expected_frequency = np.cumsum(expected_frequency)
ss = sum (((cum_expected_frequency - cum_observed_frequency) ** 2) /␣
↪cum_observed_frequency)
chi_square.append(ss)
results = pd.DataFrame()
results['Distribution'] = dist_names
results['chi_square'] = chi_square
results['p_value'] = p_values
results['alpha']=alpha
results['conclusion']=results['p_value'].apply(lambda x: "Does Not Follow"␣
↪if x<alpha else "Follows")
results.sort_values(['chi_square'], inplace=True)
results=results.reset_index(drop=True)
return results
66
[235]: fitDistributions(data,var="CreditScore",dist_names=dist_names,nbins=50,alpha=0.
↪05)
Parameters:
sample1 (array-like): First sample data.
sample2 (array-like): Second sample data.
alpha (float): Significance level for the test (default is 0.05).
Returns:
(bool, float): A tuple containing the result of the test (True if same␣
↪distribution, False otherwise)
# Example usage:
sample1 = np.random.normal(loc=0, scale=1, size=1000)
sample2 = np.random.normal(loc=0, scale=1, size=1000)
if result:
print("The two samples come from the same distribution.")
else:
print("The two samples do not come from the same distribution.")
67
print(f"P-value: {p_value}")
[287]: def␣
↪test_continuous_categorical_ANOVA(data,continuous_variable,categorical_variable,alpha=0.
↪05):
print("---------------------------------------------------------------")
print(f"One-Way ANOVA Test of between {continuous_variable} vs␣
↪{categorical_variable} : Results")
print("---------------------------------------------------------------")
# print(groups)
print("> F Statistics:", round(statistics,3))
print("> p-value:",round(p_value,3))
print()
if p_value<=alpha:
print(f"Conclusion: The Variables {continuous_variable} and␣
↪{categorical_variable} are Dependent: Reject H0")
else:
print(f"Conclusion: The Variables {continuous_variable} and␣
↪{categorical_variable} are Independent: Unable to reject H0")
[288]: test_continuous_categorical_ANOVA(data,continuous_variable='Balance',categorical_variable='Gen
↪05)
---------------------------------------------------------------
One-Way ANOVA Test of between Balance vs Gender : Results
---------------------------------------------------------------
> F Statistics: 1.461
> p-value: 0.227
Conclusion: The Variables Balance and Gender are Independent: Unable to reject
H0
if method=='pearson':
correlation_coef,p_value = stats.pearsonr(x,y)
elif method=='spearman':
correlation_coef,p_value = stats.spearmanr(x,y)
68
else:
raise ValueError("invalid correlation method specified. Choose␣
↪'pearson' or 'spearman'")
[297]: test_continuous_relationship(data,variable1="CreditScore",variable2="Point␣
↪Earned",method='pearson')
max_value = data[var].max()
min_value = data[var].min()
outlier_conclusion =␣
↪is_outlier_present(max_value,min_value,upper,lower)
outlier_df.
↪append([var,upper,lower,max_value,min_value,outlier_conclusion])
69
outlier_df = pd.DataFrame(outlier_df,columns=['Variable','Upper␣
↪ Limit','Lower Limit','Max Value', 'Min Value', 'Conclusion'])
return outlier_df
[338]: getOutlierInformation(data,numerical_variables,minSamples=30)
[338]: Variable Upper Limit Lower Limit Max Value Min Value \
0 RowNumber 1.499950e+04 -4.998500e+03 10000.00 1.00
1 CustomerId 1.594029e+07 1.544147e+07 15815690.00 15565701.00
2 CreditScore 9.190000e+02 3.830000e+02 850.00 350.00
3 Age 6.200000e+01 1.400000e+01 92.00 18.00
4 Balance 3.191106e+05 -1.914664e+05 250898.09 0.00
5 EstimatedSalary 2.969675e+05 -9.657710e+04 199992.48 11.58
6 Point Earned 1.387500e+03 -1.765000e+02 1000.00 119.00
Conclusion
0 No Outlier
1 No Outlier
2 Outlier Present
3 Outlier Present
4 No Outlier
5 No Outlier
6 No Outlier
'''
Logging configuration: prepares the log directory and builds a
timestamped log file name.
'''
root_dir = "."

# Log files go under <root_dir>/logs; create the folder if it is missing.
log_file_path = os.path.join(root_dir, 'logs')
os.makedirs(name=log_file_path, exist_ok=True)

# Timestamp (YYYYMMDD-HHMMSS) that is embedded in the log file name.
timeStamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# Date of this run formatted as DD/MM/YY.
date_of_script_run = datetime.datetime.today().strftime("%d/%m/%y")

# The path is joined with a literal "/" (not os.path.join), and the name
# carries no file extension.
log_file_name = f'{log_file_path}/log_customer_segmentation_{timeStamp}'
70
"""
This file creates logger
:param log_file_name:
:param gen_console_log:
:param gen_file_log:
:return:
"""
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Record layout: "<timestamp> [LEVEL] <message>". Each field needs its leading
# "%"; without it "(asctime)s" is written literally instead of the timestamp.
log_formatter = logging.Formatter("%(asctime)s [%(levelname)-5.5s] %(message)s")
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(log_formatter)
if not logger.hasHandlers():
if gen_console_log:
logger.addHandler(console_handler)
if gen_file_log:
logger.addHandler(file_handler)
return logger
def closeLogger(logger):
    """
    Close every handler attached to the given logger and detach it.

    :param logger: logger whose handlers are closed and removed
    :return: None
    """
    # Iterate over a snapshot because removeHandler() changes logger.handlers
    # while we loop.
    for handler in list(logger.handlers):
        try:
            handler.close()
        finally:
            # Detach even when close() raises, so a closed or broken handler
            # is never left attached to the logger.
            logger.removeHandler(handler)
71