Python Project Submission by Ravikanth Govindu. Due Date: 27th Mar 2022
In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import ttest_1samp, ttest_rel, ttest_ind, chi2_contingency, shapiro
from warnings import filterwarnings

sns.set(color_codes=True)
%matplotlib inline
filterwarnings("ignore")
pd.set_option('display.float_format', lambda x: '%.2f' % x)
In [2]:
df1 = pd.read_csv("Wholesale+Customers+Data.csv")
In [7]:
df1.head()
Out[7]: Buyer/Spender Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicatessen
In [8]: df1.tail()
Out[8]: Buyer/Spender Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicatessen
435 436 Hotel Other 29703 12051 16027 13135 182 2204
436 437 Hotel Other 39228 1431 764 4510 93 2346
437 438 Retail Other 14531 15488 30243 437 14841 1867
438 439 Hotel Other 10290 1981 2232 1038 168 2125
439 440 Hotel Other 2787 1698 2510 65 477 52
In [10]:
df1.shape
Out[10]: (440, 9)
In [11]:
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 9 columns):
# Column Non-Null Count Dtype
In [13]:
df1.describe(include="all")
Out[13]: Buyer/Spender Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicatessen
count 440.00 440 440 440.00 440.00 440.00 440.00 440.00 440.00
top NaN Hotel Other NaN NaN NaN NaN NaN NaN
freq NaN 298 316 NaN NaN NaN NaN NaN NaN
mean 220.50 NaN NaN 12000.30 5796.27 7951.28 3071.93 2881.49 1524.87
std 127.16 NaN NaN 12647.33 7380.38 9503.16 4854.67 4767.85 2820.11
min 1.00 NaN NaN 3.00 55.00 3.00 25.00 3.00 3.00
25% 110.75 NaN NaN 3127.75 1533.00 2153.00 742.25 256.75 408.25
50% 220.50 NaN NaN 8504.00 3627.00 4755.50 1526.00 816.50 965.50
75% 330.25 NaN NaN 16933.75 7190.25 10655.75 3554.25 3922.00 1820.25
max 440.00 NaN NaN 112151.00 73498.00 92780.00 60869.00 40827.00 47943.00
In [16]:
df1.isnull().sum()
Out[16]:
Buyer/Spender 0
Channel 0
Region 0
Fresh 0
Milk 0
Grocery 0
Frozen 0
Detergents_Paper 0
Delicatessen 0
dtype: int64
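The cells below group and plot a Total column that is not present in the raw CSV, and the cell that created it did not survive the export. A minimal sketch of the presumed step (the item_cols name is illustrative), assuming Total is simply the per-buyer sum of the six spend columns, which matches the totals reported later (for example, the mean Total of 33226.14 equals the sum of the six item means):

In [ ]:
# Presumed derivation of the Total column used in the groupby/plot cells below.
item_cols = ["Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicatessen"]
df1["Total"] = df1[item_cols].sum(axis=1)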
In [18]: df1.columns

In [50]:
df1.groupby("Region")["Total"].sum().sort_values(ascending=False)
Out[50]:
Region
Other     10677599
Lisbon     2386813
Oporto     1555088
Name: Total, dtype: int64
In [51]:
plt.figure(figsize=(10,5));
x = df1.groupby('Region').agg({'Total': sum}).sort_values(by="Total", ascending=False)
sns.barplot(x='Region', y='Total', data=x.reset_index(), ci=False)
[Bar plot: Total spend by Region]
In [52]:
df1.groupby("Channel")["Total"].sum().sort_values(ascending=False)
Out[52]: Channel
Hotel     7999569
Retail    6619931
Name: Total, dtype: int64
In [53]:
plt.figure(figsize=(10,5));
x = df1.groupby('Channel').agg({'Total': sum}).sort_values(by="Total", ascending=False)
sns.barplot(x='Channel', y='Total', data=x.reset_index(), ci=False)

[Bar plot: Total spend by Channel]
In [12]:
df1.groupby(["Region","Channel"])["Total"].sum().sort_values(ascending=False)
Out[12]:
Region  Channel
Other   Hotel     5742077
        Retail    4935522
Lisbon  Hotel     1538342
        Retail     848471
Oporto  Retail     835938
        Hotel      719150
In [13]:
df1.groupby(["Region","Channel"]).sum()

                Buyer/Spender    Fresh    Milk  Grocery  Frozen  Detergents_Paper  Delicatessen    Total
Region Channel
Lisbon Hotel            14026   761233  228342   237542  184512             56081         70632  1538342
       Retail            4069    93600  194112   332495   46514            148055         33695   848471
Oporto Hotel             8988   326215   64519   123074  160861             13516         30965   719150
Other  Hotel            48020  2928269  735753   820101  771606            165990        320358  5742077
In [65]:
plt.figure(figsize=(12,7));
sns.catplot(x="Channel", y="Fresh", hue="Region", kind="bar", ci=None, data=df1)
plt.xlabel("Channel", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Fresh', size=18)

[Bar plot: Item - Fresh, spending by Channel and Region]
In [67]:
plt.figure(figsize=(12,7));
sns.catplot(x="Channel", y="Fresh", kind="bar", ci=None, data=df1)
plt.xlabel("Channel", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Fresh', size=18)

[Bar plot: Item - Fresh, spending by Channel]
In [68]:
plt.figure(figsize=(12,7));
sns.catplot(x="Region", y="Fresh", kind="bar", ci=None, data=df1)
plt.xlabel("Region", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Fresh', size=18)

[Bar plot: Item - Fresh, spending by Region]
In [69]:
plt.figure(figsize=(12,7));
sns.catplot(x="Channel", y="Milk", hue="Region", kind="bar", ci=None, data=df1)
plt.xlabel("Channel", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Milk', size=18)

[Bar plot: Item - Milk, spending by Channel and Region]
In [70]:
plt.figure(figsize=(12,7));
sns.catplot(x="Channel", y="Milk", kind="bar", ci=None, data=df1)
plt.xlabel("Channel", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Milk', size=18)

[Bar plot: Item - Milk, spending by Channel]
In [71]:
plt.figure(figsize=(12,7));
sns.catplot(x="Region", y="Milk", kind="bar", ci=None, data=df1)
plt.xlabel("Region", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Milk', size=18)

[Bar plot: Item - Milk, spending by Region]
In [72]:
plt.figure(figsize=(12,7));
sns.catplot(x="Channel", y="Grocery", hue="Region", kind="bar", ci=None, data=df1)
plt.xlabel("Channel", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Grocery', size=18)

[Bar plot: Item - Grocery, spending by Channel and Region]
In [73]:
plt.figure(figsize=(12,7));
sns.catplot(x="Channel", y="Grocery", kind="bar", ci=None, data=df1)
plt.xlabel("Channel", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Grocery', size=18)

[Bar plot: Item - Grocery, spending by Channel]
In [74]:
plt.figure(figsize=(12,7));
sns.catplot(x="Region", y="Grocery", kind="bar", ci=None, data=df1)
plt.xlabel("Region", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Grocery', size=18)

[Bar plot: Item - Grocery, spending by Region]
In [75]:
plt.figure(figsize=(12,7));
sns.catplot(x="Channel", y="Frozen", hue="Region", kind="bar", ci=None, data=df1)
plt.xlabel("Channel", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Frozen', size=18)

[Bar plot: Item - Frozen, spending by Channel and Region]
In [76]:
plt.figure(figsize=(12,7));
sns.catplot(x="Channel", y="Frozen", kind="bar", ci=None, data=df1)
plt.xlabel("Channel", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Frozen', size=18)

[Bar plot: Item - Frozen, spending by Channel]
In [77]:
plt.figure(figsize=(12,7));
sns.catplot(x="Region", y="Frozen", kind="bar", ci=None, data=df1)
plt.xlabel("Region", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Frozen', size=18)

[Bar plot: Item - Frozen, spending by Region]
In [78]:
plt.figure(figsize=(12,7));
sns.catplot(x="Channel", y="Detergents_Paper", hue="Region", kind="bar", ci=None, data=df1)
plt.xlabel("Channel", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Detergents_Paper', size=18)

[Bar plot: Item - Detergents_Paper, spending by Channel and Region]
In [79]:
plt.figure(figsize=(12,7));
sns.catplot(x="Channel", y="Detergents_Paper", kind="bar", ci=None, data=df1)
plt.xlabel("Channel", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Detergents_Paper', size=18)

[Bar plot: Item - Detergents_Paper, spending by Channel]
In [80]:
plt.figure(figsize=(12,7));
sns.catplot(x="Region", y="Detergents_Paper", kind="bar", ci=None, data=df1)
plt.xlabel("Region", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Detergents_Paper', size=18)

[Bar plot: Item - Detergents_Paper, spending by Region]
In [81]:
plt.figure(figsize=(12,7));
sns.catplot(x="Channel", y="Delicatessen", hue="Region", kind="bar", ci=None, data=df1)
plt.xlabel("Channel", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Delicatessen', size=18)

[Bar plot: Item - Delicatessen, spending by Channel and Region]
In [82]:
plt.figure(figsize=(12,7));
sns.catplot(x="Channel", y="Delicatessen", kind="bar", ci=None, data=df1)
plt.xlabel("Channel", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Delicatessen', size=18)

[Bar plot: Item - Delicatessen, spending by Channel]
In [83]:
plt.figure(figsize=(12,7));
sns.catplot(x="Region", y="Delicatessen", kind="bar", ci=None, data=df1)
plt.xlabel("Region", size=15)
plt.ylabel("Total Spending", size=15)
plt.title('Item - Delicatessen', size=18)
In [49]:
plt.figure(figsize=(20,20))
sns.boxplot(data=df1.drop(["Total","Buyer/Spender"], axis=1));
plt.title('Box plots for all the items', size=18)

[Box plots for all the items; each distribution shows many high outliers]
In [92]:
df1.mean().sort_values(ascending=False)
Out[92]:
Total 33226.14
Fresh 12000.30
Grocery 7951.28
Milk 5796.27
Frozen 3071.93
Detergents_Paper 2881.49
Delicatessen 1524.87
Buyer/Spender 220.50
dtype: float64
In [94]:
df1.describe().T
Out [94]: count mean std min 25% 50% 75% max
In [16]:
df2 = pd.read_csv("Survey-1.csv")
In [17]:
df2.head()
Out[17]: ID Gender Age Class Major Grad Intention GPA Employment Salary Social Networking Satisfaction Spending Computer Text Messages
0 Female 20 Junior Other Yes 2.90 Full-Time 50.00 3 350 Laptop 200
2 3 Male 21 Junior Other Yes 2.50 Part-Time 45.00 2 4 600 Laptop 200
3 4 Male 21 Junior CIS Yes 2.50 Full-Time 40.00 4 6 600 Laptop 250
4 5 Male 23 Senior Other Undecided 2.80 Unemployed 40.00 2 4 500 Laptop 100
In [102...
df2.tail()
In [103...
df2.shape
Out[103... (62, 14)
In [104...
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 14 columns):
# Column Non-Null Count Dtype
0 ID 62 non-null int64
1 Gender 62 non-null object
2 Age 62 non-null int64
3 Class 62 non-null object
4 Major 62 non-null object
5 Grad Intention 62 non-null object
6 GPA 62 non-null float64
7 Employment 62 non-null object
8 Salary 62 non-null float64
9 Social Networking 62 non-null int64
10 Satisfaction 62 non-null int64
11 Spending 62 non-null int64
12 Computer 62 non-null object
13 Text Messages 62 non-null int64
dtypes: float64(2), int64(6), object(6)
memory usage: 6.9+ KB
In [105...
df2.isnull().sum()
Out[105...
ID 0
Gender 0
Age 0
Class 0
Major 0
Grad Intention 0
GPA 0
Employment 0
Salary 0
Social Networking 0
Satisfaction 0
Spending 0
Computer 0
Text Messages 0
dtype: int64
In [111...
df2.describe(include="all").T
Out[111... count unique top freq mean std min 25% 50% 75% max
ID 62.00 NaN NaN NaN 31.50 18.04 1.00 16.25 31.50 46.75 62.00
Gender 62 2 Female 33 NaN NaN NaN NaN NaN NaN NaN
Age 62.00 NaN NaN NaN 21.13 1.43 18.00 20.00 21.00 22.00 26.00
Class 62 3 Senior 31 NaN NaN NaN NaN NaN NaN NaN
Grad Intention 62 3 Yes 28 NaN NaN NaN NaN NaN NaN NaN
GPA 62.00 NaN NaN NaN 3.13 0.38 2.30 2.90 3.15 3.40 3.90
Employment 62 3 Part-Time 43 NaN NaN NaN NaN NaN NaN NaN
Salary 62.00 NaN NaN NaN 48.55 12.08 25.00 40.00 50.00 55.00 80.00
Social Networking 62.00 NaN NaN NaN 1.52 0.84 0.00 1.00 1.00 2.00 4.00
Satisfaction 62.00 NaN NaN NaN 3.74 1.21 1.00 3.00 4.00 4.00 6.00
Spending 62.00 NaN NaN NaN 482.02 221.95 100.00 312.50 500.00 600.00 1400.00
Text Messages 62.00 NaN NaN NaN 246.21 214.47 0.00 100.00 200.00 300.00 900.00
In [114...
df2.columns
In [116...
# Contingency Table - Gender and Major
pd.crosstab(df2["Gender"], df2["Major"])
Out[116... Major Accounting CIS Economics/Finance International Business Management Other Retailing/Marketing Undecided
Gender
Female 3 3 7 4 4 3 9 0
Male 4 1 4 2 6 4 5 3
In [117...
# Contingency Table - Gender and Grad Intention
pd.crosstab(df2["Gender"], df2["Grad Intention"])
Out[117... Grad Intention No Undecided Yes
Gender
Female 9 13 11
Male 3 9 17
# Contingency Table - Gender and Employment
pd.crosstab(df2["Gender"], df2["Employment"])

print("The probability that a randomly selected CMSU Student is male is", round(29/len(df2["Gender"]), 4))
print("The probability that a randomly selected CMSU Student is female is", round(33/len(df2["Gender"]), 4))

pd.crosstab(df2["Gender"], df2["Major"])

print("The conditional probability of Accounting Major among male students is", round(4/29, 4))
print("The conditional probability of CIS Major among male students is", round(1/29, 4))
print("The conditional probability of Economics/Finance Major among male students is", round(4/29, 4))
print("The conditional probability of International Business Major among male students is", round(2/29, 4))
print("The conditional probability of Management Major among male students is", round(6/29, 4))
print("The conditional probability of Other Major among male students is", round(4/29, 4))
print("The conditional probability of Retailing/Marketing Major among male students is", round(5/29, 4))
print("The conditional probability among male students who have not decided their major is", round(3/29, 4))

print("The conditional probability of Accounting Major among female students is", round(3/33, 4))
print("The conditional probability of CIS Major among female students is", round(3/33, 4))
print("The conditional probability of Economics/Finance Major among female students is", round(7/33, 4))
print("The conditional probability of International Business Major among female students is", round(4/33, 4))
print("The conditional probability of Management Major among female students is", round(4/33, 4))
print("The conditional probability of Other Major among female students is", round(3/33, 4))
print("The conditional probability of Retailing/Marketing Major among female students is", round(9/33, 4))
print("The conditional probability among female students who have not decided their major is", round(0/33, 4))

The conditional probability of International Business Major among female students is 0.1212
The conditional probability of Management Major among female students is 0.1212
The conditional probability of Other Major among female students is 0.0909
The conditional probability of Retailing/Marketing Major among female students is 0.2727
The conditional probability among female students who have not decided their major is 0.0
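The same conditional probabilities can be read off a row-normalised contingency table instead of typing each ratio by hand. A short sketch (not part of the original submission), assuming df2 is loaded as above:

In [ ]:
# Row-normalised crosstab: each row sums to 1, giving P(Major | Gender).
pd.crosstab(df2["Gender"], df2["Major"], normalize="index").round(4)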
In [133...
pd.crosstab(df2["Gender"], df2["Grad Intention"])
Out[133... Grad Intention No Undecided Yes
Gender
Female 9 13 11
Male 3 9 17
In [134...
print("The probability that a randomly chosen student is a male and intends to graduate is", 17/29*29/62)

The probability that a randomly chosen student is a male and intends to graduate is 0.27419354838709675
Computer  Desktop  Laptop  Tablet
Gender
Female          2      29       2
Male            3      26       0

P(not having laptop ∩ Female) = P(Desktop | Female) x P(Female) + P(Tablet | Female) x P(Female)

In [136...
print("The probability that a randomly chosen student is a female and does not have a laptop is", 2/33*33/62 + 2/33*33/62)

The probability that a randomly chosen student is a female and does not have a laptop is 0.06451612903225806

In [139...
pd.crosstab(df2["Gender"], df2["Employment"])

Out[139... Employment Full-Time Part-Time Unemployed
Gender
Female 3 24 6
Male 7 19 3
In [140...
df2[ "Empl oyment"].value_counts()
Out[140.. Part-Time 43
.
Full-Time 10
Unemployed 9
Name: Employment, dtype: int64
P(Full-Time Employment ∪ Male) = P(Full-Time) + P(Male) - P(Full-Time | Male) x P(Male)
In [141...
print("The probability that a randomly chosen student is a male or has full-time employment is", 10/62 + 29/62 - 7/29*29/62)

The probability that a randomly chosen student is a male or has full-time employment is 0.5161290322580645
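The same union probability can also be computed from the data rather than from hard-coded counts; a minimal sketch (added here, not in the original notebook):

In [ ]:
# P(Male or Full-Time) = P(Male) + P(Full-Time) - P(Male and Full-Time), taken from the counts.
n = len(df2)
p_male = (df2["Gender"] == "Male").sum() / n
p_full = (df2["Employment"] == "Full-Time").sum() / n
p_both = ((df2["Gender"] == "Male") & (df2["Employment"] == "Full-Time")).sum() / n
print(p_male + p_full - p_both)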
In [142...
pd.crosstab(df2["Gender"], df2["Major"])
Out[142... Major Accounting CIS Economics/Finance International Business Management Other Retailing/Marketing Undecided
Gender
Female 3 3 7 4 4 3 9 0
Male 4 1 4 2 6 4 5 3
In [58]:
print("The conditional probability that given a female student is randomly chosen, she is majoring in international business or management is", (4+4)/33)

The conditional probability that given a female student is randomly chosen, she is majoring in international business or management is 0.2424242424242424

In [173...
pd.crosstab(df2["Gender"], df2["Grad Intention"], margins=True)

Out[173... Grad Intention No Undecided Yes All
Gender
Female 9 13 11 33
Male 3 9 17 29
All 12 22 28 62
In [62]:
pd.crosstab(df2["Gender"], df2["Grad Intention"]).drop("Undecided", axis=1)
Out[62]: Grad Intention No Yes
Gender
Female 9 11
Male 3 17
These are not independent events: the product of the individual probabilities does not equal the joint probability, so intending to graduate and being a female student are not independent events.
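A quick numerical check of this claim (an added sketch, not part of the submission), restricted to the decided students as in the table above:

In [ ]:
# Compare P(Female) * P(Yes) with P(Female and Yes) among students with a decided grad intention.
decided = df2[df2["Grad Intention"] != "Undecided"]
p_female = (decided["Gender"] == "Female").mean()
p_yes = (decided["Grad Intention"] == "Yes").mean()
p_joint = ((decided["Gender"] == "Female") & (decided["Grad Intention"] == "Yes")).mean()
print(p_female * p_yes, p_joint)  # unequal values indicate the events are not independent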
In [160...
df2["GPA"].mean()
Out[160... 3.129032258064516

In [161...
df2["GPA"].std()
Out[161... 0.3773883926969118
In [162...
stats.norm.cdf(3, loc=df2["GPA"].mean(), scale=df2["GPA"].std())
Out[162... 0.3662099174094998
In [163...
print("The probability that if a student is randomly chosen, their GPA is less than 3 is", stats.norm.cdf(3, loc=df2["GPA"].mean(), scale=df2["GPA"].std()))

The probability that if a student is randomly chosen, their GPA is less than 3 is 0.3662099174094998
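As a cross-check (not in the original notebook), the normal-model estimate can be compared with the empirical proportion of students whose GPA is below 3:

In [ ]:
# Empirical proportion of GPA < 3, for comparison with the normal-model probability above.
print((df2["GPA"] < 3).mean())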
   ID Gender Age  Class     Major Grad Intention  GPA Employment Salary Social Networking Satisfaction Spending Computer Text Messages
3   4   Male  21 Junior       CIS            Yes 2.50  Full-Time  40.00                 4            6      600   Laptop           250
4   5   Male  23 Senior     Other      Undecided 2.80 Unemployed  40.00                 2            4      500   Laptop           100
11 12   Male  21 Senior Undecided             No 3.50  Full-Time  37.00                 2            3      500   Laptop           100
In [168...
df2[df2["Gender"]=="Male"]["Salary"].std()
Out[168... 10.79317427068786

In [169...
df2[df2["Gender"]=="Male"]["Salary"].mean()
Out[169... 48.275862068965516
1 - stats.norm.cdf(50, loc=df2[df2["Gender"]=="Male"]["Salary"].mean(), scale=df2[df2["Gender"]=="Male"]["Salary"].std())

print("The probability that if a male student is randomly chosen, their salary is 50 or more is", 1 - stats.norm.cdf(50, loc=df2[df2["Gender"]=="Male"]["Salary"].mean(), scale=df2[df2["Gender"]=="Male"]["Salary"].std()))
print("The probability that if a female student is randomly chosen, their salary is 50 or more is", 1 - stats.norm.cdf(50, loc=df2[df2["Gender"]=="Female"]["Salary"].mean(), scale=df2[df2["Gender"]=="Female"]["Salary"].std()))
# Let's plot the histograms of GPA, Salary, Spending and Text Messages
plt.figure(figsize=(20,20))
plt.subplot(2,2,1)
sns.distplot(df2['GPA'], kde=True, color=None)
plt.subplot(2,2,2)
sns.distplot(df2['Salary'], kde=True)
plt.subplot(2,2,3)
sns.distplot(df2['Spending'], kde=True)
plt.subplot(2,2,4)
sns.distplot(df2['Text Messages'], kde=True)
[Histograms with KDE curves for GPA, Salary, Spending and Text Messages]
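sns.distplot is deprecated in seaborn 0.11 and later; the same four panels can be drawn with sns.histplot. A sketch assuming a recent seaborn version:

In [ ]:
# Equivalent plots using the non-deprecated histplot API (seaborn >= 0.11).
plt.figure(figsize=(20, 20))
for i, col in enumerate(['GPA', 'Salary', 'Spending', 'Text Messages'], start=1):
    plt.subplot(2, 2, i)
    sns.histplot(df2[col], kde=True, stat="density")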
In [197...
# Shapiro tests to check whether GPA, Salary, Spending and Text Messages follow normal Distribution
shapiro(df2["GPA"])
Out[197... ShapiroResult(statistic=0.9685361981391907, pvalue=0.11204058676958084)
In [199...
shapiro(df2["Salary"])
Out[199... ShapiroResult(statistic=0.9565856456756592, pvalue=0.028000956401228905)

In [201...
shapiro(df2["Spending"])
Out[201... ShapiroResult(statistic=0.8777452111244202, pvalue=1.6854661225806922e-05)
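The Shapiro-Wilk checks can also be run in a single loop over all four columns of interest (the Text Messages result does not appear in the export); a small sketch assuming df2 as loaded above:

In [ ]:
# Shapiro-Wilk normality test for each distribution of interest.
for col in ["GPA", "Salary", "Spending", "Text Messages"]:
    stat, p = shapiro(df2[col])
    print(col, "statistic =", round(stat, 4), "p-value =", round(p, 6))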
Problem 3 - ABC Asphalt Shingles
Load the Dataset
In [18]:
df3 = pd.read_csv("A+&+B+shingles.csv")
In [19]:
df3.head()
Out[19]: A B
0 0.44 0.14
1 0.61 0.15
2 0.47 0.31
3 0.30 0.16
df3.shape
df3.isnull().sum()
df3.describe()
In [212...
tstat, pvalue = stats.ttest_1samp(df3["A"], 0.35)

Out[214... 0.07477633144907513
The p-value is greater than 0.05, so we cannot reject the null hypothesis. We do not have enough evidence to support the claim that the mean moisture content of A shingles is not less than 0.35 pounds per 100 square feet at the 0.05 significance level.
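A note on the reported value (added sketch, not part of the submission): stats.ttest_1samp returns a two-sided p-value, while the hypotheses here are one-sided. The usual conversion halves the two-sided p-value, and only when the t statistic points in the direction of the alternative; on SciPy 1.6+ the one-sided test can be requested directly with the alternative argument.

In [ ]:
# Convert the two-sided p-value from ttest_1samp to a one-tailed one for Ha: mu > 0.35.
tstat, p_two_sided = stats.ttest_1samp(df3["A"], 0.35)
p_one_sided = p_two_sided / 2 if tstat > 0 else 1 - p_two_sided / 2
print(tstat, p_two_sided, p_one_sided)
# SciPy 1.6+ alternative: stats.ttest_1samp(df3["A"], 0.35, alternative="greater")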
For B Shingles:
H0 => mu <= 0.35
Ha => mu > 0.35
In [215...
tstat, pvalue = stats.ttest_1samp(df3["B"].dropna(), 0.35)

Out[216... 0.0020904774003191813

The p-value is less than 0.05, so we can reject the null hypothesis. We have enough evidence to support the claim that the mean moisture content of B shingles is not less than 0.35 pounds per 100 square feet at the 0.05 significance level.
Next Question
In [217...
H0 => muA is equal to muB
Ha => muA is not equal to muB
In [218...
tstat, pvalue = stats.ttest_ind(df3["A"], df3["B"].dropna())

Out[218... 0.2017496571835328

The p-value is greater than 0.05, so we cannot reject the null hypothesis. We do not have enough evidence to conclude that the mean moisture content of A shingles differs from that of B shingles.
Assumptions:
- Both populations (A shingles and B shingles) follow a normal distribution, and the variances of the two populations are equal (a quick check of the equal-variance assumption is sketched below).
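The equal-variance assumption can be checked with Levene's test; a short sketch (added here, not part of the original submission):

In [ ]:
# Levene's test for equality of variances between the A and B samples.
lstat, lp = stats.levene(df3["A"].dropna(), df3["B"].dropna())
print(lstat, lp)  # a p-value above 0.05 is consistent with the equal-variance assumption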
In [27]:
df3.plot(kind="box")
Out[27]: <AxesSubplot:>

[Box plots of moisture content for shingles A and B]

In [ ]: