# Author: Shovan Chowdhury
# Read the data set.
# NOTE(review): the reader's name and the file name inside the path were lost
# when this script was extracted (both showed up as "[Link]"). The call is
# reconstructed as read.csv() because it takes a header argument; the path
# string is kept exactly as found. Confirm both against the original script.
d <- read.csv(
  "D:/business analytics/eMDP_BA/EDA_session/RG case/[Link]",
  header = TRUE
)
# The rest of the script refers to the columns by bare name (pageviews,
# treatment, ...), so the data frame stays attached to the search path.
attach(d)
names(d)
## Check the type of all the columns of the data frame.
# The original inspected `appcab`, which is never defined in this script;
# the data frame that was just read is `d`.
str(d)
## --------------- Data Cleaning and Data Validation ------------------- ##
## Look for duplicate values in the business_id column, which this script
## treats as the primary key.
sum(duplicated(d)) ## Fully duplicated rows in the data frame
sum(duplicated(business_id)) ## No duplicate entries found in the primary key
## Look for NA values and missing/blank values in all the columns.
is.na(d) # cell-by-cell NA indicator
sum(is.na(d)) # total number of NA cells
# NOTE(review): the name of this third call was lost in extraction (it showed
# up as "[Link]"). complete.cases() is a best guess that fits the "look for NA"
# purpose; na.omit() is the other likely candidate. Confirm with the author.
complete.cases(d)
## Count blank ("") values in every column of the data frame; there are none.
# vapply() pins the result to one integer per column; NA comparisons are
# ignored, exactly as length(which(x == "")) did.
vapply(d, function(x) sum(x == "", na.rm = TRUE), integer(1))
#------------ Exporting results to a text file ----------
# capture.output() evaluates and prints each summary into the file and always
# restores normal console output afterwards. The earlier sink()/sink() pair
# relied on top-level auto-printing (so nothing was written when the script
# was run through source()) and left output diverted if a summary failed.
# NOTE(review): the file name at the end of the path was lost in extraction
# ("[Link]"); the string is kept exactly as found. Also note this path uses
# "EDA_session3" while the input path uses "EDA_session" -- confirm.
capture.output(
  summary(pageviews[treatment == 0]),
  summary(pageviews[restaurant_type == "chain"]),
  summary(pageviews[treatment == 2 & restaurant_type == "chain"]),
  file = "D:/business analytics/eMDP_BA/EDA_session3/RG case/[Link]"
)
# Save three pageview histograms (treatment 0, 1 and 2) side by side in a
# single JPEG file.
# NOTE(review): the file name at the end of the path was lost in extraction
# ("[Link]"); the string is kept exactly as found.
jpeg("D:/business analytics/eMDP_BA/EDA_session3/RG case/[Link]")
par(mfrow = c(1, 3)) # one row, three panels
hist(pageviews[treatment == 0])
hist(pageviews[treatment == 1])
hist(pageviews[treatment == 2])
# Close the JPEG device so the image is flushed to disk and later plots go
# back to the default device.
dev.off()
#---------------- Bar plots with respect to treatments -----------------
# Mean of each outcome within each treatment group.
tab_1 <- tapply(pageviews, treatment, mean)
tab_2 <- tapply(calls, treatment, mean)
tab_3 <- tapply(reservations, treatment, mean)
# barplot(tab_1, col = c("red", "blue", "green"), xlab = "Page Views")
# The bar labels are supplied positionally through names.arg. The labels and
# axis titles are written on one line each: the earlier hard-wrapped string
# literals put stray newlines into "Page Views" and "Treatment 1".
barplot(
  tab_1,
  col = c("red", "blue", "green"),
  xlab = "Page Views",
  names.arg = c("Control", "Treatment 1", "Treatment 2")
)
barplot(
  tab_2,
  col = c("red", "blue", "green"),
  xlab = "Calls",
  names.arg = c("Control", "Treatment 1", "Treatment 2")
)
barplot(
  tab_3,
  col = c("red", "blue", "green"),
  xlab = "Reservations",
  names.arg = c("Control", "Treatment 1", "Treatment 2")
)
# Bar plots with respect to treatments and restaurant type.
# Each table holds the mean outcome with treatments in rows and restaurant
# types in columns; beside = TRUE draws the treatments of each restaurant type
# as adjacent bars.
tab_4 <- tapply(pageviews, list(treatment, restaurant_type), mean)
tab_5 <- tapply(calls, list(treatment, restaurant_type), mean)
tab_6 <- tapply(reservations, list(treatment, restaurant_type), mean)
# Axis titles are written on one line: the earlier hard-wrapped string
# literals put stray newlines into "Page Views", "Calls" and "Reservations".
barplot(
  tab_4,
  beside = TRUE,
  col = c("red", "blue", "green", "red", "blue", "green"),
  xlab = "Page Views"
)
barplot(
  tab_5,
  beside = TRUE,
  col = c("red", "blue", "green", "red", "blue", "green"),
  xlab = "Calls"
)
barplot(
  tab_6,
  beside = TRUE,
  col = c("red", "blue", "green", "red", "blue", "green"),
  xlab = "Reservations"
)
#------- Confirmatory analysis ---------------------
# ANOVA
# aov() needs categorical predictors, so the treatment code and the restaurant
# type are converted to factors first.
TRT <- as.factor(treatment)
RT <- as.factor(restaurant_type)
# One-way ANOVA models: each outcome against treatment only.
mod_1 <- aov(pageviews ~ TRT)
summary(mod_1)
# TukeyHSD(mod_1)
mod_2 <- aov(calls ~ TRT)
summary(mod_2)
# TukeyHSD(mod_2)
mod_3 <- aov(reservations ~ TRT)
summary(mod_3)
# TukeyHSD(mod_3)
# Two-way ANOVA models: treatment, restaurant type and their interaction
# (TRT * RT expands to TRT + RT + TRT:RT).
mod_4 <- aov(pageviews ~ TRT * RT)
summary(mod_4)
# TukeyHSD(mod_4)
mod_5 <- aov(calls ~ TRT * RT)
summary(mod_5)
# TukeyHSD(mod_5)
mod_6 <- aov(reservations ~ TRT * RT)
summary(mod_6)
# TukeyHSD(mod_6)
#----- Binomial distribution ----------
dbinom(4, size = 4, prob = 0.2) # P(X = 4)
# P(X >= 2): add up the point probabilities for X = 2, 3 and 4. dbinom() is
# vectorised over its first argument, so no explicit loop is needed.
s <- sum(dbinom(2:4, size = 4, prob = 0.2))
# The same probability from the CDF: 1 - P(X <= 1).
1 - pbinom(1, size = 4, prob = 0.2)
#-------- Poisson distribution ---------
dpois(5, lambda = 3) # P(X = 5)
ppois(10, lambda = 3) # P(X <= 10)
#------- Normal/Gaussian distribution -----
pnorm(20, mean = 12, sd = 3.2, lower.tail = FALSE) # right tail area, P(X > 20)
pnorm(16, mean = 12, sd = 3.2, lower.tail = TRUE) # left tail area, P(X <= 16)
qnorm(0.9, mean = 12, sd = 3.2) # inverse CDF: the 90th percentile
#----- Normality check ------------------
# Q-Q plot of reservations against the normal distribution, with a thick red
# reference line.
qqnorm(reservations)
qqline(reservations, col = "red", lwd = 3)
# Formal tests
# Shapiro-Wilk normality test (maximum sample size should be 5000) best test
shapiro.test(reservations)
# Anderson-Darling test; ad.test() comes from the nortest package.
library(nortest)
ad.test(reservations)
# Same test restricted to calls of chain restaurants in treatment 0.
ad.test(calls[treatment == 0 & restaurant_type == "chain"])
#------------ Mean test, one sample ------------
# H0: mean(calls) = 35 against the one-sided alternative mean(calls) > 35.
t.test(calls, alternative = "greater", mu = 35)
# Same null hypothesis against the default two-sided alternative.
t.test(calls, mu = 35)
#--------------- Two-samples test ----------------
# H0: mean(calls) - mean(reservations) = 0, reported with a 95% confidence
# interval. With t.test() defaults this is the unpaired, unequal-variance
# (Welch) test.
# NOTE(review): both vectors appear to be columns of the same data frame; if
# they are measured on the same rows, a paired test may be intended -- confirm.
t.test(calls, reservations, mu = 0, conf.level = 0.95)