# Bike Sharing Data Analysis
# Lakshmi Naryan
# 31 January 2018
library(stats)
library(dplyr)
##
## Attaching package: 'dplyr'
library(corrgram)
library(car)
##
## Attaching package: 'car'
# Inspect the bike-sharing data frame `bk_sh`.
# NOTE(review): `bk_sh` is not created anywhere in the visible part of this
# file; it must already exist in the session -- confirm where it is loaded.
# View() opens a spreadsheet-style viewer, so it is only called in an
# interactive session; the rest of the script can then run unattended.
if (interactive()) {
  View(bk_sh)
}
dim(bk_sh)
## [1] 731 18

# Descriptive statistics for every column of `bk_sh`.
summary(bk_sh)
## season
## 1 2 3 4
## 181 184 188 178
## mnth
## 1 2 3 4 5 6 7 8 9 10 11 12
## 62 57 62 60 62 60 62 62 60 62 60 62
## workingday
## 0 1
## 231 500
## weathersit
## 1 2 3
## 463 247 21
## season
## workingday 1 2 3 4
## 0 61 56 57 57
## 1 120 128 131 121
## mnth
## workingday 1 2 3 4 5 6 7 8 9 10 11 12
## 0 22 18 17 20 19 17 21 16 20 20 20 21
## 1 40 39 45 40 43 43 41 46 40 42 40 41
# Cross-tabulate the working-day flag against the weather situation; the
# outer parentheses make the assignment print its value.
(mytable <- xtabs(~ workingday + weathersit, data = bk_sh))
## weathersit
## workingday 1 2 3
## 0 156 70 5
## 1 307 177 16
# Box plots of total rentals (`cnt`) grouped by each categorical column.
# Columns are named directly in the formulas and looked up in `bk_sh`.

# Grouped by season.
boxplot(
  cnt ~ season,
  data = bk_sh,
  col  = c("red", "red1", "red2", "red3"),
  main = "Total Bike Rentals Vs Season",
  xlab = "Season",
  ylab = "Total Bike Rentals"
)

# Grouped by the `holiday` column.
boxplot(
  cnt ~ holiday,
  data = bk_sh,
  col  = c("blue", "blue1", "blue2", "blue3"),
  main = "Total Bike Rentals Vs Holiday/Working Day",
  xlab = "Holiday/Working Day",
  ylab = "Total Bike Rentals"
)

# Grouped by weather situation.
boxplot(
  cnt ~ weathersit,
  data = bk_sh,
  col  = c("green", "green1", "green2", "green3"),
  main = "Total Bike Rentals Vs Weather Situation",
  xlab = "Weather Situation",
  ylab = "Total Bike Rentals"
)

# Grouped by month.
boxplot(
  cnt ~ mnth,
  data = bk_sh,
  col  = "yellow",
  main = "Total Bike Rentals Vs Month",
  xlab = "Month",
  ylab = "Total Bike Rentals"
)

# Grouped by day of the week.
boxplot(
  cnt ~ weekday,
  data = bk_sh,
  col  = "black",
  main = "Total Bike Rentals Vs Day of Week",
  xlab = "Day of Week",
  ylab = "Total Bike Rentals"
)
# Histograms of the total rental count and of wind speed.
hist(
  bk_sh[["cnt"]],
  breaks = 25,
  col    = "blue",
  main   = "Distribution of Total Bike Rental Count",
  xlab   = "Total Bike Rental Count",
  ylab   = "Frequency of Rental"
)
hist(
  bk_sh[["windspeed"]],
  col  = "red",
  main = "Histogram for Wind Speed",
  xlab = "wind speed"
)
# Vertical-line (type "h") plots of total rentals against each continuous
# weather measure.
plot(
  x = bk_sh[["temp"]], y = bk_sh[["cnt"]],
  type = "h", col = "red",
  xlab = "Temperature", ylab = "Total Bike Rentals"
)
plot(
  x = bk_sh[["atemp"]], y = bk_sh[["cnt"]],
  type = "h", col = "blue",
  xlab = "Feel Temperature", ylab = "Total Bike Rentals"
)
plot(
  x = bk_sh[["windspeed"]], y = bk_sh[["cnt"]],
  type = "h", col = "green",
  xlab = "Windspeed", ylab = "Total Bike Rentals"
)
plot(
  x = bk_sh[["hum"]], y = bk_sh[["cnt"]],
  type = "h", col = "black",
  xlab = "Humidity", ylab = "Total Bike Rentals"
)
# Scatter plot of total rentals against temperature, with points coloured by
# the rental count and a smoothed trend line drawn on top.
# NOTE(review): ggplot(), aes() and the geoms come from ggplot2, but no
# library(ggplot2) call is visible in this part of the file -- confirm it is
# attached before this point.
# Every line except the last ends in `+` so the chain parses as a single
# expression, and the title is one unbroken string.
ggplot(bk_sh, aes(x = temp, y = cnt, colour = cnt)) +
  geom_point() +
  geom_smooth() +
  xlab("Temperature") +
  ylab("Total Count") +
  ggtitle("Total Count of Bikes used depending on Temperature")
# Correlation of total rentals with temperature and with "feel" temperature
# (cor() defaults to the Pearson coefficient).
Cor_temp <- cor(bk_sh$temp, bk_sh$cnt)
Cor_feel_temp <- cor(bk_sh$atemp, bk_sh$cnt)

# Correlation matrix of `bk_sh_dy_cor`.
# NOTE(review): `bk_sh_dy_cor` is not defined in the visible part of this
# file -- confirm where it is built.
cor(bk_sh_dy_cor)
## Total Number of Bike Rentals Temperature
## Total Number of Bike Rentals 1.0000000 0.6274940
## Temperature 0.6274940 1.0000000
## Feel Temperature 0.6310657 0.9917016
## Humidity -0.1006586 0.1269629
## Windspeed -0.2345450 -0.1579441
## Feel Temperature Humidity Windspeed
## Total Number of Bike Rentals 0.6310657 -0.1006586 -0.2345450
## Temperature 0.9917016 0.1269629 -0.1579441
## Feel Temperature 1.0000000 0.1399881 -0.1836430
## Humidity 0.1399881 1.0000000 -0.2484891
## Windspeed -0.1836430 -0.2484891 1.0000000
# Correlogram input: keep the rental count plus the weather and calendar
# columns, then print their pairwise correlation matrix.
# The select list sits on a single line so that no column name is split
# across a line break.
bk <- subset(
  bk_sh,
  select = c(cnt, temp, atemp, hum, windspeed, weathersit, workingday, season)
)
cor(bk)
## cnt temp atemp hum windspeed
## cnt 1.00000000 0.62749401 0.63106570 -0.10065856 -0.23454500
## temp 0.62749401 1.00000000 0.99170155 0.12696294 -0.15794412
## atemp 0.63106570 0.99170155 1.00000000 0.13998806 -0.18364297
## hum -0.10065856 0.12696294 0.13998806 1.00000000 -0.24848910
## windspeed -0.23454500 -0.15794412 -0.18364297 -0.24848910 1.00000000
## weathersit -0.29739124 -0.12060224 -0.12158335 0.59104460 0.03951106
## workingday 0.06115606 0.05265981 0.05218228 0.02432705 -0.01879649
## season 0.40610037 0.33431486 0.34287561 0.20544476 -0.22904634
## weathersit workingday season
## cnt -0.29739124 0.06115606 0.40610037
## temp -0.12060224 0.05265981 0.33431486
## atemp -0.12158335 0.05218228 0.34287561
## hum 0.59104460 0.02432705 0.20544476
## windspeed 0.03951106 -0.01879649 -0.22904634
## weathersit 1.00000000 0.06120043 0.01921103
## workingday 0.06120043 1.00000000 0.01248496
## season 0.01921103 0.01248496 1.00000000
# Scatterplot matrices pairing total rentals (`cnt`) with weather situation,
# the working-day flag and season.
# Columns are given by bare name so that they are resolved through `data`
# rather than by reaching into `bk_sh` again inside the formula.
scatterplotMatrix(
  formula = ~ weathersit + cnt,
  data = bk_sh,
  cex = 0.6,
  main = "Effect of Weather Situation on Bike Rentals"
)
scatterplotMatrix(
  formula = ~ workingday + cnt,
  data = bk_sh,
  cex = 0.6,
  main = "Effect of Working Day/Holiday on Bike Rentals"
)
scatterplotMatrix(
  formula = ~ season + cnt,
  data = bk_sh,
  cex = 0.6,
  main = "Effect of Season on Bike Rentals"
)
##
## Pearson's product-moment correlation
##
## data: bk_sh$cnt and bk_sh$workingday
## t = 1.6543, df = 729, p-value = 0.09849
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01140813 0.13307950
## sample estimates:
## cor
## 0.06115606
# Test whether the correlation between rentals and temperature is non-zero.
cor.test(bk_sh$cnt, bk_sh$temp)
##
## Pearson's product-moment correlation
##
## data: bk_sh$cnt and bk_sh$temp
## t = 21.759, df = 729, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5814369 0.6695422
## sample estimates:
## cor
## 0.627494
# Test whether the correlation between rentals and weather situation is
# non-zero.
cor.test(bk_sh$cnt, bk_sh$weathersit)
##
## Pearson's product-moment correlation
##
## data: bk_sh$cnt and bk_sh$weathersit
## t = -8.4101, df = 729, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3620963 -0.2298340
## sample estimates:
## cor
## -0.2973912
# Test whether the correlation between rentals and season is non-zero.
cor.test(bk_sh$cnt, bk_sh$season)
##
## Pearson's product-moment correlation
##
## data: bk_sh$cnt and bk_sh$season
## t = 11.999, df = 729, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3437082 0.4649230
## sample estimates:
## cor
## 0.4061004
# t-test comparing mean rentals between the two groups of the `workingday`
# flag (t.test() defaults to the Welch unequal-variance form).
t.test(bk_sh$cnt ~ bk_sh$workingday)
##
## Welch Two Sample t-test
##
## data: bk_sh$cnt by bk_sh$workingday
## t = -1.6014, df = 413.94, p-value = 0.1101
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -567.23982 57.93748
## sample estimates:
## mean in group 0 mean in group 1
## 4330.169 4584.820
##
## Welch Two Sample t-test
##
## data: bk_sh$cnt and bk_sh$temp
## t = 62.172, df = 730.13, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 4314.139 4595.482
## sample estimates:
## mean of x mean of y
## 4504.34884 49.53848
##
## Welch Two Sample t-test
##
## data: bk_sh$cnt and bk_sh$season
## t = 62.831, df = 730, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 4361.187 4642.518
## sample estimates:
## mean of x mean of y
## 4504.34884 2.49658
##
## Call:
## lm(formula = cnt ~ workingday + temp + weathersit + season, data = bk_sh)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3955.1 -1044.1 -182.1 1067.6 4500.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1835.034 226.318 8.108 2.18e-15 ***
## workingday 190.856 110.188 1.732 0.0837 .
## temp 54.653 2.993 18.260 < 2e-16 ***
## weathersit -862.058 94.830 -9.091 < 2e-16 ***
## season 414.259 48.884 8.474 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1380 on 726 degrees of freedom
## Multiple R-squared: 0.4954, Adjusted R-squared: 0.4927
## F-statistic: 178.2 on 4 and 726 DF, p-value: < 2.2e-16
# ANOVA table for the linear model of total rentals.
# `bk_lm` is not assigned anywhere in the visible part of this file, so it is
# fitted here with the formula and data reported in the "Call:" line of the
# printed model summary.
bk_lm <- lm(cnt ~ workingday + temp + weathersit + season, data = bk_sh)
bk_an <- anova(bk_lm)
bk_an