0% found this document useful (0 votes)
82 views24 pages

Bike Sharing Data Analysis

Lakshmi Naryan analyzed daily bike sharing data from January 2011 to December 2012 (731 days) to understand the factors that influence bike rentals. Descriptive statistics and contingency tables were used to examine the distribution of days by season, month, weather, and working day. Correlation analysis found moderate positive correlations between bike rentals and temperature/feel temperature. Scatterplots and boxplots visualized these relationships, showing higher rentals in summer months and on warmer, non-holiday weekdays.

Uploaded by

Archit Pateria
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
82 views24 pages

Bike Sharing Data Analysis

Lakshmi Naryan analyzed daily bike sharing data from January 2011 to December 2012 (731 days) to understand the factors that influence bike rentals. Descriptive statistics and contingency tables were used to examine the distribution of days by season, month, weather, and working day. Correlation analysis found moderate positive correlations between bike rentals and temperature/feel temperature. Scatterplots and boxplots visualized these relationships, showing higher rentals in summer months and on warmer, non-holiday weekdays.

Uploaded by

Archit Pateria
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 24

Bike Sharing Data Analysis

Lakshmi Naryan
31 January 2018

# Read the data set.
# NOTE(review): the working directory is hard-coded to a path on one machine;
# the script fails anywhere that path does not exist.
setwd("F:/Naren/Educational/DataScience/Intern/FinalP")
# The file name is a plain literal, so it is passed directly instead of being
# wrapped in a paste() call that concatenates nothing.
bk_sh <- read.csv("day.csv")
library(corrplot)

## Warning: package 'corrplot' was built under R version 3.4.3

## corrplot 0.84 loaded

library(stats)
library(dplyr)

## Warning: package 'dplyr' was built under R version 3.4.2

##
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':


##
## filter, lag

## The following objects are masked from 'package:base':


##
## intersect, setdiff, setequal, union

library(corrgram)

## Warning: package 'corrgram' was built under R version 3.4.3

library(car)

## Warning: package 'car' was built under R version 3.4.3

##
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':


##
## recode
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.4.2

# Open the spreadsheet-style viewer only in an interactive session: View()
# needs a GUI and serves no purpose when the script is run in batch or knitted.
if (interactive()) {
  View(bk_sh)
}
# Number of rows and columns in the data set.
dim(bk_sh)

## [1] 731 18

# Descriptive statistics: print a per-column summary of the data frame.
# NOTE(review): the recorded output lists two trailing columns, X and X.1,
# that are almost entirely NA -- presumably stray cells in the CSV; confirm
# and drop them if so.
summary(bk_sh)

## instant dteday season yr


## Min. : 1.0 01-01-2011: 1 Min. :1.000 Min. :0.0000
## 1st Qu.:183.5 01-01-2012: 1 1st Qu.:2.000 1st Qu.:0.0000
## Median :366.0 01-02-2011: 1 Median :3.000 Median :1.0000
## Mean :366.0 01-02-2012: 1 Mean :2.497 Mean :0.5007
## 3rd Qu.:548.5 01-03-2011: 1 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :731.0 01-03-2012: 1 Max. :4.000 Max. :1.0000
## (Other) :725
## mnth holiday weekday workingday
## Min. : 1.00 Min. :0.00000 Min. :0.000 Min. :0.000
## 1st Qu.: 4.00 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.000
## Median : 7.00 Median :0.00000 Median :3.000 Median :1.000
## Mean : 6.52 Mean :0.02873 Mean :2.997 Mean :0.684
## 3rd Qu.:10.00 3rd Qu.:0.00000 3rd Qu.:5.000 3rd Qu.:1.000
## Max. :12.00 Max. :1.00000 Max. :6.000 Max. :1.000
##
## weathersit temp atemp hum
## Min. :1.000 Min. : 5.913 Min. : 7.907 Min. : 0.00
## 1st Qu.:1.000 1st Qu.:33.708 1st Qu.:33.784 1st Qu.:52.00
## Median :1.000 Median :49.833 Median :48.673 Median :62.67
## Mean :1.395 Mean :49.538 Mean :47.435 Mean :62.79
## 3rd Qu.:2.000 3rd Qu.:65.542 3rd Qu.:60.860 3rd Qu.:73.02
## Max. :3.000 Max. :86.167 Max. :84.090 Max. :97.25
##
## windspeed casual registered cnt
## Min. : 2.239 Min. : 2.0 Min. : 20 Min. : 22
## 1st Qu.:13.495 1st Qu.: 315.5 1st Qu.:2497 1st Qu.:3152
## Median :18.098 Median : 713.0 Median :3662 Median :4548
## Mean :19.049 Mean : 848.2 Mean :3656 Mean :4504
## 3rd Qu.:23.321 3rd Qu.:1096.0 3rd Qu.:4776 3rd Qu.:5956
## Max. :50.746 Max. :3410.0 Max. :6946 Max. :8714
##
## X X.1
## Mode:logical Min. :100
## NA's:731 1st Qu.:100
## Median :100
## Mean :100
## 3rd Qu.:100
## Max. :100
## NA's :729
# One-way frequency tables: number of days observed at each level of a
# single variable. The table dimension is named explicitly so the printed
# header matches the column name.
mytable <- table(season = bk_sh$season)
print(mytable)

## season
## 1 2 3 4
## 181 184 188 178

mytable <- table(mnth = bk_sh$mnth)
print(mytable)

## mnth
## 1 2 3 4 5 6 7 8 9 10 11 12
## 62 57 62 60 62 60 62 62 60 62 60 62

mytable <- table(workingday = bk_sh$workingday)
print(mytable)

## workingday
## 0 1
## 231 500

mytable <- table(weathersit = bk_sh$weathersit)
print(mytable)

## weathersit
## 1 2 3
## 463 247 21

# Two-way contingency tables: the working-day flag (rows) cross-tabulated
# against a second variable (columns).
mytable <- xtabs(formula = ~ workingday + season, data = bk_sh)
print(mytable)

## season
## workingday 1 2 3 4
## 0 61 56 57 57
## 1 120 128 131 121

mytable <- xtabs(formula = ~ workingday + mnth, data = bk_sh)
print(mytable)

## mnth
## workingday 1 2 3 4 5 6 7 8 9 10 11 12
## 0 22 18 17 20 19 17 21 16 20 20 20 21
## 1 40 39 45 40 43 43 41 46 40 42 40 41
mytable <- xtabs(formula = ~ workingday + weathersit, data = bk_sh)
print(mytable)

## weathersit
## workingday 1 2 3
## 0 156 70 5
## 1 307 177 16

# Boxplots of the daily rental count (cnt) grouped by each categorical
# variable. Columns are named directly in the formula because `data = bk_sh`
# already tells boxplot() where to look them up.
boxplot(cnt ~ season,
  data = bk_sh,
  main = "Total Bike Rentals Vs Season",
  xlab = "Season",
  ylab = "Total Bike Rentals",
  col = c("red", "red1", "red2", "red3")
)

# NOTE(review): this groups by `holiday`, although the labels read
# "Holiday/Working Day" -- confirm which variable was intended.
boxplot(cnt ~ holiday,
  data = bk_sh,
  main = "Total Bike Rentals Vs Holiday/Working Day",
  xlab = "Holiday/Working Day",
  ylab = "Total Bike Rentals",
  col = c("blue", "blue1", "blue2", "blue3")
)

boxplot(cnt ~ weathersit,
  data = bk_sh,
  main = "Total Bike Rentals Vs Weather Situation",
  xlab = "Weather Situation",
  ylab = "Total Bike Rentals",
  col = c("green", "green1", "green2", "green3")
)

boxplot(cnt ~ mnth,
  data = bk_sh,
  main = "Total Bike Rentals Vs Month",
  xlab = "Month",
  ylab = "Total Bike Rentals",
  col = c("yellow")
)

# The boxes are filled black, which would hide the median line (also black
# by default), so the median is drawn in white to keep it visible.
boxplot(cnt ~ weekday,
  data = bk_sh,
  main = "Total Bike Rentals Vs Day of Week",
  xlab = "Day of Week",
  ylab = "Total Bike Rentals",
  col = c("black"),
  medcol = "white"
)
# Histograms of the rental count and of the continuous weather variables.
hist(bk_sh$cnt,
  breaks = 25,
  ylab = "Frequency of Rental",
  xlab = "Total Bike Rental Count",
  main = "Distribution of Total Bike Rental Count",
  col = "blue"
)

hist(bk_sh$windspeed,
  main = "Histogram for Wind Speed",
  xlab = "wind speed",
  col = "red"
)

hist(bk_sh$temp,
  main = "Histogram for Temperature",
  xlab = "temperature",
  col = "green"
)

# The x axis shows humidity; it was previously mislabelled "temperature".
hist(bk_sh$hum,
  main = "Histogram for Humidity",
  xlab = "humidity",
  col = "grey"
)

# Rental count against each continuous weather variable. type = "h" draws one
# vertical line per observation rather than a point.
plot(
  x = bk_sh$temp, y = bk_sh$cnt,
  type = "h", col = "red",
  xlab = "Temperature", ylab = "Total Bike Rentals"
)
plot(
  x = bk_sh$atemp, y = bk_sh$cnt,
  type = "h", col = "blue",
  xlab = "Feel Temperature", ylab = "Total Bike Rentals"
)
plot(
  x = bk_sh$windspeed, y = bk_sh$cnt,
  type = "h", col = "green",
  xlab = "Windspeed", ylab = "Total Bike Rentals"
)
plot(
  x = bk_sh$hum, y = bk_sh$cnt,
  type = "h", col = "black",
  xlab = "Humidity", ylab = "Total Bike Rentals"
)
# Scatterplot of rental count against temperature with a smoothed trend line;
# points are coloured by the count itself.
# Every line ends with `+` so the parser keeps reading the same expression;
# previously a line break between `xlab` and its argument list (and another
# inside the title string) split the statement and it could not run.
ggplot(bk_sh, aes(x = temp, y = cnt, colour = cnt)) +
  geom_point() +
  geom_smooth() +
  xlab("Temperature") +
  ylab("Total Count") +
  ggtitle("Total Count of Bikes used depending on Temperature")

## `geom_smooth()` using method = 'loess'

# Correlation of rental count with temperature and with feel temperature,
# kept as standalone values.
Cor_temp <- cor(bk_sh$temp, bk_sh$cnt)
Cor_feel_temp <- cor(bk_sh$atemp, bk_sh$cnt)

# Numeric columns used for the correlation matrix, relabelled with
# human-readable names so the printed matrix and later plot are self-describing.
bk_sh_dy_cor <- data.frame(
  bk_sh %>% select(cnt, temp, atemp, hum, windspeed)
)
colnames(bk_sh_dy_cor) <- c(
  "Total Number of Bike Rentals",
  "Temperature",
  "Feel Temperature",
  "Humidity",
  "Windspeed"
)

print(cor(bk_sh_dy_cor))
## Total Number of Bike Rentals Temperature
## Total Number of Bike Rentals 1.0000000 0.6274940
## Temperature 0.6274940 1.0000000
## Feel Temperature 0.6310657 0.9917016
## Humidity -0.1006586 0.1269629
## Windspeed -0.2345450 -0.1579441
## Feel Temperature Humidity Windspeed
## Total Number of Bike Rentals 0.6310657 -0.1006586 -0.2345450
## Temperature 0.9917016 0.1269629 -0.1579441
## Feel Temperature 1.0000000 0.1399881 -0.1836430
## Humidity 0.1399881 1.0000000 -0.2484891
## Windspeed -0.1836430 -0.2484891 1.0000000

# Plot the correlation matrix with each coefficient shown as a number.
corplot_bk_sh <- cor(bk_sh_dy_cor)
corrplot(corr = corplot_bk_sh, method = "number")

# Correlogram input: rental count plus the weather and calendar variables.
# The column list is broken only after commas; a line break used to fall in
# the middle of the name `season`, which is a syntax error.
bk <- subset(
  bk_sh,
  select = c(
    cnt, temp, atemp, hum, windspeed,
    weathersit, workingday, season
  )
)
cor(bk)
## cnt temp atemp hum windspeed
## cnt 1.00000000 0.62749401 0.63106570 -0.10065856 -0.23454500
## temp 0.62749401 1.00000000 0.99170155 0.12696294 -0.15794412
## atemp 0.63106570 0.99170155 1.00000000 0.13998806 -0.18364297
## hum -0.10065856 0.12696294 0.13998806 1.00000000 -0.24848910
## windspeed -0.23454500 -0.15794412 -0.18364297 -0.24848910 1.00000000
## weathersit -0.29739124 -0.12060224 -0.12158335 0.59104460 0.03951106
## workingday 0.06115606 0.05265981 0.05218228 0.02432705 -0.01879649
## season 0.40610037 0.33431486 0.34287561 0.20544476 -0.22904634
## weathersit workingday season
## cnt -0.29739124 0.06115606 0.40610037
## temp -0.12060224 0.05265981 0.33431486
## atemp -0.12158335 0.05218228 0.34287561
## hum 0.59104460 0.02432705 0.20544476
## windspeed 0.03951106 -0.01879649 -0.22904634
## weathersit 1.00000000 0.06120043 0.01921103
## workingday 0.06120043 1.00000000 0.01248496
## season 0.01921103 0.01248496 1.00000000

# Correlogram of the selected variables: shaded cells below the diagonal,
# pie glyphs above it, variable names on the diagonal.
corrgram(
  bk,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of Data"
)

# Pairwise scatterplot matrices of the rental count against one predictor at
# a time. Variables are named bare in the formula so they are looked up in
# `data = bk_sh`; with a `bk_sh$` prefix the `data` argument is bypassed and
# the prefix ends up in the panel labels.
scatterplotMatrix(
  formula = ~ weathersit + cnt, cex = 0.6,
  data = bk_sh, main = "Effect of Weather Situation on Bike Rentals"
)
scatterplotMatrix(
  formula = ~ workingday + cnt, cex = 0.6,
  data = bk_sh, main = "Effect of Working Day/Holiday on Bike Rentals"
)
scatterplotMatrix(
  formula = ~ season + cnt, cex = 0.6,
  data = bk_sh, main = "Effect of Season on Bike Rentals"
)
scatterplotMatrix(
  formula = ~ windspeed + cnt, cex = 0.6,
  data = bk_sh, main = "Effect of Windspeed on Bike Rentals"
)
scatterplotMatrix(
  formula = ~ temp + cnt, cex = 0.6,
  data = bk_sh, main = "Effect of Temperature on Bike Rentals"
)
scatterplotMatrix(
  formula = ~ hum + cnt, cex = 0.6,
  data = bk_sh, main = "Effect of Humidity on Bike Rentals"
)
# Correlation tests: Pearson correlation between the rental count and one
# predictor at a time; each result reports the estimate, its 95% confidence
# interval and a p-value. This one uses the working-day flag.
cor.test(x=bk_sh$cnt,y=bk_sh$workingday)

##
## Pearson's product-moment correlation
##
## data: bk_sh$cnt and bk_sh$workingday
## t = 1.6543, df = 729, p-value = 0.09849
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01140813 0.13307950
## sample estimates:
## cor
## 0.06115606

# Pearson correlation test between rental count and temperature.
cor.test(x=bk_sh$cnt,y=bk_sh$temp)

##
## Pearson's product-moment correlation
##
## data: bk_sh$cnt and bk_sh$temp
## t = 21.759, df = 729, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5814369 0.6695422
## sample estimates:
## cor
## 0.627494

# Pearson correlation test between rental count and weather situation.
# NOTE(review): weathersit takes only the codes 1-3; presumably these are
# category labels, in which case a Pearson coefficient treats them as evenly
# spaced numbers -- confirm this is intended.
cor.test(x=bk_sh$cnt,y=bk_sh$weathersit)

##
## Pearson's product-moment correlation
##
## data: bk_sh$cnt and bk_sh$weathersit
## t = -8.4101, df = 729, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3620963 -0.2298340
## sample estimates:
## cor
## -0.2973912

# Pearson correlation test between rental count and season.
# NOTE(review): season takes only the codes 1-4; presumably these are
# category labels, in which case a Pearson coefficient treats them as evenly
# spaced numbers -- confirm this is intended.
cor.test(x=bk_sh$cnt,y=bk_sh$season)
##
## Pearson's product-moment correlation
##
## data: bk_sh$cnt and bk_sh$season
## t = 11.999, df = 729, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3437082 0.4649230
## sample estimates:
## cor
## 0.4061004

# t tests
# Welch two-sample t-test comparing the mean rental count between the two
# workingday groups (0 and 1).
t.test(bk_sh$cnt~bk_sh$workingday)

##
## Welch Two Sample t-test
##
## data: bk_sh$cnt by bk_sh$workingday
## t = -1.6014, df = 413.94, p-value = 0.1101
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -567.23982 57.93748
## sample estimates:
## mean in group 0 mean in group 1
## 4330.169 4584.820

# NOTE(review): with two columns passed as x and y this is a two-sample test
# of whether mean(cnt) equals mean(temp) (see "mean of x" / "mean of y" in the
# output). The columns measure different quantities, so the result does not
# show whether temperature is related to rentals; consider dropping this call.
t.test(x = bk_sh$cnt, y = bk_sh$temp, alternative = "two.sided")

##
## Welch Two Sample t-test
##
## data: bk_sh$cnt and bk_sh$temp
## t = 62.172, df = 730.13, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 4314.139 4595.482
## sample estimates:
## mean of x mean of y
## 4504.34884 49.53848

# NOTE(review): with two columns passed as x and y this is a two-sample test
# of whether mean(cnt) equals mean(weathersit) (see "mean of x" / "mean of y"
# in the output). The columns measure different quantities, so the result does
# not show whether weather is related to rentals; consider dropping this call.
t.test(x = bk_sh$cnt, y = bk_sh$weathersit, alternative = "two.sided")


##
## Welch Two Sample t-test
##
## data: bk_sh$cnt and bk_sh$weathersit
## t = 62.846, df = 730, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 4362.288 4643.619
## sample estimates:
## mean of x mean of y
## 4504.348837 1.395349

# NOTE(review): with two columns passed as x and y this is a two-sample test
# of whether mean(cnt) equals mean(season) (see "mean of x" / "mean of y" in
# the output). The columns measure different quantities, so the result does
# not show whether season is related to rentals; consider dropping this call.
t.test(x = bk_sh$cnt, y = bk_sh$season, alternative = "two.sided")

##
## Welch Two Sample t-test
##
## data: bk_sh$cnt and bk_sh$season
## t = 62.831, df = 730, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 4361.187 4642.518
## sample estimates:
## mean of x mean of y
## 4504.34884 2.49658

# Linear regression: daily rental count modelled as an additive function of
# the working-day flag, temperature, weather situation and season.
# NOTE(review): weathersit and season enter as single numeric terms (one
# coefficient each in the output); confirm they should not be factors.
bk_lm <- lm(
  formula = cnt ~ workingday + temp + weathersit + season,
  data = bk_sh
)
print(summary(bk_lm))

##
## Call:
## lm(formula = cnt ~ workingday + temp + weathersit + season, data = bk_sh)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3955.1 -1044.1 -182.1 1067.6 4500.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1835.034 226.318 8.108 2.18e-15 ***
## workingday 190.856 110.188 1.732 0.0837 .
## temp 54.653 2.993 18.260 < 2e-16 ***
## weathersit -862.058 94.830 -9.091 < 2e-16 ***
## season 414.259 48.884 8.474 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1380 on 726 degrees of freedom
## Multiple R-squared: 0.4954, Adjusted R-squared: 0.4927
## F-statistic: 178.2 on 4 and 726 DF, p-value: < 2.2e-16
# Analysis-of-variance table for the fitted regression model.
bk_an <- anova(bk_lm)
print(bk_an)

## Analysis of Variance Table


##
## Response: cnt
## Df Sum Sq Mean Sq F value Pr(>F)
## workingday 1 10246038 10246038 5.3814 0.02063 *
## temp 1 1070613634 1070613634 562.3054 < 2e-16 ***
## weathersit 1 139661375 139661375 73.3527 < 2e-16 ***
## season 1 136730846 136730846 71.8135 < 2e-16 ***
## Residuals 726 1382283500 1903972
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Diagnostic plots for the fitted regression model.
# The title keeps its two-line layout (explicit "\n") and fixes the
# "WoekingDay" typo.
plot(
  bk_lm,
  col = "gold",
  main = "Linear Regression: Bike Rentals, Working Day, Temp,\nWeather Situation and Season"
)

You might also like