---
title: "DSBAProject Oct 2020"
author: "Stephen"
date: "10/30/2020"
---
```{r}
# Point the session at the project folder and echo the resulting directory.
# NOTE(review): this absolute path only exists on the author's machine —
# confirm whether the data file can be located relative to the document.
setwd("C:/Users/steph/Documents/DSBA")
getwd()
# Read the commuting data set from the working directory.
commute <- read.csv(file = "Cars-dataset.csv")
```
Key Assumptions:

```{r}
library(ggplot2)
library(caTools)
library(corrplot)
library(gridExtra)
library(psych)
library(dplyr)
library(ROCR)
library(janitor)
```
```{r}
# First look at the data: column names, first and last rows, column types.
names(commute)
# View() opens an interactive data viewer, so it is only called when R is
# running in an interactive session.
if (interactive()) {
  View(commute)
}
head(commute)
tail(commute)
str(commute)
```
```{r}
# Work.Exp should be numeric, and Age too. Engineer, MBA, and license are
# class variables and should be factors.
# Coerce the continuous columns to numeric and the class columns to factor,
# then print the structure again to confirm the new types.
commute[c("age", "work_exp")] <-
  lapply(commute[c("age", "work_exp")], as.numeric)
commute[c("engineer", "mba", "license", "gender", "transport")] <-
  lapply(
    commute[c("engineer", "mba", "license", "gender", "transport")],
    as.factor
  )
str(commute)
```
```{r}
# Print per-column summary statistics of commute.
summary(commute)
```
The average age of the employees is 27 years. The data is well distributed
with the youngest staff being 18 years old, while the oldest is 43 years old.
109 of the employees have an MBA, 313 have a degree, while only 85 of them
hold a license.
The average work experience is 5.8 years. Again, this data set does not
suggest there is an outlier, as the mean and median are very close. There are
employees in the organisation without any work experience.
Average annual salary is 15,418. There are outliers, given that the mean and
median are different. The highest earner collects 57,000.
Average distance of employees to the office is 11.29km. The employee with the
shortest distance covers 3.2 km.
In terms of how they transport themselves to the office, 300 of them (71%)
take public Transport to work. Only 35 (8%) drive a car to work.
```{r}
# Check for missing values.
sum(is.na(commute))
# As seen in the summary, and also here above, there is a missing value.
# Since we have a fairly large data set and only one entry is missing, we can
# simply delete the row with the missing value.
commute.m <- na.omit(commute)
summary(commute.m)
# Collapse the transport modes into a two-class target: "car" vs "Not_car".
# The column is turned into character first so new labels can be assigned.
commute.m$transport <- as.character(commute.m$transport)
commute.m$transport[
  commute.m$transport %in% c("2Wheeler", "Public Transport")
] <- "Not_car"
commute.m$transport[commute.m$transport == "Car"] <- "car"
table(commute.m$transport)
```
```{r}
# Plotting charts
# Histogram of age. The column is referenced through the data frame because
# no free-standing `age` object is created in this document.
hist(commute.m$age)
# Horizontal box plots of columns 1, 5, 6 and 7.
# NOTE(review): columns are picked by position — confirm these are the
# continuous variables.
boxplot(commute.m[, c(1, 5, 6, 7)], horizontal = TRUE)
```
```{r}
#Histogram
#barplot
```
```{r}
# Multivariate
# Density of salary, filled by mode of transport. `data = commute.m` lets
# `salary` and `transport` resolve as columns of the data frame; the title,
# which had been split across two source lines, is restored to one line.
# NOTE(review): the x axis shows salary but is labelled "Transport" — confirm.
qplot(
  salary,
  data = commute.m,
  fill = transport,
  geom = "density",
  alpha = I(.8),
  main = "Mode of Transport by Salary Earned",
  xlab = "Transport",
  ylab = "Density"
)
# Pairwise scatter plots of columns 5 to 7.
plot(commute.m[, 5:7])
```
```{r}
#percentile Distribution for the treatment of outliers
#define a function
# Cap and floor one column of a data frame at chosen percentiles.
#
# Arguments:
#   commute.m  data frame holding the column to treat
#   var_name   name (or position) of a single numeric column
#   lower      probability used for the floor (default 0.01)
#   upper      probability used for the cap (default 0.99)
#
# Returns the data frame with values below the floor raised to it and values
# above the cap lowered to it. Missing values are ignored when the percentiles
# are computed and stay missing in the result. Other columns are untouched.
outlier_treatment_fun <- function(commute.m, var_name,
                                  lower = 0.01, upper = 0.99) {
  stopifnot(length(var_name) == 1L, lower <= upper)
  values <- commute.m[[var_name]]
  limits <- quantile(
    values,
    probs = c(lower, upper),
    na.rm = TRUE,
    names = FALSE
  )
  # pmax() applies the floor, pmin() the cap, element by element.
  commute.m[[var_name]] <- pmin(pmax(values, limits[1]), limits[2])
  commute.m
}
# Apply the percentile capping to each continuous variable in turn.
new_vars <- c("age", "work_exp", "salary", "distance")
for (i in new_vars) {
  commute.m <- outlier_treatment_fun(commute.m, i)
}
# NOTE(review): `correlations` was not defined anywhere in the visible file.
# It is computed here from the treated continuous variables so the plot can be
# drawn — confirm this is the matrix that was intended.
correlations <- cor(commute.m[, new_vars])
corrplot(correlations)
```
```{r}
# Column count and names of the cleaned data.
ncol(commute.m)
names(commute.m)
# Convert the data frame to a numeric matrix and correlate every column
# except the 9th.
train.matrix <- data.matrix(frame = commute.m)
corr_mat <- cor(x = train.matrix[, -9])
library(usdm)
```
```{r}
str(commute.m)
# Drop the 5th and 6th columns to build the modelling data set.
filter.commute.m <- commute.m[, -c(5, 6)]
head(filter.commute.m)
# Store the target as a factor again (it was recoded as character earlier).
filter.commute.m$transport <- as.factor(filter.commute.m$transport)
```
```{r}
# Modelling: 75/25 train/test split driven by the target column.
set.seed(1234)
# NOTE: the variable name `sample` is kept as in the original; it masks
# base::sample() as a variable for the rest of the session.
sample <- sample.split(filter.commute.m$transport, SplitRatio = 0.75)
# TRUE/FALSE are used instead of T/F, which are ordinary variables that can be
# reassigned.
train <- subset(filter.commute.m, sample == TRUE)
test <- subset(filter.commute.m, sample == FALSE)
table(filter.commute.m$transport)
str(train)
dim(train)
dim(test)
```
```{r}
# KNN
library(class)
# Predictor columns used for the neighbour search, picked by position.
# NOTE(review): confirm columns 1 and 5 are the intended numeric predictors.
knn_cols <- c(1, 5)
# Classify the test rows once and keep the result. It is good to have an odd
# number of neighbours (k) to avoid a tie situation.
KNN_transport <- knn(
  train[, knn_cols], test[, knn_cols], train$transport,
  k = 5
)
KNN_transport
# Scale the data to see if the performance of the model will be better.
# The test set is standardised with the training means and standard
# deviations so that both sets are measured on the same scale; scaling each
# set with its own statistics would shift the test points relative to the
# training points.
train_scaled <- scale(train[, knn_cols])
test_scaled <- scale(
  test[, knn_cols],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)
KNN_transport2 <- knn(train_scaled, test_scaled, train$transport, k = 5)
table(test$transport, KNN_transport2)
```
```{r}
# In its simple form, naive Bayes runs on data with all categorical predictor
# variables.
library(e1071)
# To predict, a naive Bayes object is needed. To apply it in a situation like
# this one, where there are continuous predictors, we assume normality of the
# data.
NB_transport <- naiveBayes(transport ~ ., train)
NB_transport
# Class probabilities for the training rows, computed once and reused for the
# printout and the plot of the second class's probability.
nb_train_prob <- predict(NB_transport, newdata = train, type = "raw")
nb_train_prob
plot(nb_train_prob[, 2])
```
```{r}
# Logistic regression of transport on all remaining columns of train.
train$transport <- as.factor(train$transport)
str(train)
# Fit once, print the fitted object, then show the coefficient summary.
model <- glm(formula = transport ~ ., family = binomial, data = train)
model
summary(model)
```
```{r}
# Fitted probabilities on the training data.
model$fitted.values
plot(train$transport, model$fitted.values)
# Turn the fitted probabilities into class labels. With a factor response,
# glm() treats the first level as "failure", so a fitted value is the
# probability of the second level.
# NOTE(review): 0.5 is used as the cut-off — confirm the intended threshold.
transport.predicted <- ifelse(
  model$fitted.values > 0.5,
  levels(train$transport)[2],
  levels(train$transport)[1]
)
table(train$transport, transport.predicted) # Confusion matrix
# ROC curve
library(pROC)
plot.roc(train$transport, model$fitted.values)
```
```{r}
# Evaluate the logistic model fitted on the training data against the
# held-out test rows. The earlier version re-fitted a second model (`model2`)
# on the test rows and scored it on those same rows, which does not measure
# out-of-sample performance; `model2` is therefore no longer created.
test.fitted <- predict(model, newdata = test, type = "response")
plot(test$transport, test.fitted)
# Predicted probabilities refer to the second factor level of the target.
# NOTE(review): 0.5 is used as the cut-off — confirm the intended threshold.
transport.predicted <- ifelse(
  test.fitted > 0.5,
  levels(test$transport)[2],
  levels(test$transport)[1]
)
table(test$transport, transport.predicted) # Confusion matrix
# ROC curve
roc(test$transport, test.fitted)
plot.roc(test$transport, test.fitted)
```
```{r}
library(pscl)
```
```{r}
# Print the coefficient summary of the logistic regression model again.
summary(model)
```
```{r}
library(rattle)
# Install a package only when it is missing. An unconditional
# install.packages() downloads the package on every run and stops with an
# error in a non-interactive session that has no CRAN mirror configured.
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost")
}
library(ipred)
if (!requireNamespace("adabag", quietly = TRUE)) {
  install.packages("adabag")
}
library(adabag)
```
```{r}
# NOTE(review): the opening of this call is missing from the file. These
# arguments and the later `trControl = commutectrl` suggest it was
# `commutectrl <- trainControl(...` from caret — restore it before running.
allowParallel = TRUE,
classProbs = TRUE,
summaryFunction=twoClassSummary)
```
```{r}
# Store the target of the training set as character strings.
train$transport=as.character(train$transport)
# NOTE(review): the opening of the call below is missing from the file;
# presumably a caret `train(...)` call fitting the GBM model — restore it.
method = "gbm",
trControl = commutectrl,
verbose = FALSE)
```
# Predict using the trained model & check performance on test set
```{r}
# Confusion matrix of the GBM test predictions against the test labels.
# NOTE(review): `GBM_pred_test` is not created anywhere in the visible file —
# confirm the prediction step was not lost.
confusionMatrix(GBM_pred_test, test$transport)
```
```{r}
# NOTE(review): the opening of this call is missing from the file; presumably
# a caret `train(...)` call fitting the random forest model — restore it.
method = "rf",
ntree = 30,
maxdepth = 5,
tuneLength = 10,
trControl = commutectrl)
```
# Predict using the trained model & check performance on test set
```{r}
# Confusion matrix of the random forest test predictions. The reference must
# be the vector of true classes (as in the GBM evaluation), not the whole
# test data frame.
# NOTE(review): `Rf_pred_test` is not created anywhere in the visible file —
# confirm the prediction step was not lost.
confusionMatrix(Rf_pred_test, test$transport)
```
COMPARING MODELS
```{r}
# Summarise the model comparison object.
# NOTE(review): `compare` is not created anywhere in the visible file —
# presumably a caret resamples() result; confirm.
summary(compare)
```
```{r}
# Box-and-whisker plot of the model comparison.
# NOTE(review): neither `compare` nor `scales` is created in the visible
# file — confirm their definitions were not lost.
bwplot(compare, scales=scales)
```
Conclusion:
- The models developed performed well on both Train and test data.
- The most significant predictors behind their decisions include age and the
distance to be covered.