AMTA - Final Exam: Load the ToyotaCorolla CSV
Code:
df = read.csv(file.choose())
# Missing Value
library(randomForest)
library(missForest)
sapply(df, function(x) sum(is.na(x))) #Used to check and find if the data set has any
missing values
#from the output we understand that, there are no missing values in the given data set.
Henceforth, no need of imputation of values in the data set.
attach(df)
library(caret)
library(e1071)
library(foreach)
library(ggplot2)
library(ISLR)library(ggplot2)
library(ISLR)
i) Scatterplot
Price vs Kilometers (KM)
Code:
# Price Vs KM
plot(Price ~ KM)
Interpretation
Used cars that have travelled fewer kilometers are priced high, whereas cars
that have travelled more kilometers are priced low.
Price Vs Age
Code:
# Price vs Age
plot(Price ~ Age_08_04)
Interpretation
As the age of the car increases, its price decreases, and vice versa.
ii) Multicollinearity
Vif(df)
# variance inflation function should be less than 3 for all the variables of mtcars , check
for the dataset
# In case of multi colinearilty , If F is significant all independent variables become
insignificant
# A high value means alternate hypothesis
iii) Split Data Set
Code:
library(caret)
split <- createDataPartition(Price, p = .30, list = F)
train = df[-split, ] #training the model
test = df[ split, ] #model validation
mean(train$Price)
mean(test$Price)
mean(Price)
lmtrain = lm(Price ~
Age_08_04+KM+Fuel_Type+HP+Met_Color+Automatic+CC+Doors+Quarterly_Tax+Weig
ht, data = train)
v) Significance
Code:
summary(lmtrain)
# age,fuel type, km,hp,quaterly tax and weight are most significant variables when it
comes to predicting price
vi) Performance Validation
Code:
# 1.6 Performance validation set
# prediction interval for test data based on train model
middle = function(a,b,y) {
val= ifelse( y >= a & y < b, 1,0)
return(val)
}
# R square: train
cor(train$Price, trainPrice)^2
# R square: test
cor(test$Price, testPrice)^2
# RMSE
RMSE(train$Price, trainPrice)
RMSE(test$Price, testPrice)
#lower RMSE , better model.
vii) Stepwise Regression
Code:
lm.fit = lm(Price ~
Age_08_04+KM+Fuel_Type+HP+Met_Color+Automatic+CC+Doors+Quarterly_Tax+Weight, data = df)
step.fit = train(Price ~
Age_08_04+KM+Fuel_Type+HP+Met_Color+Automatic+CC+Doors+Quarterly_Tax+Weight, data = df,
method = "lmStepAIC", trace = FALSE)
summary(step.fit)
AIC(lm.fit)
AIC(step.fit$finalModel)
rpart.plot(prunedtree)
# pre pruning
data = df2,
method = "rpart2",
trControl = ctrl.cv,
tree.fit
summary(tree.fit)
plot(tree.fit$finalModel)
rpart.plot(tree.fit$finalModel)
text(tree.fit$finalModel)
tree.fit$finalModel
# RMSE is low at MAXDEPTH – 6
plot(tree.fit$finalModel)
rpart.plot(tree.fit$finalModel)
2.2 Cluster Analysis
Code :
library(ISLR)
library(cluster)
library(caret)
library(leaps)
str(df2)
gower.mat = as.matrix(gowerdist)
class(gower.mat)
pamsol
pamsol$medoids
pamsol$clustering
head(df2)
SIL = NULL
for(i in 2:7)
plot(SIL,type = "l")
pamsol$medoids
head(df)
summary(model)
varImp(model)
The clusters obtained are represented by the medoid rows "58" and "263".