Introduction To Statistical Learning R Labs and Exercises Code

This document provides an overview of simple linear regression, multiple linear regression, and other regression techniques using the Boston housing data set. It demonstrates how to fit linear regression models in R, interpret outputs, analyze residuals and leverage, compare models, and handle different predictor variable types like qualitative variables. Exercises at the end apply these techniques to other data sets and explore additional methods like transformations, interactions and adding polynomial terms.

#3.6.2 Simple Linear Regression
fix(Boston)
names(Boston)
attach(Boston)
#medv response, lstat predictor
lm.fit=lm(medv~lstat)
#basic information
lm.fit
#detailed information
summary(lm.fit)
#pieces of information stored in lm.fit
names(lm.fit)
terms(lm.fit)
residuals(lm.fit)
plot(residuals(lm.fit))
#confidence intervals
confint(lm.fit)
#predict() is used to compute confidence/prediction intervals for a given value of the predictor
predict(lm.fit,data.frame(lstat=c(5,10,15)),interval="confidence")
predict(lm.fit,data.frame(lstat=c(5,10,15)),interval="prediction") #both centered on the same value but the latter is wider
#plot the response (medv), predictor(lstat), and regression line
plot(lstat,medv)
abline(lm.fit)
#diagnostic plots
par(mfrow=c(2,2))
plot(lm.fit)
#residuals() returns the residuals while rstudent() returns the studentized residuals, which we plot against the fitted values [predict(lm.fit)]
plot(predict(lm.fit),residuals(lm.fit))
plot(predict(lm.fit),rstudent(lm.fit))
#leverage statistics
plot(hatvalues(lm.fit))
which.max(hatvalues(lm.fit))
#3.6.3 Multiple Linear Regression
lm.fit=lm(medv~lstat+age,data=Boston)
summary(lm.fit)
lm.fit=lm(medv~.,data=Boston)
summary(lm.fit)
library(car)
#variance inflation factor
vif(lm.fit)
#all predictors except age
lm.fit=lm(medv~.-age,data=Boston)
summary(lm.fit)
#3.6.4 Interaction Terms
#lstat*age=lstat+age+lstat:age <- interaction term
summary(lm(medv~lstat*age,data=Boston))
#3.6.5 Non-Linear Transformation of the Predictors
lm.fit2=lm(medv~lstat+I(lstat^2),data=Boston)
summary(lm.fit2)
#comparing models using anova

anova(lm.fit,lm.fit2)
lm.fit5=lm(medv~poly(lstat,5),data=Boston)
#log transformation of the predictors [useful for heteroscedasticity and other problems]
summary(lm(medv~log(rm),data=Boston))
#3.6.6 Qualitative Predictors
fix(Carseats)
names(Carseats)
#lm with interaction terms
lm.fit=lm(Sales~.+Income:Advertising+Price:Age,data=Carseats) #all predictors plus the two interaction terms
summary(lm.fit)
#contrasts() returns the coding done for dummy vars.
attach(Carseats)
contrasts(ShelveLoc)

################ EXERCISES ##############


attach(Auto)
summary(Auto)
##8
#a
lm.fit=lm(mpg~horsepower,data=Auto)
summary(lm.fit)
#i: Since the F-statistic is far larger than 1 and the p-value of the F-statistic is very small, we can reject the null hypothesis.
#ii: We measure the residual error relative to the response using the mean and the RSE; mean(mpg) is about 23.4 and the RSE of
#    lm.fit is 4.906, giving a relative error of roughly 4.906/23.4*100 = 21%. R-squared is about 0.6, meaning roughly 60% of the
#    variance in mpg is explained by the model.
#iii: The relationship between mpg and horsepower is negative.
#iv.
predict (lm.fit ,data.frame(horsepower=c(98)),interval ="prediction")
predict (lm.fit ,data.frame(horsepower=c(98)),interval ="confidence")
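# A minimal sketch (an addition, not part of the original answer) verifying the quantities quoted in ii above,
# using values stored in the summary object.
rse=summary(lm.fit)$sigma   #residual standard error
rse/mean(mpg)*100           #error relative to the mean response, in percent
summary(lm.fit)$r.squared   #proportion of variance in mpg explained by the model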
#b
plot(horsepower,mpg)
abline(lm.fit)
#c
par(mfrow=c(2,2))
plot(lm.fit)
##9
#a
pairs(Auto)
#b
cor(subset(Auto,select=-name))

#c
lm.fit=lm(mpg~.-name,data=Auto)
summary(lm.fit)
#i. yes: the F-statistic is far from 1 and its p-value is very small
#iii. for each one-unit increase in year, mpg increases by the value of the year coefficient, holding the other predictors fixed
#d
plot(lm.fit)
par(mfrow=c(1,1))
plot(predict(lm.fit),rstudent(lm.fit))
#e
lm.fit1=lm(mpg~cylinders*displacement+displacement:weight)
summary(lm.fit1)
#f
lm.fit=lm(mpg~sqrt(weight)+log(displacement)+I(cylinders^2),data=Auto)
summary(lm.fit)
##10
#a
attach(Carseats)
names(Carseats)
summary(Carseats)
fix(Carseats)
lm.fit=lm(Sales~Price+Urban+US,data=Carseats)
summary(lm.fit)
#b
# as Price increases, Sales decrease; the effect is significant (small p-value)
# the model suggests there is no relationship between the Urban location of the store and Sales
# being located in the US is significant; Sales increase by about 1200 units
#c
#sales=13.04-0.05*price-0.02*urbanYES+1.2*USyes
#d
#for price and usyes
#e
lm.fit2=lm(Sales~Price+US,data=Carseats)
summary(lm.fit)
#f
#similar values for R Squared and RSE
#g
confint(lm.fit2)
#h
plot(predict(lm.fit2),rstudent(lm.fit2)) # all between -3 and 3, so no outliers
plot(lm.fit2) #points that exceed (p+1) / n have high leverage
dim(Carseats)
(2+1)/400
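# A small sketch (an addition) using the (p+1)/n threshold just computed: hatvalues() gives the
# leverage statistics for lm.fit2, and we count/locate the points that exceed the threshold.
sum(hatvalues(lm.fit2)>(2+1)/400)
which.max(hatvalues(lm.fit2))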
##11
#a
set.seed(1)
x=rnorm(100)
y=2*x+rnorm(100)
lm.fit=lm(y~x+0)
summary(lm.fit)
#b

lm.fit1=lm(x~y+0)
summary(lm.fit1)
#c
plot(x,y)
abline(lm.fit)
abline(lm.fit1)
#d
sqrt(length(x)-1)*sum(x*y)/sqrt(sum(x^2)*sum(y^2)-sum(x*y)^2)
# t statistic computed by the formula above
#f
lm.fit1=lm(x~y)
lm.fit2=lm(y~x)
summary(lm.fit1)
summary(lm.fit2)

##15
#a
library(MASS)
attach(Boston)
names(Boston)
lm.fit1=lm(crim~zn,data=Boston)
summary(lm.fit1)
#...
#b
lm.fit.all=lm(crim~.,data=Boston)
summary(lm.fit.all) # zn, age, dis, black, medv
################################################################################
#######################################################
#the chapter with the boxplot summary, then the lab and exercises
#Logistic regression, LDA, QDA, KNN
################################################################################
######################################################
# PAGE 151. !!
#Both logistic regression and LDA produce linear decision boundaries. The only difference lies in
# the fact that B0 and B1 are estimated using maximum likelihood, whereas c0 and c1 are computed
# using the estimated mean and variance from a normal distribution.
#LDA assumes that the observations are drawn from a Gaussian distribution with a common covariance
# matrix in each class, and provides improvements over logistic regression when this assumption
# holds. When the Gaussian assumptions are not met, logistic regression outperforms LDA.
#On the other hand, KNN is expected to dominate logistic regression and LDA when the boundary is
# highly non-linear, because it is non-parametric and makes no assumption about the boundary's
# shape. KNN does not tell us which predictors are important and does not give a coefficient table.
#QDA serves as an intermediary between LDA/logistic regression and KNN because it assumes a
# quadratic decision boundary, and is therefore more flexible.
# ##############################################################################
#######################################################
# Scenario 1 (linear): 20 observations in each class; uncorrelated random normal predictors with a
#   different mean in each class. LDA performed best because it assumes exactly this setting, with
#   logistic regression close behind because it is similar. KNN performed poorly because its
#   increase in variance was not offset by a reduction in bias. QDA performed worse than LDA and
#   logistic regression but better than KNN, because it assumes a quadratic boundary.
# Scenario 2 (linear): same as Scenario 1 but with a correlation of -0.5 between the variables.
#   Same ordering of results.
# Scenario 3 (linear): x1 and x2 drawn from the t-distribution, with 50 observations per class.
#   Because the t-distribution yields more extreme values, even though the decision boundary is
#   still linear, the setup violates the assumptions of LDA (which assumes observations are drawn
#   from a normal distribution). Logistic regression performed best, QDA worst.
# Scenario 4 (non-linear): normal distributions with a correlation of 0.5 in the first class and
#   -0.5 in the second. This corresponds to the QDA assumptions, so QDA outperformed the others.
# Scenario 5 (non-linear): two classes with uncorrelated normal predictors; the responses were
#   sampled from the logistic function using x1^2, x2^2 and x1*x2 as predictors. This corresponds
#   to a quadratic boundary, so QDA performed best, followed by KNN-CV, with the linear methods
#   performing worst.
# Scenario 6 (non-linear): same as Scenario 5 but sampled from a more complicated non-linear
#   function. KNN-CV performed best, followed by QDA and then the linear methods.
## Using transformations of the predictors, we can create more flexible versions of these methods:
## e.g. a more flexible version of logistic regression uses x^2, x^3, etc. as predictors.
################################################################################
######################################################
#4.6 Lab: Logistic Regression, LDA, QDA, and KNN

library(ISLR)
names(Smarket)
dim(Smarket)
summary(Smarket)
pairs(Smarket)
cor(Smarket)
cor(Smarket[,-9])
attach(Smarket)
plot(Volume)
#4.6.2 Logistic Regression
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial)
summary(glm.fit)
coef(glm.fit)
summary(glm.fit)$coef

#no newdata argument supplied, so predict() returns probabilities for the training data


glm.probs=predict(glm.fit,type="response")
glm.probs[1:10]
#see how the qualitative var is split
contrasts(Direction)
#convert the probabilities into class labels Up/Down using a 0.5 threshold
glm.pred=rep("Down",dim(Smarket)[1])
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction)
(145+507)/(145+507+457+141)
mean(glm.pred==Direction)
#create a train set and a test set
train=(Year<2005)
Smarket.2005=Smarket[!train,]
dim(Smarket.2005)
Direction.2005=Direction[!train]
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial,subset=train)
glm.probs=predict(glm.fit,Smarket.2005,type="response")#Smarket.2005 test set
dim(Smarket[!train,])
glm.pred=rep("Down",252)
glm.probs[glm.pred>.5]="Up"
table(glm.pred,Direction.2005)
mean(glm.pred==Direction.2005)
mean(glm.pred!=Direction.2005)
#we remove variables that are not helpful, because keeping them increases variance without a compensating reduction in bias
glm.fit=glm(Direction~Lag1+Lag2,data=Smarket,subset = train,family=binomial)
glm.probs=predict(glm.fit,Smarket.2005,type="response")
glm.pred=rep("Down",252)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction.2005)
mean(glm.pred==Direction.2005)
#predict value of Direction for values of the two vars
predict(glm.fit,newdata=data.frame(Lag1=c(1.2,1.5),Lag2=c(1.1,-0.8)),type="response")
#4.6.3 LDA
library(MASS)
lda.fit=lda(Direction~Lag1+Lag2,data=Smarket,subset=train)
lda.fit
#group means are the averages of each predictor within each class
#they suggest that the previous days' returns tend to be negative when the market goes up
# and positive when the market goes down
#the coefficients are used to form the linear combination that defines the LDA decision rule
#predict() returns three elements:
#   class = LDA's prediction about the movement of the market
#   posterior = the k-th column contains the posterior probability that the observation belongs to the k-th class
#   x = the linear discriminants
lda.pred=predict(lda.fit,Smarket.2005)
names(lda.pred)
lda.pred$class
lda.pred$posterior
lda.pred$x

lda.class=lda.pred$class
table(lda.class,Direction.2005)
mean(lda.class==Direction.2005)
#apply a 50% threshold to the posterior probabilities to recreate the predictions in lda.pred$class
sum(lda.pred$posterior[,1]>=.5)
sum(lda.pred$posterior[,1]<.5)
lda.pred$posterior[1:20,1]
lda.class[1:20]
#different probability threshold
sum(lda.pred$posterior[,1]>.9)
#4.6.4 QDA
library(MASS)
qda.fit=qda(Direction~Lag1+Lag2,data=Smarket,subset=train)
qda.fit #contains the group means but not the coefficients, because QDA involves a quadratic function of the predictors
qda.class=predict(qda.fit,Smarket.2005)$class #same approach as with LDA
table(qda.class,Direction.2005)
(30+121)/(30+121+81+20)
mean(qda.class==Direction.2005)
#4.6.5 KNN
#knn() requires four inputs:
# a matrix containing the predictors associated with the training data, labeled train.x
# a matrix containing the predictors associated with the test data, labeled test.x
# a vector containing the class labels for the training observations, labeled train.Direction
# a value for K, the number of nearest neighbours
library(class)
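# A minimal sketch (an addition, following the inputs described above) applying knn() to the
# Smarket train/test split created earlier, with Lag1 and Lag2 as predictors.
train.x=cbind(Lag1,Lag2)[train,]
test.x=cbind(Lag1,Lag2)[!train,]
train.Direction=Direction[train]
set.seed(1)
knn.pred=knn(train.x,test.x,train.Direction,k=1)
table(knn.pred,Direction.2005)
mean(knn.pred==Direction.2005)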
#4.6.6 Application to caravan insurance data
attach(Caravan)
standardized.x=scale(Caravan[,-86])
test=1:1000
train.x=standardized.x[-test,]
test.x=standardized.x[test,]
train.y=Purchase[-test]
test.y=Purchase[test]
set.seed(1)
knn.pred=knn(train.x,test.x,train.y,k=1)
mean(test.y!=knn.pred)
mean(test.y!="No")
table(knn.pred,test.y)
#log reg with .25 threshold
glm.fit=glm(Purchase~.,data=Caravan,family=binomial,subset=-test)
glm.probs=predict(glm.fit,Caravan[test,],type="response")
glm.pred=rep("No",1000)
glm.pred[glm.probs>.25]="Yes"
table(glm.pred,test.y)
#4.7 Exercises

#9
library(ISLR)
attach(Weekly)
#a
pairs(Weekly) #year and volume
cor(Weekly)
summary(Weekly)
cor(Weekly[,-9])
#b
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Weekly,family=binomial)
summary(glm.fit) #lag2
#c
glm.probs=predict(glm.fit,type="response")
glm.pred=rep("Down",length(glm.probs))
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction)
(54+557)/(54+557+48+430)
mean(glm.pred==Direction)
557/(48+557) #up
54/(54+430) #down
#d
train=(Year<2009)
Weekly.test=Weekly[!train,]
glm.fit=glm(Direction~Lag2,data=Weekly,subset=train,family=binomial)
glm.probs=predict(glm.fit,Weekly.test,type="response")
glm.pred=rep("Down",length(glm.probs))
glm.pred[glm.probs>.5]="Up"
Direction.test=Direction[!train]
table(glm.pred,Direction.test)
train = (Year < 2009)
Weekly.0910 = Weekly[!train, ]
glm.fit = glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
glm.probs = predict(glm.fit, Weekly.0910, type = "response")
glm.pred = rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] = "Up"
Direction.0910 = Direction[!train]
table(glm.pred, Direction.0910)
(9+56)/(9+56+34+5)
mean(glm.pred==Direction.0910)
#e
library(MASS)
lda.fit=lda(Direction~Lag2,data=Weekly,subset=train)
lda.pred=predict(lda.fit,Weekly.0910)
lda.class=lda.pred$class
table(lda.class,Direction.0910)
(9+56)/(9+5+34+56)
mean(lda.class==Direction.0910)
#f
qda.fit=qda(Direction~Lag2,data=Weekly,subset = train)
qda.class=predict(qda.fit,Weekly.0910)$class
table(qda.class,Direction.0910)
mean(qda.class==Direction.0910)
#g
library(class)

train.x=as.matrix(Lag2[train])
test.x=as.matrix(Lag2[!train])
train.direction=Direction[train]
set.seed(1)
knn.pred=knn(train.x,test.x,train.direction,k=1)
table(knn.pred,Direction.0910)
(21+31)/(21+31+22+30)
mean(knn.pred==Direction.0910)
#h
#logistic regression and LDA give the best (identical) test accuracy, about 62.5%
#i

library(class)
train.x=as.matrix(Lag2[train])
test.x=as.matrix(Lag2[!train])
train.direction=Direction[train]
set.seed(1)
knn.pred=knn(train.x,test.x,train.direction,k=15)
mean(knn.pred==Direction.0910)
#11
#a
attach(Auto)
dim=dim(Auto)[1]
mpg01=rep(0,dim)
mpg01
mpg01[mpg>median(mpg)]=1
mpg01
Auto=data.frame(Auto,mpg01)
fix(Auto)
#b
summary(Auto)
cor(Auto[,-9]) #drop the qualitative name column
pairs(Auto)
#c
train=(year%%2==0)
test=!train
Auto.train=Auto[train,]
Auto.test=Auto[test,]
mpg01.test=mpg01[test]
#cylinders + weight + displacement + horsepower
#d
library(MASS)
lda.fit=lda(mpg01~cylinders+weight+displacement+horsepower,data=Auto,subset=train)
lda.pred=predict(lda.fit,Auto.test)
mean(lda.pred$class!=mpg01.test) #about 12% test error rate
#e
qda.fit=qda(mpg01~cylinders+weight+displacement+horsepower,data=Auto,subset=train)
qda.class=predict(qda.fit,Auto.test)$class
mean(qda.class!=mpg01.test) #about 13% test error rate


#f
glm.fit=glm(mpg01~cylinders+weight+displacement+horsepower,data=Auto,family=binomial,subset=train)
glm.probs=predict(glm.fit,Auto.test,type="response")
glm.pred=rep(0,length(glm.probs))
glm.pred[glm.probs>.5]=1
mean(glm.pred!=mpg01.test)
#g
library(class)
train.x=cbind(cylinders,weight,displacement,horsepower)[train,]
test.x=cbind(cylinders,weight,displacement,horsepower)[test,]
train.mpg=mpg01[train]
knn.pred=knn(train.x,test.x,train.mpg,k=1)
mean(knn.pred!=mpg01.test)#15
knn.pred=knn(train.x,test.x,train.mpg,k=10)
mean(knn.pred!=mpg01.test)
knn.pred=knn(train.x,test.x,train.mpg,k=20)
mean(knn.pred!=mpg01.test)
knn.pred=knn(train.x,test.x,train.mpg,k=100)
mean(knn.pred!=mpg01.test)#14 best
#12
#a
power=function()
{2^3}
print(power())
#b
power2=function(x,a)
{x^a}
power2(2,3)
power2(3,8)
#c
#d
power3=function(x,a)
{
result=x^a
return(result)
}
power3(2,4)
#e
x=1:10
plot(x,power3(x,2))
#f
PlotPower = function(x, a) {
plot(x, power3(x, a))
}
PlotPower(1:10, 3)

#13
library(MASS)
attach(Boston)
summary(Boston)
#??
################################################################################
#################################
#5.3.1 Validation Set Approach
library(ISLR)
set.seed(1)
train=sample(392,196)
train
attach(Auto)
lm.fit=lm(mpg~horsepower,data=Auto,subset=train)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
lm.fit2=lm(mpg~poly(horsepower,2),data=Auto,subset=train)
mean((mpg-predict(lm.fit2,Auto))[-train]^2)
lm.fit3=lm(mpg~poly(horsepower,3),data=Auto,subset=train)
mean((mpg-predict(lm.fit3,Auto))[-train]^2)
set.seed(2)
train=sample(392,196)
lm.fit=lm(mpg~horsepower,data=Auto,subset=train)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
lm.fit2=lm(mpg~poly(horsepower,2),data=Auto,subset=train)
mean((mpg-predict(lm.fit2,Auto))[-train]^2)
lm.fit3=lm(mpg~poly(horsepower,3),data=Auto,subset=train)
mean((mpg-predict(lm.fit3,Auto))[-train]^2)
#5.3.2 LOOCV
#we use glm() rather than lm() because we can then use cv.glm() for cross-validation
glm.fit=glm(mpg~horsepower,data=Auto)
coef(glm.fit)
lm.fit=lm(mpg~horsepower,data=Auto)
coef(lm.fit)
#they are the same
library(boot)
glm.fit=glm(mpg~horsepower,data=Auto)
cv.err=cv.glm(Auto,glm.fit)
cv.err$delta
#delta contains the cross-validation results: the first component is the raw CV estimate of prediction error, the second is a bias-corrected version
#for loop for polynomial fits
cv.error=rep(0,5)
for(i in 1:5)
{glm.fit=glm(mpg~poly(horsepower,i),data=Auto)
cv.error[i]=cv.glm(Auto,glm.fit)$delta[1]
}
cv.error

#5.3.3 K-fold Cross-Validation


set.seed(17)
cv.error.10=rep(0,10)
for(i in 1:10)
{
glm.fit=glm(mpg~poly(horsepower,i),data=Auto)
cv.error.10[i]=cv.glm(Auto,glm.fit,K=10)$delta[1]
}
cv.error.10
#5.3.4 The Bootstrap
alpha.fn=function(data,index)
{
X=data$X[index]
Y=data$Y[index]
return( (var(Y)-cov(X,Y)) / (var(X)+var(Y)-2*cov(X,Y)) )
}
alpha.fn(Portfolio,1:100)
#this is automated using the boot() function
boot(Portfolio,alpha.fn,R=1000)
#create a function that takes a set and indices and returns slope and intercept
boot.fn=function(data,index)
{
return(coef(lm(mpg~horsepower,data=data,subset=index)))
}
boot.fn(Auto,1:392)
boot.fn(Auto,sample(392,392,replace=T))
boot(Auto,boot.fn,1000)
summary(lm(mpg~horsepower,data=Auto))$coef #compare with the bootstrap standard errors
boot.fn=function(data,index)
{ coefficients(lm(mpg~horsepower+I(horsepower^2),data=data,subset=index)) }
set.seed (1)
boot(Auto ,boot.fn ,1000)
##5.4 exercises
#5
#a
library(ISLR)
attach(Default)
glm.fit=glm(default~income+balance,data=Default,family = binomial)
#b
FiveB=function()
{
train=sample(dim(Default)[1],dim(Default)[1]/2)
glm.fit=glm(default~income+balance,data=Default,family=binomial,subset=train)
glm.pred=rep("No",dim(Default)[1]/2)
glm.probs=predict(glm.fit,Default[-train,],type="response")
glm.pred[glm.probs>0.5]="Yes"
return( mean(glm.pred!=Default[-train,]$default) )
}
FiveB()
#c
#d
FiveB=function()
{
train=sample(dim(Default)[1],dim(Default)[1]/2)
glm.fit=glm(default~income+balance+student,data=Default,family=binomial,subset=train)
glm.pred=rep("No",dim(Default)[1]/2)
glm.probs=predict(glm.fit,Default[-train,],type="response")
glm.pred[glm.probs>0.5]="Yes"
return( mean(glm.pred!=Default[-train,]$default) )
}
FiveB()
#6
#a
library(ISLR)
attach(Default)
set.seed(1)
glm.def=glm(default~income+balance,data=Default,family = binomial)
summary(glm.def)
#b
boot.fn = function(data, index) return(coef(glm(default ~ income + balance, data = data, family = binomial, subset = index)))

#c
library(boot)
boot(Default,boot.fn,50)
#d
#same
#8
#a
set.seed (1)
y=rnorm (100)
x=rnorm (100)
y=x-2* x^2+ rnorm (100)
#b
plot(x,y)
#c
Data=data.frame(x,y)
glm.fit=glm(y~x,data=Data)
cv.glm(Data,glm.fit)$delta
glm.fit=glm(y~poly(x,2),data=Data)
cv.glm(Data,glm.fit)$delta
#d
set.seed(2)
Data=data.frame(x,y)
glm.fit=glm(y~x,data=Data)
cv.glm(Data,glm.fit)$delta
glm.fit=glm(y~poly(x,2),data=Data)
cv.glm(Data,glm.fit)$delta
#9
#a
attach(Boston)
################################################################################
###################################
#6.5 Lab 1: Subset Selection Methods
library(ISLR)
attach(Hitters)
sum(is.na(Hitters))
Hitters=na.omit(Hitters)
sum(is.na(Hitters))
#regsubsets() used for best subset selection using RSS [same syntax as lm]
library(leaps)
regfit.full=regsubsets(Salary~.,data=Hitters)
summary(regfit.full)
#the function reports the best model up to eight variables, but the number of variables can be changed using nvmax=...
regfit.full=regsubsets(Salary~.,data=Hitters,nvmax = 19)
reg.summary=summary(regfit.full)
reg.summary
#we can use R squared, RSS, adjusted R squared, Cp and BIC to assess the model
names(reg.summary)
reg.summary$rsq
#plot RSS, adjusted R squared, Cp and BIC at once to assess the model
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab="no of var",ylab="RSS",type="l")
plot(reg.summary$adjr2,xlab="no of var",ylab="adj r sq",type="l")
which.max (reg.summary$adjr2)
points(11,reg.summary$adjr2[11],col="red",cex=2,pch=20) #mark the best point on the plot
plot(reg.summary$cp,xlab="no of var",ylab="CP",type="l")
which.min(reg.summary$cp)
points(10,reg.summary$cp[10],col="blue",cex=3,pch=21)
plot(reg.summary$bic,xlab="no of var",ylab="BIC",type="l")
which.min(reg.summary$bic)
points(6,reg.summary$bic[6],col="blue",cex=3,pch=21)
plot(regfit.full,scale="r2")
plot(regfit.full,scale="adjr2")
plot(regfit.full,scale="Cp")
plot(regfit.full,scale="bic")
coef(regfit.full,6)
coef(regfit.full,8)
#6.5.2 Forward and Backward Stepwise Selection
#the best models with up to six variables are identical for best subset and forward selection; the seven-variable models differ (see the coef() calls below)
#forward selection
regfit.fwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="forward")
summary(regfit.fwd)
#backward selection
regfit.bwd=regsubsets(Salary~.,data=Hitters,nvmax = 19,method="backward")
summary(regfit.bwd)
coef(regfit.full,7)
coef(regfit.fwd,7)
coef(regfit.bwd,7)
#6.5.3 Choosing Among Models Using the Validation Set Approach and Cross-Validation
set.seed(1)
train=sample(c(TRUE,FALSE),nrow(Hitters),rep=T)
train
test=!train
#now apply regsubsets() to perform best subset selection
regfit.best=regsubsets(Salary~.,data=Hitters[train,],nvmax=19)
#we now compute MSE for the best model of each size
# to do this we run a loop, and for each size i we extract the coefficients from regfit.best for the model of that size,
# multiply them into the appropriate columns of the test model matrix [model.matrix()], THEN compute the MSE
test.mat=model.matrix(Salary~.,data=Hitters[test,])
val.errors=rep(NA,19)
for (i in 1:19)
{
coefi=coef(regfit.best,id=i)
pred=test.mat[,names(coefi)]%*%coefi
val.errors[i]=mean( (Hitters$Salary[test]-pred)^2
)
}
val.errors
which.min(val.errors)
coef(regfit.full,10)
####################
coefi=coef(regfit.best,id=2)
coefi
names(coefi)
pred=test.mat[,names(coefi)]%*%coefi
a<-test.mat[,names(coefi)]
fix(a)
fix(pred)
#####################

#finally we perform best subset selection on the full data set, and select the best 10-variable model!
reg.full=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(reg.full,10)
##CV
#we create a vector that allocates each observation to one of k=10 folds, and create a matrix to store the results
k=10
set.seed(1)
folds=sample(1:k,nrow(Hitters),replace=T)
folds
cv.errors=matrix(NA,k,19,dimnames = list(NULL,paste(1:19)))
cv.errors
#
cv.errors=matrix(NA,k,19)
cv.errors
#
#we write a loop that performs cross validation.
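# regsubsets objects have no built-in predict() method, so the loop below relies on a
# predict.regsubsets() helper; this is the same function that is defined near the end of this
# script (in the chapter 6 exercises), repeated here so the loop runs as written.
predict.regsubsets=function(object,newdata,id,...){
form=as.formula(object$call[[2]])
mat=model.matrix(form,newdata)
coefi=coef(object,id=id)
mat[,names(coefi)]%*%coefi
}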
for(j in 1:k)
{
best.fit=regsubsets(Salary~.,data=Hitters[folds!=j,],nvmax=19)
for(i in 1:19)
{
pred=predict(best.fit,Hitters[folds==j,],id=i)
cv.errors[j,i]=mean((Hitters$Salary[folds==j]-pred )^2)
}
}
#
#
cv.errors
best.fit=regsubsets(Salary~.,data=Hitters,nvmax=19) #refit best subset selection on the full data set
coef(best.fit,11)
##6.6 Lab 2: Ridge Regression and the Lasso
#we use the glmnet() function to fit ridge and lasso models; the function needs x as a matrix and y as a vector
#we remove missing values
x=model.matrix(Salary~.,Hitters)[,-1]
y=Hitters$Salary

#model.matrix() produces a matrix corresponding to the 19 predictors and also turns all qualitative
# variables into dummy variables, because glmnet() takes only quantitative inputs
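# A quick check (an addition): x should have 263 rows and 19 columns, with factors such as League
# expanded into dummy variables.
dim(x)
colnames(x)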
#6.6.1 RIDGE
#glmnet() has an arg. alpha that for 0 performs ridge and lasso for 1
library(glmnet)
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x,y,alpha=0,lambda=grid)
#glmnet() performs ridge regression for an automatically selected range of lambda; here we have
# chosen instead to supply a grid of values ranging from the null model (intercept only) to the
# least squares fit
# !! [a very high value of lambda = null model / lambda=0 is least squares (add the argument exact=T)]
#with each value of lambda there is a vector of ridge regression coefficients, stored in a matrix;
# here it is a 20x100 matrix, with 20 rows (one for each predictor plus the intercept) and 100
# columns (one for each value of lambda)
dim(coef(ridge.mod))
#the coefficient estimates are much smaller in terms of their l2 norm when a larger value of lambda
# is used, and bigger when a small value of lambda is used
ridge.mod$lambda[50]
coef(ridge.mod)[,50]
ridge.mod$lambda[60]
coef(ridge.mod)[,60]
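# A small sketch (an addition, following the book's approach) computing the l2 norm of the
# coefficients, excluding the intercept, to illustrate the shrinkage: the norm is smaller for the
# larger lambda (index 50) than for the smaller lambda (index 60).
sqrt(sum(coef(ridge.mod)[-1,50]^2))
sqrt(sum(coef(ridge.mod)[-1,60]^2))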
#we can use the predict function to obtain ridge regression coefficients for a new value of lambda, say 50
predict(ridge.mod,s=50,type="coefficients")[1:20,]
# we split the data into a training set and a test set; there are two methods to do this:
# 1. produce a random vector of TRUE and FALSE, and select the elements corresponding to TRUE for the training data
# 2. randomly choose a subset of numbers between 1 and n and use them as indices for the training observations
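# A minimal sketch of method 1 (an addition; the names train.bool, x.tr, y.tr are illustrative and
# are not used below, where method 2 is applied instead):
train.bool=sample(c(TRUE,FALSE),nrow(x),replace=TRUE)
x.tr=x[train.bool,]
y.tr=y[train.bool]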
set.seed(1)
train=sample(1:nrow(x),nrow(x)/2)
train
test=-train
y.test=y[test]
#we fit a ridge regression on the training set and evaluate it on the test set, using lambda=4
# in the predict() call we replace type="coefficients" with the newx argument
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid,thresh = 1e-12)
ridge.pred=predict(ridge.mod,s=4,newx = x[test,])
mean( (ridge.pred-y.test)^2 )
#fit a model with just the intercept (very large value of lambda 1e10 = 10^10)
ridge.pred=predict(ridge.mod,s=1e10,newx=x[test,])
mean ( (ridge.pred-y.test)^2 )
# lambda = 4 leads to much lower MSE than just an intercept
# we now check whether there is any benefit to using lambda=4 instead of a least squares regression

ridge.pred=predict(ridge.mod,s=0,newx=x[test,],exact=T)
mean ( (ridge.pred-y.test)^2 )
predict(ridge.mod,s=0,exact=T,type="coefficients")[1:20,]
#instead of choosing lambda by hand we can use cross-validation with cv.glmnet(), which performs
# ten-fold validation [the number of folds can be changed using the nfolds argument]
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam
#212 is best lambda value
#MSE for this value of lambda is
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean( (ridge.pred-y.test)^2 )
#fit model on full data set and get coeffs
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:20,]
## 6.6.2 The Lasso
library(glmnet)
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda = grid)
plot(lasso.mod)
#Cv lasso
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)
bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean(
(lasso.pred-y.test)^2
)
#apply on full set
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:20,]
lasso.coef
#6.7 Lab 3: PCR and PLS Regression
#6.7.1 Principal Components Regression
library(pls)
set.seed(2)
pcr.fit=pcr(Salary~.,data=Hitters,scale=T,validation="CV") #scale=T standardises the predictors; validation="CV" gives ten-fold CV
summary(pcr.fit) #38.31% of the variance in the predictors is explained by the first component, ...
## !!!!! pcr reports the root mean squared error, so we have to square this quantity to obtain the usual MSE
validationplot(pcr.fit,val.type="MSE") #CV scores
#perform PCR on the training data and evaluate its test performance
set.seed(1)
pcr.fit=pcr(Salary~.,data=Hitters,scale=T,subset=train,validation="CV")
validationplot(pcr.fit,val.type = "MSEP")
pcr.pred=predict(pcr.fit,x[test,],ncomp = 7)
mean( (pcr.pred-y.test)^2 )

#pcr on the full data set


pcr.fit=pcr(y~x,scale=T,ncomp=7)
summary(pcr.fit)
#6.7.2 Partial Least Squares
set.seed(1)
pls.fit=plsr(Salary~.,data=Hitters,subset=train,scale=T,validation="CV")
summary(pls.fit)
#lowest MSE when M=2
pls.pred=predict(pls.fit,x[test,],ncomp=2)
mean( (pls.pred-y.test)^2 )
#PLS on full data set with M=2
pls.fit=plsr(Salary~.,data=Hitters,subset=train,scale=T,ncomp=2)
summary(pls.fit)
###### EXERCISES ###############
##8
#a
set.seed(1)
x=rnorm(100)
eps=rnorm(100)
x
eps
#b: beta0=3, beta1=2, beta2=-3 and beta3=0.3
beta0=3
beta1=2
beta2=-3
beta3=0.3
y=beta0+beta1*x+beta2*x^2+beta3*x^3+eps
#c
data=data.frame(x,y)
fix(data)
library(leaps)
regfit.full=regsubsets(y~poly(x,10,raw=T),data=data,nvmax = 10)
reg.summary=summary(regfit.full)
#adj r sq
plot(reg.summary$adjr2,xlab="no of var",ylab="adj r sq",type="l")
which.max(reg.summary$adjr2)
points(3,reg.summary$adjr2[3],cex=4,pch=33)
#CP
plot(reg.summary$cp,xlab="no of var",ylab="cp",type="l")
which.min(reg.summary$cp)
points(3,reg.summary$cp[3],cex=4,pch=33)
#BIC
plot(reg.summary$bic,xlab="no of var",ylab="BIC",type="l")
which.min(reg.summary$bic)
points(3,reg.summary$bic[3],cex=4,pch=33)
coefficients(regfit.full,id=3)
#d
regfit.fwd=regsubsets(y~poly(x,10,raw=T),data=data,nvmax=10,method="forward")
regfit.sum=summary(regfit.fwd)
#adj r sq
plot(regfit.sum$adjr2,xlab="no of var",ylab="adj r sq",type="l")
which.max(regfit.sum$adjr2)

points(3,regfit.sum$adjr2[3],cex=4,pch=33)
#CP
plot(regfit.sum$cp,xlab="no of var",ylab="CP",type="l")
which.min(regfit.sum$cp)
points(3,regfit.sum$cp[3],cex=4,pch=33)
#BIC
plot(regfit.sum$bic,xlab="no of var",ylab="bic",type="l")
which.min(regfit.sum$bic)
points(3,regfit.sum$bic[3],cex=4,pch=33)
coefficients(regfit.fwd,id=3)
regfit.bwd=regsubsets(y~poly(x,10,raw=T),data=data,nvmax=10,method="backward")
regfit.sum1=summary(regfit.bwd)
#adj r sq
plot(regfit.sum1$adjr2,xlab="no of var",ylab="adj r sq",type="l")
which.max(regfit.sum1$adjr2)
points(4,regfit.sum1$adjr2[4],cex=4,pch=33)
#CP
plot(regfit.sum1$cp,xlab="no of var",ylab="CP",type="l")
which.min(regfit.sum1$cp)
points(3,regfit.sum1$cp[3],cex=4,pch=33)
#BIC
plot(regfit.sum1$bic,xlab="no of var",ylab="bic",type="l")
which.min(regfit.sum1$bic)
points(3,regfit.sum1$bic[3],cex=4,pch=33)
coefficients(regfit.bwd,id=3)
coefficients(regfit.bwd,id=4)
#e
library(glmnet)
xmat=model.matrix(y~poly(x,10,raw=T),data=data)[,-1]
fix(data)
cv.out=cv.glmnet(xmat,y,alpha=1)
best.lam=cv.out$lambda.min
plot(cv.out)
#fit model on the full data using best lambda found
cv.out=cv.glmnet(xmat,y,alpha=1)
predict(cv.out,s=best.lam,type="coefficients")
#f
beta7 = 7
y = beta0 + beta7 * x^7 + eps
data=data.frame(x,y)
regfit.full=regsubsets(y~poly(x,10,raw=T),data=data,nvmax = 10)
reg.summary=summary(regfit.full)
which.max(reg.summary$adjr2)
which.min(reg.summary$cp)
which.min(reg.summary$bic)
coefficients(regfit.full,id=4)
coefficients(regfit.full,id=2)
coefficients(regfit.full,id=1)

xmat=model.matrix(y~poly(x,10,raw=T),data=data)[,-1]
mod.lasso=cv.glmnet(xmat,y,alpha=1)
best.lam=mod.lasso$lambda.min
best.lam
best.model=cv.glmnet(xmat,y,alpha=1)
predict(best.model,s=best.lam,type="coefficients")
##9
#a
library(ISLR)
rm(College)
fix(College)
sum(is.na(College))
set.seed(11)
train.size=dim(College)[1]/2
train=sample(1:dim(College)[1],train.size)
dim(College)
train.size
dim(College)[1]
college.train=College[train,]
college.test=College[-train,]
#b
lm.fit=lm(Apps~.,data=college.train)
lm.pred=predict(lm.fit,college.test)
mean( (college.test[,"Apps"]-lm.pred)^2 )
#c
library(glmnet)
train.mat=model.matrix(Apps~.,data=college.train)
test.mat=model.matrix(Apps~.,data=college.test)
grid=10^seq(4,-2,length=100)
ridge.mod=cv.glmnet(train.mat,college.train[,"Apps"],alpha=0,lambda=grid,thresh=1e-12)
lambda.best=ridge.mod$lambda.min
lambda.best
ridge.pred=predict(ridge.mod,s=lambda.best,newx=test.mat)
mean( (college.test[,"Apps"]-ridge.pred)^2 )

#d
mod.lasso=cv.glmnet(train.mat,college.train[,"Apps"],alpha=1,lambda=grid,thresh=1e-12)
lambda.best=mod.lasso$lambda.min
lambda.best
lasso.pred=predict(mod.lasso,newx=test.mat,s=lambda.best)
mean( (college.test[,"Apps"]-lasso.pred)^2 )
#coefs
mod.lasso=glmnet(model.matrix(Apps~.,data=College),College[,"Apps"],alpha=1)
predict(mod.lasso,s=lambda.best,type="coefficients")
#e
library(pls)
pcr.fit=pcr(Apps~.,data=college.train,scale=T,validation="CV")
validationplot(pcr.fit,val.type="MSEP")
pcr.pred=predict(pcr.fit,college.test,ncomp=10)
mean( (college.test[,"Apps"]-data.frame(pcr.pred))^2 )
#f
pls.fit=plsr(Apps~.,data=college.train,scale=T,validation="CV")
validationplot(pls.fit,val.type="MSEP")
pls.pred=predict(pls.fit,college.test,ncomp=10)
mean( (college.test[,"Apps"]-data.frame(pls.pred))^2 )
#g
test.avg=mean(college.test[,"Apps"])
lm.test.r2=1-mean((college.test[,"Apps"]-lm.pred)^2)/mean((college.test[,"Apps"]-test.avg)^2)
ridge.test.r2=1-mean((college.test[,"Apps"]-ridge.pred)^2)/mean((college.test[,"Apps"]-test.avg)^2)
lasso.test.r2=1-mean((college.test[,"Apps"]-lasso.pred)^2)/mean((college.test[,"Apps"]-test.avg)^2)
pcr.test.r2=1-mean((college.test[,"Apps"]-data.frame(pcr.pred))^2)/mean((college.test[,"Apps"]-test.avg)^2)
pls.test.r2=1-mean((college.test[,"Apps"]-data.frame(pls.pred))^2)/mean((college.test[,"Apps"]-test.avg)^2)
barplot(c(lm.test.r2,ridge.test.r2,lasso.test.r2,pcr.test.r2,pls.test.r2),col="red",names.arg=c("OLS","Ridge","Lasso","PCR","PLS"),main="Test R-squared")
##10
#a
set.seed(1)
n=1000
p=20
x=matrix(rnorm(n*p),n,p)
x
b=rnorm(p)
b[3]=0
b[4]=0
b[9]=0
b[19]=0
b[10]=0
eps=rnorm(n)
y=x%*%b+eps
plot(x)
#b
set.seed(1)
train=sample(seq(1000),100,replace=F)
seq(1000)
y.train=y[train]
y.test=y[-train]
x.train=x[train,]
x.test=x[-train,]
#c
#Perform best subset selection on the training set, and plot the
#training set MSE associated with the best model of each size.

library(leaps)
regfit.full = regsubsets(y ~ ., data = data.frame(x = x.train, y = y.train), nvmax = p)
val.errors = rep(NA, p)
x_cols = colnames(x, do.NULL = FALSE, prefix = "x.")
x_cols
for (i in 1:p) {
coefi = coef(regfit.full, id = i)
pred = as.matrix(x.train[, x_cols %in% names(coefi)]) %*% coefi[names(coefi) %in% x_cols]
val.errors[i] = mean((y.train - pred)^2)
}
plot(val.errors, ylab = "Training MSE", pch = 19, type = "b")
#d
val.errors = rep(NA, p)
for (i in 1:p) {
coefi = coef(regfit.full, id = i)
pred = as.matrix(x.test[, x_cols %in% names(coefi)]) %*% coefi[names(coefi) %in% x_cols]
val.errors[i] = mean((y.test - pred)^2)
}
plot(val.errors, ylab = "Test MSE", pch = 19, type = "b")
#e
which.min(val.errors)
#f
coef(regfit.full,id=16)
#g
##11
#a
#best subset
set.seed(1)
library(MASS)
library(leaps)
library(glmnet)
rm(Boston)
fix(Boston)
predict.regsubsets = function(object, newdata, id, ...) {
form = as.formula(object$call[[2]])
mat = model.matrix(form, newdata)
coefi = coef(object, id = id)
mat[, names(coefi)] %*% coefi
}
k = 10
p = ncol(Boston) - 1
folds = sample(rep(1:k, length = nrow(Boston)))
cv.errors = matrix(NA, k, p)
for (i in 1:k) {
best.fit = regsubsets(crim ~ ., data = Boston[folds != i, ], nvmax = p)
for (j in 1:p) {
pred = predict(best.fit, Boston[folds == i, ], id = j)
cv.errors[i, j] = mean((Boston$crim[folds == i] - pred)^2)
}
}
rmse.cv = sqrt(apply(cv.errors, 2, mean))
plot(rmse.cv, pch = 19, type = "b")
#b
#lasso
attach(Boston)
xmat=model.matrix(crim~.-1,data=Boston)
cv.out=cv.glmnet(xmat,Boston$crim,type.measure = "mse")
plot(cv.out)
coef(cv.out)
sqrt(cv.out$cvm[cv.out$lambda==cv.out$lambda.1se] )
#ridge
cv.ridge=cv.glmnet(xmat,Boston$crim,type.measure = "mse",alpha=0)
plot(cv.ridge)
coef(cv.ridge)
sqrt( cv.ridge$cvm[cv.ridge$lambda==cv.ridge$lambda.1se] )
#pcr
library(pls)
pcr.fit=pcr(crim~.,data=Boston,scale=T,validation="CV")
summary(pcr.fit)
################################################################################
###################################

#POLYNOMIAL REGRESSION: extends the linear model by adding extra predictors, obtained by raising
# each of the original predictors to a power. For example, a cubic regression uses the three
# variables x, x^2, x^3 as predictors.
#STEP FUNCTIONS: cut the range of a variable into K distinct regions in order to produce a
# qualitative variable.
#REGRESSION SPLINES: an extension of polynomial and step functions. They involve dividing the range
# of X into K distinct regions; within each region a polynomial function is fit, but the polynomials
# are constrained so that they join smoothly at the region boundaries (knots). Provided the range is
# divided into enough regions, this can produce an extremely flexible fit.
#SMOOTHING SPLINES: similar to regression splines, but they result from minimizing an RSS criterion
# subject to a smoothness penalty (the sum of squared residuals plus lambda times the integral of
# the squared second derivative of the fitted function).
#LOCAL REGRESSION: similar to splines, but the regions are allowed to overlap.
#GAMs: allow the extension of the above methods to deal with multiple predictors.
################################################################################
###################################
# 7.8 Lab: Non-linear Modeling
library(ISLR)
attach(Wage)
#7.8.1 Polynomial Regression and Step Functions
fit=lm(wage~poly(age,4),data=Wage)
coef(summary(fit))
fit2=lm(wage~cbind(age,age^2,age^3,age^4),data=Wage)
#create a grid of values for age at which we want predictions
agelims=range(age)
agelims
age.grid=seq(from=agelims[1],to=agelims[2])
age.grid
preds=predict(fit,newdata = list(age=age.grid),se=T)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
se.bands
plot(age,wage,col="red")
title("Degree 4 polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,col="blue")
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
fit.1= lm(wage~education +age ,data=Wage)
fit.2= lm(wage~education +poly(age ,2) ,data=Wage)
fit.3= lm(wage~education +poly(age ,3) ,data=Wage)
anova(fit.1, fit.2, fit.3)
#STEP Functions
# Next we consider the task of predicting whether an individual earns more
# than $250,000 per year.
fit=glm(I(wage>250)~poly(age,4),data=Wage,family = binomial)
preds=predict(fit,newdata = list(age=age.grid),se=T)
pfit=exp(preds$fit)/(1+exp(preds$fit))

se.bands.logit=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
se.bands=exp(se.bands.logit)/(1+exp(se.bands.logit))
plot(age,I(wage>250),xlim=agelims,ylim=c(0,.2),type="n")
points(jitter(age),I((wage>250)/5),pch="|",col="darkgrey")
lines(age.grid,pfit,col="blue")
matlines(age.grid,se.bands,col="blue")
# We have drawn the age values corresponding to the observations with wage
# values above 250 as gray marks on the top of the plot, and those with wage
# values below 250 are shown as gray marks on the bottom of the plot.
table(cut(age,4))
fit=lm(wage~cut(age,4),data=Wage)
coef(summary(fit))
#The age<33.5 category is left out, so the intercept coefficient of
#$94,160 can be interpreted as the average salary for those under 33.5 years
#of age, and the other coefficients can be interpreted as the average additional
#salary for those in the other age groups.
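# A quick check (an addition): the intercept should match the average wage in the baseline age bin.
mean(wage[age<=33.5])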

#7.8.2 Splines
#fit wage to age using a regression spline; by default, cubic splines are produced
attach(Wage)
library(splines)
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage)
pred=predict(fit,newdata = list(age=age.grid),se=T)
plot(age,wage,col="red")
lines(age.grid,pred$fit,lwd=4)
lines(age.grid,pred$fit+2*pred$se,lty="dashed")
lines(age.grid,pred$fit-2*pred$se,lty="dashed")
#here we have placed knots at 25, 40 and 60, which produces a spline with six basis functions;
# a cubic spline with three knots has seven degrees of freedom: one for the intercept plus six basis functions
attr(bs(age,df=6),"knots")
#with df=6, R chooses the knots itself (at uniform quantiles of age)
#bs() also has a degree argument, so we can fit splines of any degree rather than the default cubic
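# A quick check (an addition) consistent with the degrees-of-freedom comment above: a cubic spline
# with knots at 25, 40, 60 generates six basis functions (a 3000 x 6 basis matrix), so the fitted
# model has seven degrees of freedom including the intercept.
dim(bs(age,knots=c(25,40,60)))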
#NATURAL SPLINES
#ns with 4 df
fit2=lm(wage~ns(age,df=4),data=Wage)
pred2=predict(fit2,newdata=list(age=age.grid),se=T)
lines(age.grid,pred2$fit,col="blue",lwd=4)
#as with bs(), we could have specified the knots directly using the knots argument
#SMOOTHING SPLINE
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
fit=smooth.spline(age,wage,df=16)
fit1=smooth.spline(age,wage,cv=T)
fit1$df
lines(fit,col="red")

lines(fit1,col="blue")
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex
=.8)
#when we specify df=16, the function determines the value of lambda that yields 16 degrees of freedom
#with cv=T the level of smoothness is instead chosen by cross-validation, which yields about 6.8 degrees of freedom
#LOCAL REGRESSION
plot(age,wage,xlim=agelims,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span=.2,data=Wage)
fit1=loess(wage~age,span=.5,data=Wage)
lines(age.grid,predict(fit,newdata=data.frame(age=age.grid)))
lines(age.grid,predict(fit1,newdata=data.frame(age=age.grid)))
#local regression with spans .2 and .5: each neighbourhood consists of 20% or 50% of the observations;
# the larger the span, the smoother the fit
#GAMS
#we fit a GAM to predict wage using natural spline functions of year and age, treating education as a qualitative predictor
gam1=lm(wage~ns(year,4)+ns(age,5)+education,data=Wage)
#we now fit a model that uses smoothing splines, rather than natural splines .
#we need to use the gam() function
library(gam)
# s() is used for smooothing spline
gam.m3=gam(wage~s(year,4)+s(age,4)+education,data=Wage)
par(mfrow=c(1,3))
plot(gam.m3,se=T)
plot(gam1)
plot.gam(gam1,se=T)
#m1 gam that excludes year
#m2 gam that uses linear funct of year
#m3 gam that uses a spline function
gam.m1=gam(wage~s(age,5)+education,data=Wage)
gam.m2=gam(wage~year+s(age,5)+education,data=Wage)
gam.m3=gam(wage~s(year,4)+s(age,5)+education,data=Wage)
anova(gam.m1,gam.m2,gam.m3,test="F")
summary(gam.m3)
#the p-values of the model reinforce the idea that a linear function is adequate for year and a non-linear one is needed for age
#predictions on the training set
preds=predict(gam.m2,newdata=Wage)
# we can also use local regression as a building block of a GAM, with the lo() function
gam.lo=gam(wage~s(year,df=4)+lo(age,span=0.7)+education,data=Wage)
plot.gam(gam.lo,se=T)
gam.lo.i=gam(wage~lo(year,age,span=0.5),data=Wage)
library(akima)
plot(gam.lo.i)

par(mfrow=c(1,1))
#gams with log reg
gam.lr=gam(I(wage>250)~year+s(age,df=5)+education,family=binomial,data=Wage)
par(mfrow =c(1,3))
plot(gam.lr,se=T)
table(education,I(wage>250))
gam.lr=gam(I(wage>250)~year+s(age,df=5)+education,family=binomial,data=Wage,subset=(education!="1. < HS Grad"))
plot(gam.lr,se=T)
################################################################################
#####################################
#7.9 Exercises
#6
#a
set.seed(1)
library(boot)
all.deltas=rep(NA,10)
for(i in 1:10)
{
glm.fit=glm(wage~poly(age,i),data=Wage)
all.deltas[i]=cv.glm(Wage,glm.fit,K=10)$delta[2]
}
all.deltas
plot(1:10,all.deltas,xlab="degree",ylab="CV error",type="b",ylim=c(1590,1700))
min.point=min(all.deltas)
sd.point=sd(all.deltas)
abline(h=min.point+0.2*sd.point,lty="dashed")
abline(h=min.point-0.2*sd.point,lty="dashed")
legend("topright","0.2 sd line ",lty="dashed")
# a degree-3 polynomial is chosen
agelims=range(age)
agelims
age.grid=seq(from=agelims[1],to=agelims[2])
age.grid
lm.fit=lm(wage~poly(age,3),data=Wage)
preds=predict(lm.fit,data.frame(age=age.grid))
plot(wage~age,data=Wage)
lines(age.grid,preds,col="blue",lwd=3)

fit.1=lm(wage~poly(age,1),data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
fit.6=lm(wage~poly(age,6),data=Wage)
fit.7=lm(wage~poly(age,7),data=Wage)
fit.8=lm(wage~poly(age,8),data=Wage)
fit.9=lm(wage~poly(age,9),data=Wage)
fit.10=lm(wage~poly(age,10),data=Wage)

anova(fit.1,fit.2,fit.3,fit.4,fit.5,fit.6,fit.7,fit.8,fit.9,fit.10)
#b
all.cvs=rep(NA,10)
for(i in 2:10)
{
Wage$age.cut=cut(Wage$age,i)
lm.fit=glm(wage~age.cut,data=Wage)
all.cvs[i]=cv.glm(Wage,lm.fit,K=10)$delta[2]
}
all.cvs
plot(2:10,all.cvs[-1],xlab="no of cuts",ylab="cv err",type="b")
#8 cuts
lm.fit=glm(wage~cut(age,8),data=Wage)
agelims=range(age)
agelims
age.grid=seq(from=agelims[1],to=agelims[2])
age.grid
lm.pred=predict(lm.fit,data.frame(age=age.grid))
plot(wage~age,data=Wage)
lines(age.grid,lm.pred,col="red",lwd=4)
##7
#a
set.seed(1)
summary(Wage$maritl)
plot(Wage$maritl)
summary(Wage$jobclass)
plot(Wage$jobclass)
par(mfrow=c(1,2))
plot(Wage$maritl,Wage$wage)
plot(Wage$jobclass,Wage$wage)
fit=lm(wage~maritl,data=Wage)
deviance(fit)
fit=lm(wage~jobclass,data=Wage)
deviance(fit)
fit=lm(wage~maritl+jobclass,data=Wage)
deviance(fit)
#gam
fit=gam(wage~maritl+jobclass+s(age,4),data=Wage)
deviance(fit)
##8
pairs(Auto)
#mpg is inversely related to cylinders, displacement, horsepower and weight
cv.errs=rep(NA,10)
for (i in 1:10)
{
fit=glm(mpg~poly(displacement,i),data=Auto)

cv.errs[i]=cv.glm(Auto,fit,K=10)$delta[2]
}
cv.errs
which.min(cv.errs)
#a 10th-degree polynomial gives the lowest CV error
attach(Auto)
plot(displacement,mpg)
lm.fit.poly=glm(mpg~poly(displacement,10),data=Auto)
summary(displacement)
disprange=range(displacement)
disprange
disp.grid=seq(from=disprange[1],to=disprange[2])
preds=predict(lm.fit.poly,data.frame(displacement=disp.grid))
lines(disp.grid,preds,col="red",lwd=5)
pol1=lm(mpg~poly(displacement,1),data=Auto)
pol2=lm(mpg~poly(displacement,2),data=Auto)
pol3=lm(mpg~poly(displacement,3),data=Auto)
pol4=lm(mpg~poly(displacement,4),data=Auto)
pol5=lm(mpg~poly(displacement,5),data=Auto)
pol6=lm(mpg~poly(displacement,6),data=Auto)
pol7=lm(mpg~poly(displacement,7),data=Auto)
pol8=lm(mpg~poly(displacement,8),data=Auto)
pol9=lm(mpg~poly(displacement,9),data=Auto)
pol10=lm(mpg~poly(displacement,10),data=Auto)
anova(pol1,pol2,pol3,pol4,pol5,pol6,pol7,pol8,pol9,pol10)
#step f
all.cvs=rep(NA,10)
for(i in 2:10)
{
Auto$dis.cut=cut(Auto$displacement,i)
lm.fit=glm(mpg~dis.cut,data=Auto)
all.cvs[i]=cv.glm(Auto,lm.fit,K=10)$delta[2]
}
all.cvs
plot(1:10,all.cvs,xlab="degree",ylab="CV error",type="b")
which.min(all.cvs)
#9 cuts
disprange=range(displacement)
disprange
disp.grid=seq(from=disprange[1],to=disprange[2])
preds=predict(lm.fit,data.frame(displacement=disp.grid))
plot(displacement,mpg)
lines(disp.grid,preds,col="red",lwd=5)
#splines
library(splines)
cv.errs=rep(NA,10)
for(df in 3:10)
{
fit=glm(mpg~ns(displacement,df=df),data=Auto)
cv.errs[df]=cv.glm(Auto,fit,K=10)$delta[2]
}
cv.errs
which.min(cv.errs)

plot(displacement,mpg)
fit2=lm(mpg~ns(displacement,df=10),data=Auto)
pred2=predict(fit2,newdata=list(displacement=disp.grid),se=T)
lines(disp.grid,pred2$fit,col="blue",lwd=4)
#gams
fit = gam(mpg ~ s(displacement, 4) + s(horsepower, 4), data = Auto)
summary(fit)
##9
#a
poly.fit=lm(nox~poly(dis,3),data=Boston)
attach(Boston)
dis.range=range(dis)
dis.range
dis.grid=seq(from=dis.range[1],to=dis.range[2])
preds=predict(poly.fit,data.frame(dis=dis.grid))
plot(dis,nox)
lines(dis.grid,preds,col="red",lwd=5)
title("Pen-Pineapple-Apple-Pen")
#b
all.rss=rep(NA,10)
for (i in 1:10)
{
poly.fit=lm(nox~poly(dis,i),data=Boston)
all.rss[i]=sum(poly.fit$residuals^2)
}
all.rss
plot(1:10,all.rss,type="b")
#c
library(boot)
all.rss=rep(NA,10)
for (i in 1:10)
{
poly.fit=glm(nox~poly(dis,i),data=Boston)
all.rss[i]=cv.glm(Boston,poly.fit,K=10)$delta[2]
}
all.rss
plot(1:10,all.rss,xlab="no of deg",ylab="cv err",type="b")
which.min(all.rss)
#a degree-4 polynomial is chosen
#d
library(splines)
sp.fit=lm(nox~bs(dis,df=4,knots=c(4,7,11)),data=Boston)
summary(sp.fit)
sp.pred=predict(sp.fit,list(dis=dis.grid))
plot(nox~dis,data=Boston)
plot(dis,nox)
lines(dis.grid,sp.pred,col="blue",lwd=3)

#e
all.cvs=rep(NA,16)
for(i in 3:16)
{
lm.fit=lm(nox~bs(dis,df=i),data=Boston)
all.cvs[i]=sum(lm.fit$residuals^2)
}
all.cvs
which.min(all.cvs)
#f
all.cv = rep(NA, 16)
for (i in 3:16) {
lm.fit = glm(nox ~ bs(dis, df = i), data = Boston)
all.cv[i] = cv.glm(Boston, lm.fit, K = 10)$delta[2]
}
all.cv
plot(3:16,all.cv[-c(1,2)],xlab="no of var",ylab="cv err",type="b")
##10
#a
set.seed(1)
library(ISLR)
attach(College)
fix(College)
train=sample(length(Outstate),length(Outstate)/2)
test=-train
college.train=College[train,]
college.test=College[test,]
#fwd stepwise selection on training set
dim(College)
library(leaps)
regfit.fwd=regsubsets(Outstate~.,data=college.train,nvmax=17,method="forward")
reg.summary=summary(regfit.fwd)
plot(reg.summary$adjr2,xlab="no of var",ylab="adj r sq",type="l",ylim=c(0.4,0.84))
which.max(reg.summary$adjr2)
points(13,reg.summary$adjr2[13],cex=2.5,pch=15)
max.adjr2=max(reg.summary$adjr2)
sd.adjr2=sd(reg.summary$adjr2)
abline(h=max.adjr2+0.2*sd.adjr2,lty="dashed")
abline(h=max.adjr2-0.2*sd.adjr2,lty="dashed")
max.adjr2-0.2*sd.adjr2
max.adjr2+0.2*sd.adjr2
plot(reg.summary$bic,xlab="no of var",ylab="bic",type="l")
which.min(reg.summary$bic)
points(6,reg.summary$bic[6],cex=2.5,pch=15)
sd.bic=sd(reg.summary$bic)
max.bic=max(reg.summary$bic)
abline(h=max.bic+0.2*sd.bic,lty="dashed")
abline(h=max.bic-0.2*sd.bic,lty="dashed")
max.bic+0.2*sd.bic
max.bic-0.2*sd.bic

plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")


which.min(reg.summary$cp)
min.cp = min(reg.summary$cp)
std.cp = sd(reg.summary$cp)
abline(h = min.cp + 0.2 * std.cp, col = "red", lty = 2)
abline(h = min.cp - 0.2 * std.cp, col = "red", lty = 2)
#6 vars so id = 6
reg.fit=regsubsets(Outstate~.,data=college.train,method="forward")
coefi=coef(reg.fit,id=6)
names(coefi)
#b
library(gam)
gam.fit=gam(Outstate~Private+s(Room.Board,df=2)+s(PhD,df=2)+s(perc.alumni,df=2)+s(Expend,df=2)+s(Grad.Rate,df=2),data=college.train)
par(mfrow = c(2, 3))
plot(gam.fit, se = T, col = "blue")
#c
gam.pred=predict(gam.fit,college.test)
gam.err=mean( (college.test$Outstate-gam.pred)^2 )
gam.err
gam.tss=mean( (college.test$Outstate-mean(college.test$Outstate))^2 )
test.rss=1-gam.err/gam.tss
test.rss
#d
summary(gam.fit)
################################################################################
########################################
