# A028 GLM-SC3
# R session transcript (user: hp), 2024-04-15
# SVKM'S NARSEE MONJEE INSTITUTE OF MANAGEMENT STUDIES (NMIMS)
# Practical 1
# Q1
library(readxl)
q1 <- read_excel("C:/Users/hp/Desktop/GLM.P1Q1.xlsx")
View(q1)
attach(q1)
cor(X1, Y1)
## [1] 0.8164205
cor(X2, Y2)
## [1] 0.8162365
cor(X3, Y3)
## [1] 0.8162867
cor(X4, Y4)
## [1] 0.8165214
##
## Call:
## lm(formula = Y1 ~ X1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.92127 -0.45577 -0.04136 0.70941 1.83882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0001 1.1247 2.667 0.02573 *
## X1 0.5001 0.1179 4.241 0.00217 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6665, Adjusted R-squared: 0.6295
## F-statistic: 17.99 on 1 and 9 DF, p-value: 0.00217
# NOTE(review): model1 is never defined in this transcript -- presumably
# model1 <- lm(Y1 ~ X1) was lost in the export (its summary() output appears
# above). Restore the definition before running this line.
plot(model1)
# Second Anscombe pair; relies on attach(q1) above for X2/Y2.
model2 <- lm(Y2 ~ X2)
summary(model2)
##
## Call:
## lm(formula = Y2 ~ X2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9009 -0.7609 0.1291 0.9491 1.2691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.001 1.125 2.667 0.02576 *
## X2 0.500 0.118 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6662, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002179
plot(model2)
model3 <- lm(Y3 ~ X3)
summary(model3)
##
## Call:
## lm(formula = Y3 ~ X3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1586 -0.6146 -0.2303 0.1540 3.2411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0025 1.1245 2.670 0.02562 *
## X3 0.4997 0.1179 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared: 0.6663, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002176
plot(model3)
model4 <- lm(Y4 ~ X4)
summary(model4)
##
## Call:
## lm(formula = Y4 ~ X4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.751 -0.831 0.000 0.809 1.839
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0017 1.1239 2.671 0.02559 *
## X4 0.4999 0.1178 4.243 0.00216 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared: 0.6667, Adjusted R-squared: 0.6297
## F-statistic: 18 on 1 and 9 DF, p-value: 0.002165
plot(model4)
# ii)
# NOTE(review): data2 is never created in this transcript -- presumably the
# newspaper circulation data (Daily/Sunday columns) was read in earlier and the
# call was lost in export; confirm the data load before running.
model2 <- lm(data2$Sunday ~ data2$Daily)
# iii) 95% confidence intervals for the intercept and slope
confint(model2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -59.094743 86.766003
## data2$Daily 1.195594 1.483836
# iv) ANOVA table for the regression
anova(model2)
# v) full model summary (coefficients, R^2, overall F-test)
summary(model2)
##
## Call:
## lm(formula = data2$Sunday ~ data2$Daily)
##
## Residuals:
## Min 1Q Median 3Q Max
## -255.19 -55.57 -20.89 62.73 278.17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.83563 35.80401 0.386 0.702
## data2$Daily 1.33971 0.07075 18.935 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 109.4 on 32 degrees of freedom
## Multiple R-squared: 0.9181, Adjusted R-squared: 0.9155
## F-statistic: 358.5 on 1 and 32 DF, p-value: < 2.2e-16
# Q3
# Simulation study: generate y = 50 + 10*x + e, e ~ N(0, sd = 4), on a fixed
# grid of x values, then repeat 500 times and build a 95% CI for the slope in
# each replication.
x <- seq(1, 10, 0.5); x
## [1] 1.0 1.5 2.0 2.5 ... 9.5 10.0  (19 design points)
n_obs <- length(x)  # 19; avoids hard-coding the sample size below
y <- rnorm(n_obs, 50 + 10 * x, sd = 4); y
# Corrected sum of squares of x: Sxx = sum(x^2) - (sum x)^2 / n
sxx <- sum(x^2) - (sum(x)^2 / n_obs); sxx
## [1] 142.5
# t critical value for a 95% CI with n - 2 = 17 residual df
qt(0.975, n_obs - 2)
## [1] 2.109816
sam <- 500                             # number of simulated samples
coef <- matrix(nrow = sam, ncol = 2)   # one (b0, b1) row per sample
# Preallocate -- these objects were used in the loop but never created in the
# original transcript, which made it fail with "object not found".
y_data <- matrix(nrow = sam, ncol = n_obs)
ci_b1_lower <- numeric(sam)
ci_b1_upper <- numeric(sam)
dim(y_data)
## [1] 500 19
for (i in seq_len(sam)) {
  y_data[i, ] <- rnorm(n_obs, 50 + 10 * x, sd = 4)
  fit <- lm(y_data[i, ] ~ x)
  coef[i, ] <- fit$coefficients
  # 95% CI for the slope: b1 +/- t_{0.975, n-2} * sqrt(MSE / Sxx),
  # where MSE = deviance(fit) / (n - 2) (residual SS over its df).
  se_b1 <- sqrt(deviance(fit) / ((n_obs - 2) * sxx))
  ci_b1_lower[i] <- coef[i, 2] - qt(0.975, n_obs - 2) * se_b1
  ci_b1_upper[i] <- coef[i, 2] + qt(0.975, n_obs - 2) * se_b1
}
head(coef)
## [,1] [,2]
## [1,] 51.78338 9.695196
## [2,] 47.46923 10.370200
## [3,] 50.73365 9.991049
## [4,] 51.19920 9.816722
## [5,] 54.21473 9.704519
## [6,] 50.51609 9.975716
dim(coef)
## [1] 500 2
ci = data.frame(ci_b1_lower,ci_b1_upper)
head(ci)
## ci_b1_lower ci_b1_upper
## 1 8.903227 10.48716
## 2 9.654436 11.08596
## 3 9.300826 10.68127
## 4 9.119546 10.51390
## 5 8.776359 10.63268
## 6 9.256526 10.69491
dim(ci)
## [1] 500 2
# Count the 95% CIs that FAIL to cover the true slope b1 = 10, i.e. intervals
# lying entirely above or entirely below 10. Vectorized replacement for the
# original element-wise loop (same result).
count <- sum(ci[, 1] > 10 | ci[, 2] < 10)
print(count)
## [1] 30
View(y_data)
# Practical 2
# Q1
# install.packages("MPV")
library(MPV)
## randomForest 4.7-1.1
data("table.b5")
data <- table.b5
model <- lm(y ~ x6 + x7, data = data)
summary(model)
##
## Call:
## lm(formula = y ~ x6 + x7, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.2035 -4.3713 0.2513 4.9339 21.9682
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.526460 3.610055 0.700 0.4908
## x6 0.018522 0.002747 6.742 5.66e-07 ***
## x7 2.185753 0.972696 2.247 0.0341 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.924 on 24 degrees of freedom
## Multiple R-squared: 0.6996, Adjusted R-squared: 0.6746
## F-statistic: 27.95 on 2 and 24 DF, p-value: 5.391e-07
confint(model)
## 2.5 % 97.5 %
## (Intercept) -4.92432697 9.97724714
## x6 0.01285196 0.02419204
## x7 0.17820756 4.19329833
t.test(data$x6, data$x7)
##
## Welch Two Sample t-test
##
## data: data$x6 and data$x7
## t = 6.9432, df = 26, p-value = 2.263e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 673.1694 1239.3765
## sample estimates:
## mean of x mean of y
## 958.310741 2.037778
##
## Call:
## lm(formula = y ~ x6, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.081 -5.829 -0.839 5.522 26.882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.144181 3.483064 1.764 0.0899 .
## x6 0.019395 0.002932 6.616 6.24e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.7 on 25 degrees of freedom
## Multiple R-squared: 0.6365, Adjusted R-squared: 0.6219
## F-statistic: 43.77 on 1 and 25 DF, p-value: 6.238e-07
confint(model2)
## 2.5 % 97.5 %
## (Intercept) -1.02932458 13.31768586
## x6 0.01335688 0.02543261
anova(model)
anova(model2)
# Q2: Anscombe's quartet -- four x/y pairs with nearly identical summary
# statistics but very different scatterplots.
y1 <- c(8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68)
x1 <- c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5)
y2 <- c(9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74)
x2 <- x1  # sets 1-3 share the same predictor values
y3 <- c(7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73)
x3 <- x1
y4 <- c(6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89)
x4 <- c(8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8)  # constant except one point
#i)
cor(x1,y1)
## [1] 0.8164205
cor(x2,y2)
## [1] 0.8162365
cor(x3,y3)
## [1] 0.8162867
cor(x4,y4)
## [1] 0.8165214
##
## Call:
## lm(formula = y1 ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.92127 -0.45577 -0.04136 0.70941 1.83882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0001 1.1247 2.667 0.02573 *
## x1 0.5001 0.1179 4.241 0.00217 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6665, Adjusted R-squared: 0.6295
## F-statistic: 17.99 on 1 and 9 DF, p-value: 0.00217
summary(model2)
##
## Call:
## lm(formula = y2 ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9009 -0.7609 0.1291 0.9491 1.2691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.001 1.125 2.667 0.02576 *
## x2 0.500 0.118 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6662, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002179
summary(model3)
##
## Call:
## lm(formula = y3 ~ x3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1586 -0.6146 -0.2303 0.1540 3.2411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0025 1.1245 2.670 0.02562 *
## x3 0.4997 0.1179 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared: 0.6663, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002176
summary(model4)
##
## Call:
## lm(formula = y4 ~ x4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.751 -0.831 0.000 0.809 1.839
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0017 1.1239 2.671 0.02559 *
## x4 0.4999 0.1178 4.243 0.00216 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared: 0.6667, Adjusted R-squared: 0.6297
## F-statistic: 18 on 1 and 9 DF, p-value: 0.002165
plot(x1,y1)
plot(x2,y2)
plot(x3,y3)
plot(x4,y4)
# b0 = 3.0001, b1 = 0.5001, R^2 = 0.6665
# Although the summary statistics are the same for all four data sets --
# implying identical slope and intercept -- the underlying data are very
# different (see the scatterplots), so a single linear fit cannot be
# appropriate for all of them; these are not good fits (Anscombe's quartet).
# Practical 3
library(MPV)
library(car)
library(MASS)
##
## Attaching package: 'MASS'
library(lmtest)
##
## Attaching package: 'zoo'
library(corrplot)
#fit model
#residual analysis - normality (graph or shapiro), homoscedasticity
#transformation if required (boxcox)
# Q1
par(mfrow = c(1,1))
plot(model1, 2) #Q-Q plot
##
## Shapiro-Wilk normality test
##
## data: model1$residuals
## W = 0.97358, p-value = 0.6981
#Homoscedasticity
bptest(model1)
##
## studentized Breusch-Pagan test
##
## data: model1
## BP = 6.9087, df = 2, p-value = 0.03161
#Reject H0.
#Variance not constant
#PRESS
PRESS(model1)
## [1] 3388.604
boxcox(model1)
#Log-tansformations
logmodel1 <- lm(log(y)~ x6 + x7, data = data)
par(mfrow=c(1,1))
plot(logmodel1,2) #Q-Q plot
plot(logmodel1$fitted.values, logmodel1$residuals) #Residuals vs fitted
#Test for normality
shapiro.test(logmodel1$residuals)
##
## Shapiro-Wilk normality test
##
## data: logmodel1$residuals
## W = 0.9715, p-value = 0.6418
#Homoscedasticity
bptest(logmodel1)
##
## studentized Breusch-Pagan test
##
## data: logmodel1
## BP = 1.1287, df = 2, p-value = 0.5687
PRESS(logmodel1)
## [1] 6.247452
##
## Shapiro-Wilk normality test
##
## data: sqrtmodel1$residuals
## W = 0.98599, p-value = 0.9658
#Homoscedasticity
bptest(sqrtmodel1)
##
## studentized Breusch-Pagan test
##
## data: sqrtmodel1
## BP = 2.8106, df = 2, p-value = 0.2453
PRESS(sqrtmodel1)
## [1] 29.0768
##
## Shapiro-Wilk normality test
##
## data: model2$residuals
## W = 0.96694, p-value = 0.5232
#Homocedasticity
bptest(model2)
##
## studentized Breusch-Pagan test
##
## data: model2
## BP = 9.2433, df = 1, p-value = 0.002364
PRESS(model2)
## [1] 3692.881
boxcox(model2)
#Log-transformations
logmodel2 <- lm(log(y) ~ x6 + x7, data = data)
par(mfrow=c(1,1))
plot(logmodel2, 2) #Q-Q plot
##
## Shapiro-Wilk normality test
##
## data: logmodel2$residuals
## W = 0.9715, p-value = 0.6418
#Homoscedasticity
bptest(logmodel2)
##
## studentized Breusch-Pagan test
##
## data: logmodel2
## BP = 1.1287, df = 2, p-value = 0.5687
PRESS(logmodel2)
## [1] 6.247452
#Square root transformation
sqrtmodel2 <- lm(sqrt(y) ~ x6 + x7, data = data)
##
## Shapiro-Wilk normality test
##
## data: sqrtmodel2$residuals
## W = 0.98599, p-value = 0.9658
#Homoscedasticity
bptest(sqrtmodel2)
##
## studentized Breusch-Pagan test
##
## data: sqrtmodel2
## BP = 2.8106, df = 2, p-value = 0.2453
PRESS(sqrtmodel2)
## [1] 29.0768
#Since, PRESS is least for log transformation, it is better.
# Q2
d2 <- table.b20
View(table.b20)
plot(model, 2)
plot(model$fitted.values, model$residuals)
#Test for normality
shapiro.test(model$residuals)
##
## Shapiro-Wilk normality test
##
## data: model$residuals
## W = 0.97074, p-value = 0.8114
#Homoscedasticity
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 0.72832, df = 5, p-value = 0.9814
#Variance is constant
## [1] 3191.099
boxcox(model)
#Log-transformation
logmodel <- lm(log(y) ~ ., data = d2)
plot(logmodel,2)
plot(logmodel$fitted.values, logmodel$residuals)
#Test for normality
shapiro.test(logmodel$residuals)
##
## Shapiro-Wilk normality test
##
## data: logmodel$residuals
## W = 0.81744, p-value = 0.002707
plot(sqrtmodel, 2)
plot(sqrtmodel$fitted.values, sqrtmodel$residuals)
#Test for normality
shapiro.test(sqrtmodel$residuals)
##
## Shapiro-Wilk normality test
##
## data: sqrtmodel$residuals
## W = 0.93032, p-value = 0.1967
#Homoscedasticity
bptest(sqrtmodel)
##
## studentized Breusch-Pagan test
##
## data: sqrtmodel
## BP = 2.7273, df = 5, p-value = 0.7419
PRESS(sqrtmodel)
## [1] 20.03189
# Q3
d3 <- table.b2
View(d3)
##
## Call:
## lm(formula = y ~ ., data = d3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6848 -2.7688 0.6273 3.9166 17.3962
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 325.43612 96.12721 3.385 0.00255 **
## x1 0.06753 0.02899 2.329 0.02900 *
## x2 2.55198 1.24824 2.044 0.05252 .
## x3 3.80019 1.46114 2.601 0.01598 *
## x4 -22.94947 2.70360 -8.488 1.53e-08 ***
## x5 2.41748 1.80829 1.337 0.19433
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.039 on 23 degrees of freedom
## Multiple R-squared: 0.8988, Adjusted R-squared: 0.8768
## F-statistic: 40.84 on 5 and 23 DF, p-value: 1.077e-10
## 4 18 22
## 17.39622 -13.68478 14.35625
X <- as.matrix(X)
dim(X)
## [1] 29 6
p <- ncol(X)
n <- nrow(X)
Threshold <- 2*p/n
Threshold
## [1] 0.4137931
#Cooks Distance
cd <- cooks.distance(model)
## 4
## 1.101218
#DFFITS
## 1 4 8 18 22 24
## 1.4981245 3.1528111 -0.9159602 -1.0225896 1.3846782 -1.2808509
#DFBETAS
# Q4
library(readxl)
data <- read_excel("C:/Users/hp/Desktop/Rabe5.xlsx")
head(data)
## # A tibble: 6 × 7
## Y X1 X2 X3 X4 X5 X6
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 443 49 79 76 8 15 205
## 2 290 27 70 31 6 6 129
## 3 676 115 92 130 0 9 339
## 4 536 92 62 92 5 8 247
## 5 481 67 42 94 16 3 202
## 6 296 31 54 34 14 11 119
##
## Call:
## lm(formula = Y ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.267 -15.427 2.524 13.633 71.438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.1772 22.0755 1.593 0.12058
## X1 2.8547 5.3372 0.535 0.59632
## X2 3.2753 5.3330 0.614 0.54332
## X3 3.1863 5.2887 0.602 0.55098
## X4 3.1878 0.9918 3.214 0.00292 **
## X5 -0.6677 0.8934 -0.747 0.46014
## X6 -1.1658 5.3217 -0.219 0.82794
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.5 on 33 degrees of freedom
## Multiple R-squared: 0.9559, Adjusted R-squared: 0.9479
## F-statistic: 119.3 on 6 and 33 DF, p-value: < 2.2e-16
# Adjusted R2 = 0.9479
## 38
## 71.43799
X <- as.matrix(data)
dim(X)
## [1] 40 7
p <- ncol(X)
n <- nrow(X)
Threshold <- 2*p/n
Threshold
## [1] 0.35
## numeric(0)
#1 & 4 are leverage points since hii > 2p/n
#Cooks Distance
cd <- cooks.distance(model)
## named numeric(0)
#DFFITS
## 34 38
## -1.281473 1.480310
# Practical 4
# Q1
library(readxl)
data <- read_excel("C:/Users/hp/Desktop/gasoline.xlsx")
str(data)
attach(data)
##
## Call:
## lm(formula = Y ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3498 -1.6236 -0.6002 1.5155 5.2815
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.773204 30.508775 0.583 0.5674
## X1 -0.077946 0.058607 -1.330 0.2001
## X2 -0.073399 0.088924 -0.825 0.4199
## X3 0.121115 0.091353 1.326 0.2015
## X4 1.329034 3.099535 0.429 0.6732
## X5 5.975989 3.158647 1.892 0.0747 .
## X6 0.304178 1.289094 0.236 0.8161
## X7 -3.198576 3.105435 -1.030 0.3167
## X8 0.185362 0.129252 1.434 0.1687
## X9 -0.399146 0.323812 -1.233 0.2336
## X10 -0.005193 0.005893 -0.881 0.3898
## X11 0.598655 3.020681 0.198 0.8451
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.226 on 18 degrees of freedom
## Multiple R-squared: 0.8353, Adjusted R-squared: 0.7346
## F-statistic: 8.297 on 11 and 18 DF, p-value: 5.287e-05
# a)
corrplot(cor(data))
pairs(data)
# b)
a <- eigen(cor(data))
cov(data)
## Y X1 X2 X3 X4
## Y 39.2218506 -633.53025 -222.746782 -441.864483 0.73354023
## X1 -633.5302529 13463.38257 4873.397011 9538.342759 -11.24742529
## X2 -222.7467816 4873.39701 1993.688506 3576.927586 -3.58919540
## X3 -441.8644828 9538.34276 3576.927586 6900.575862 -7.50896552
## X4 0.7335402 -11.24743 -3.589195 -7.508966 0.07688506
## X5 2.1048379 -41.21930 -13.015897 -29.572862 0.06070345
## X6 -3.1702299 79.65080 36.467816 58.196552 0.01114943
## X7 2.9298851 -59.18391 -18.471264 -40.965517 0.10229885
## X8 -99.0993793 2110.59152 753.811724 1509.655862 -1.77365517
## X9 -27.1150345 527.21145 180.148966 371.767586 -0.59544828
## X10 -5024.2255172 104064.50276 37304.234483 73748.531034 -93.52655172
## X11 -2.0328736 43.01057 14.232184 29.937931 -0.05494253
## X5 X6 X7 X8 X9
## Y 2.10483793 -3.17022989 2.9298851 -99.099379 -27.1150345
## X1 -41.21929655 79.65080460 -59.1839080 2110.591517 527.2114483
## X2 -13.01589655 36.46781609 -18.4712644 753.811724 180.1489655
## X3 -29.57286207 58.19655172 -40.9655172 1509.655862 371.7675862
## X4 0.06070345 0.01114943 0.1022989 -1.773655 -0.5954483
## X5 0.27992655 -0.12458621 0.3048276 -6.245993 -1.3623241
## X6 -0.12458621 1.15057471 -0.1954023 9.521379 1.8296552
## X7 0.30482759 -0.19540230 0.4367816 -9.106897 -2.4586207
## X8 -6.24599310 9.52137931 -9.1068966 442.303172 105.4695172
## X9 -1.36232414 1.82965517 -2.4586207 105.469517 32.2451034
## X10 -288.68296552 525.22068966 -438.9310345 18907.956552 4806.1110345
## X11 -0.17958621 0.19080460 -0.2528736 6.455862 1.6158621
## X10 X11
## Y -5024.22552 -2.03287356
## X1 104064.50276 43.01057471
## X2 37304.23448 14.23218391
## X3 73748.53103 29.93793103
## X4 -93.52655 -0.05494253
## X5 -288.68297 -0.17958621
## X6 525.22069 0.19080460
## X7 -438.93103 -0.25287356
## X8 18907.95655 6.45586207
## X9 4806.11103 1.61586207
## X10 885420.23448 318.70344828
## X11 318.70345 0.20229885
eigen(cov(data))
## eigen() decomposition
## $values
## [1] 9.058710e+05 2.124308e+03 2.076657e+02 5.367467e+01 2.382791e+01
## [6] 8.368886e+00 4.433384e+00 3.258455e-01 1.439259e-01 3.822721e-02
## [11] 3.650523e-02 1.800703e-02
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.0056178830 2.229226e-02 -0.054817498 -0.098670478 -0.164314757
## [2,] -0.1164517587 -7.217035e-01 0.566609525 0.280705430 -0.249377051
## [3,] -0.0417763888 -3.587836e-01 -0.780424041 0.473584282 -0.172423759
## [4,] -0.0825455661 -5.685213e-01 -0.241359782 -0.708099716 0.326999367
## [5,] 0.0001044153 -5.536026e-05 -0.002507735 -0.003926193 -0.006012880
## [6,] 0.0003238150 4.057741e-03 -0.010361544 0.027062628 0.007096465
## [7,] -0.0005906769 -1.236297e-02 -0.021201174 0.045000676 -0.034159546
## [8,] 0.0004914657 3.696165e-03 -0.015946721 0.015942301 0.005798218
## [9,] -0.0210904004 6.709978e-02 -0.086090814 -0.424702716 -0.874918456
## [10,] -0.0053579263 2.516968e-02 0.013604142 -0.046195979 0.081613473
## [11,] -0.9886222960 1.462115e-01 -0.012161654 0.014787337 0.026672683
## [12,] -0.0003569034 -2.892930e-03 0.008391378 -0.005400884 -0.003421294
## [,6] [,7] [,8] [,9] [,10]
## [1,] 0.949192121 -0.2352825365 0.0517892126 -0.012625269 -0.0151865323
## [2,] 0.023435324 -0.0503436753 0.0100333989 0.018160245 -0.0048406047
## [3,] -0.032537667 -0.0512527640 0.0502881874 0.005638560 -0.0035029005
## [4,] -0.003854153 0.0453883804 -0.0324257236 -0.004335881 0.0066138849
## [5,] 0.018586323 0.0104623094 -0.2688862921 0.210299806 -0.0308143390
## [6,] 0.036420280 -0.0331147768 -0.2742914857 0.438357607 0.3975311628
## [7,] 0.039661341 0.0169109784 -0.8256774410 -0.546317756 0.0230881273
## [8,] 0.028042523 0.0347722963 -0.3634378643 0.635675289 0.1063499738
## [9,] -0.202200241 -0.0050934648 -0.0094255182 0.023229649 0.0065131999
## [10,] -0.228874462 -0.9667595438 -0.0384148358 0.002225951 -0.0059461676
## [11,] 0.009889536 0.0083149299 0.0002352226 -0.001698114 -0.0001896707
## [12,] -0.003740957 -0.0006725115 0.1752321565 -0.244914673 0.9103780434
## [,11] [,12]
## [1,] 0.0050136088 -1.762077e-02
## [2,] 0.0009860515 -3.707277e-03
## [3,] 0.0139938640 -1.020335e-03
## [4,] -0.0125004029 6.015614e-03
## [5,] 0.8941235825 2.873236e-01
## [6,] -0.3817413725 6.523186e-01
## [7,] -0.1018578563 -5.685040e-02
## [8,] -0.0409106819 -6.695479e-01
## [9,] -0.0134034805 5.007287e-03
## [10,] 0.0168311107 -3.938333e-02
## [11,] 0.0004954463 -5.575157e-08
## [12,] 0.2047595054 -1.961164e-01
## [1] 2634.889
#condition_number > 1000, thus, severe multicollinearity is present
# c)
library(car)
vif(model)
## X1 X2 X3 X4 X5 X6 X7
## 128.834832 43.921063 160.436093 2.057834 7.780750 5.326714 11.735038
## X8 X9 X10 X11
## 20.585810 9.419449 85.675755 5.142547
# Q2
library(glmnet)
plot(b)
##
## Call: cv.glmnet(x = x, y = y)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.1885 37 13.44 2.690 5
## 1se 1.9292 12 15.97 4.654 2
summary(model2)
## Length Class Mode
## a0 100 -none- numeric
## beta 1100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 4 -none- call
## nobs 1 -none- numeric
plot(model2$dev.ratio)
#cross validation
model3 <- cv.glmnet(x,y,alpha = 0) #6.03 is the minimum lambda
L <- model3$lambda.min
model4 <- glmnet(x,y,alpha = 0, lambda = L)
model4 #r2 = 75.32
##
## Call: glmnet(x = x, y = y, alpha = 0, lambda = L)
##
## Df %Dev Lambda
## 1 11 75.09 6.618
YPred <- predict(model4,s = L, newx = x)
SST <- sum((data$Y- mean(data$Y))^2)
SSE <- sum((YPred - data$Y)^2); SSE
## [1] 283.354
## [1] 0.750883
# Q3
# install.packages("Metrics")
library(Metrics)
# install.packages("pls")
library(pls)
##
## Attaching package: 'pls'
## Data: X dimension: 30 11
## Y dimension: 30 1
## Fit method: svdpc
## Number of components considered: 11
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 6.37 3.487 3.510 3.531 3.690 3.818 4.006
## adjCV 6.37 3.465 3.486 3.505 3.656 3.776 3.957
## 7 comps 8 comps 9 comps 10 comps 11 comps
## CV 4.079 4.224 4.527 4.822 4.699
## adjCV 4.021 4.175 4.427 4.705 4.580
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 70.02 82.78 89.81 95.06 96.98 98.27 99.13 99.59
## Y 75.44 75.45 75.68 75.82 75.95 75.95 77.65 78.06
## 9 comps 10 comps 11 comps
## X 99.89 99.97 100.00
## Y 80.97 82.23 83.53
## [1] 3.036787
library(psych)
##
## Attaching package: 'psych'
summary(pc_fit)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.7754 1.1845 0.87945 0.75964 0.45989 0.3768 0.30845
## Proportion of Variance 0.7002 0.1275 0.07031 0.05246 0.01923 0.0129 0.00865
## Cumulative Proportion 0.7002 0.8278 0.89810 0.95056 0.96979 0.9827 0.99134
## PC8 PC9 PC10 PC11
## Standard deviation 0.22381 0.18239 0.09175 0.05914
## Proportion of Variance 0.00455 0.00302 0.00077 0.00032
## Cumulative Proportion 0.99589 0.99892 0.99968 1.00000
transformed
##
## Call:
## lm(formula = Y ~ ., data = new_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.1217 -1.9285 -0.0179 1.7704 7.3854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.04333 0.58714 34.137 < 2e-16 ***
## PC1 1.95997 0.21517 9.109 1.01e-09 ***
## PC2 0.05003 0.50416 0.099 0.922
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.216 on 27 degrees of freedom
## Multiple R-squared: 0.7545, Adjusted R-squared: 0.7363
## F-statistic: 41.49 on 2 and 27 DF, p-value: 5.83e-09
vif(pcr_lm)
## PC1 PC2
## 1 1
anova(pcr_lm)
# Practical 5
# Q1
library(MPV)
data <- table.b2
data
## y x1 x2 x3 x4 x5
## 1 271.8 783.35 33.53 40.55 16.66 13.20
## 2 264.0 748.45 36.50 36.19 16.46 14.11
## 3 238.8 684.45 34.66 37.31 17.66 15.68
## 4 230.7 827.80 33.13 32.52 17.50 10.53
## 5 251.6 860.45 35.75 33.71 16.40 11.00
## 6 257.9 875.15 34.46 34.14 16.28 11.31
## 7 263.9 909.45 34.60 34.85 16.06 11.96
## 8 266.5 905.55 35.38 35.89 15.93 12.58
## 9 229.1 756.00 35.85 33.53 16.60 10.66
## 10 239.3 769.35 35.68 33.79 16.41 10.85
## 11 258.0 793.50 35.35 34.72 16.17 11.41
## 12 257.6 801.65 35.04 35.22 15.92 11.91
## 13 267.3 819.65 34.07 36.50 16.04 12.85
## 14 267.0 808.55 32.20 37.60 16.19 13.58
## 15 259.6 774.95 34.32 37.89 16.62 14.21
## 16 240.4 711.85 31.08 37.71 17.37 15.56
## 17 227.2 694.85 35.73 37.00 18.12 15.83
## 18 196.0 638.10 34.11 36.76 18.53 16.41
## 19 278.7 774.55 34.79 34.62 15.54 13.10
## 20 272.3 757.90 35.77 35.40 15.70 13.63
## 21 267.4 753.35 36.44 35.96 16.45 14.51
## 22 254.5 704.70 37.82 36.26 17.62 15.38
## 23 224.7 666.80 35.07 36.34 18.12 16.10
## 24 181.5 568.55 35.26 35.90 19.05 16.73
## 25 227.5 653.10 35.56 31.84 16.51 10.58
## 26 253.6 704.05 35.73 33.16 16.02 11.28
## 27 263.0 709.60 36.46 33.83 15.89 11.91
## 28 265.8 726.90 36.26 34.89 15.83 12.65
## 29 263.8 697.15 37.20 36.27 16.71 14.06
##
## Call:
## lm(formula = y ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6848 -2.7688 0.6273 3.9166 17.3962
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 325.43612 96.12721 3.385 0.00255 **
## x1 0.06753 0.02899 2.329 0.02900 *
## x2 2.55198 1.24824 2.044 0.05252 .
## x3 3.80019 1.46114 2.601 0.01598 *
## x4 -22.94947 2.70360 -8.488 1.53e-08 ***
## x5 2.41748 1.80829 1.337 0.19433
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.039 on 23 degrees of freedom
## Multiple R-squared: 0.8988, Adjusted R-squared: 0.8768
## F-statistic: 40.84 on 5 and 23 DF, p-value: 1.077e-10
# install.packages("olsrr")
library(olsrr)
##
## Attaching package: 'olsrr'
# a)
forward <- ols_step_forward_p(model, details = T, p_val = 0.1)
# b)
backward <- ols_step_backward_p(model, details = T, p_val = 0.1)
# c)
stepwise <- ols_step_both_p(model, details = T, p_val = 0.1)
# d)
all_models <- ols_step_all_possible(model)
all_models
# e)
# Best model is x1+x2+x3+x4
models_best <- lm(y ~ x1 + x2 + x3 + x4, data = data)
summary(models_best)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.322 -2.639 0.025 4.786 16.003
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 270.21013 88.21060 3.063 0.00534 **
## x1 0.05156 0.02685 1.920 0.06676 .
## x2 2.95141 1.23167 2.396 0.02471 *
## x3 5.33861 0.91506 5.834 5.13e-06 ***
## x4 -21.11940 2.36936 -8.914 4.42e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.17 on 24 degrees of freedom
## Multiple R-squared: 0.8909, Adjusted R-squared: 0.8727
## F-statistic: 48.99 on 4 and 24 DF, p-value: 3.327e-11
models_best
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = data)
##
## Coefficients:
## (Intercept) x1 x2 x3 x4
## 270.21013 0.05156 2.95141 5.33861 -21.11940
ols_mallows_cp(models_best, model)
## [1] 5.787266
# Cp = 5.78, which is approximately equal to p = 5, hence the model with
# x1, x2, x3, x4 is the best.
all_models$result
# Q2)
# install.packages("MPV")
library(MPV)
data <- table.b15
View(data)
# install.packages("olsrr")
library(olsrr)
model <- lm(Mort ~ Precip + Educ + Nonwhite + Nox + SO2, data = data)
summary(model)
##
## Call:
## lm(formula = Mort ~ Precip + Educ + Nonwhite + Nox + SO2, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -91.523 -19.628 -3.563 15.873 91.835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 996.42015 91.66874 10.870 3.29e-15 ***
## Precip 1.41075 0.69032 2.044 0.045884 *
## Educ -14.88048 7.01907 -2.120 0.038617 *
## Nonwhite 3.19373 0.62171 5.137 3.94e-06 ***
## Nox -0.10524 0.13476 -0.781 0.438272
## SO2 0.35407 0.09076 3.901 0.000267 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37.1 on 54 degrees of freedom
## Multiple R-squared: 0.6744, Adjusted R-squared: 0.6442
## F-statistic: 22.37 on 5 and 54 DF, p-value: 4.47e-12
# Adjusted r2 = 0.6442
all_models$result
# The best model is Mort ~ Precip + Educ + Nonwhite + SO2, as its adjusted R^2
# is the highest and its RMSE is the lowest.
##
## Call:
## lm(formula = Mort ~ Precip + Educ + Nonwhite + SO2, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93.600 -20.499 -2.443 17.891 92.521
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 995.82238 91.33980 10.902 2.31e-15 ***
## Precip 1.63505 0.62550 2.614 0.011522 *
## Educ -15.56968 6.93862 -2.244 0.028883 *
## Nonwhite 3.09979 0.60779 5.100 4.33e-06 ***
## SO2 0.32634 0.08323 3.921 0.000247 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36.97 on 55 degrees of freedom
## Multiple R-squared: 0.6707, Adjusted R-squared: 0.6468
## F-statistic: 28.01 on 4 and 55 DF, p-value: 1.052e-12
# Adjusted R2 = 0.6468
ols_mallows_cp(models_best, model)
## [1] 4.609803
# Mallows CP = 4.609
# Q3
data <- table.b11
str(data)
##
## Call:
## lm(formula = Quality ~ Clarity + Aroma + Body + Flavor + Oakiness +
## Region, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80824 -0.58413 -0.02081 0.48627 1.70909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.81437 1.96944 3.968 0.000417 ***
## Clarity 0.01705 1.45627 0.012 0.990736
## Aroma 0.08901 0.25250 0.353 0.726908
## Body 0.07967 0.26772 0.298 0.768062
## Flavor 1.11723 0.24026 4.650 6.25e-05 ***
## Oakiness -0.34644 0.23301 -1.487 0.147503
## Region2 -1.51285 0.39227 -3.857 0.000565 ***
## Region3 0.97259 0.51017 1.906 0.066218 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9154 on 30 degrees of freedom
## Multiple R-squared: 0.8376, Adjusted R-squared: 0.7997
## F-statistic: 22.1 on 7 and 30 DF, p-value: 3.295e-10
# Adjusted R2 = 0.7997
all_models$result
##
## Call:
## lm(formula = Quality ~ Flavor + Oakiness + Region, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.81290 -0.59794 0.03423 0.42452 1.71484
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.1208 1.0164 7.990 3.23e-09 ***
## Flavor 1.1920 0.1772 6.727 1.15e-07 ***
## Oakiness -0.3183 0.2039 -1.561 0.128060
## Region2 -1.5155 0.3614 -4.193 0.000194 ***
## Region3 1.0935 0.4009 2.728 0.010130 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8763 on 33 degrees of freedom
## Multiple R-squared: 0.8363, Adjusted R-squared: 0.8164
## F-statistic: 42.14 on 4 and 33 DF, p-value: 1.595e-12
ols_mallows_cp(models_best, model)
## [1] 2.240659
# Q4
# All-possible-subsets selection on table.b14, then variable transformations.
m4 <- lm(y ~ ., data = table.b14)
all_models <- ols_step_all_possible(m4)
View(all_models$result)
# Best model is: y ~ x1 + x2 + x3 + x4
m4.1 <- lm(y ~ x1 + x2 + x3 + x4, data = table.b14)
plot(m4.1)
# Errors do not have constant variance
data <- table.b14[-2, ]  # drop observation 2 before transforming
t14 <- data.frame(
  y_  = log(data$y),
  x1_ = 1 / sqrt(data$x1),
  x2_ = sqrt(data$x2),
  x3_ = 1 / sqrt(data$x3),
  x4_ = sqrt(data$x4),
  x5_ = data$x5
)
View(t14)
m4 <- lm(y_ ~ ., data = t14)
# a) all-possible-subsets on the transformed predictors
all_models <- ols_step_all_possible(m4)
View(all_models$result)
# b) refit the chosen subset and check residual diagnostics
m4.2 <- lm(y_ ~ x1_ + x2_ + x3_ + x4_, data = t14)
par(mfrow = c(2, 2))
plot(m4.2)