Cb161 Lab Manual
Cb161 Lab Manual
OF
CB – 161(R-18)
STATISTICAL METHODS LAB with R MANUAL
I/IV B.Tech - CSBS (2nd – Semester)
0
LABORATORY MANUAL
1
1. Course Objectives:
1. The knowledge to use R for statistical programming, computation, modelling and graphics.
2. The skill to write functions and use R in an efficient way.
3. The ability to fit some basic types of statistical models using R.
4. The idea to expand the knowledge of R on their own.
2.Course Outcomes
On completion of this course, students will be able to:
Introduction to R
Functions
Control flow and Loops
Working with Vectors and Matrices
Reading in Data
Writing Data
Working with Data
Manipulating Data
Simulation
Linear model
Data Frame
Graphics in R
4. TEXT BOOKS :
2
5. PRE – REQUISITES
CB151– C Programming.
2.Measures of dispersion
a)Range b)Quartile deviation c)Mean deviation d)Standard deviation 3
e) Coefficient of Variation
LAB CYCLE-II
4.Curve fitting
a)Straight line b)Parabola c) $Y = aX^{b}$ d) $Y = ab^{X}$ e) $Y = ae^{bX}$ 6
5.ANOVA
a)one-way classification b)two-way classification 3
6.Time series
a)Moving averages b)ARIMA 3
LAB CYCLE-III
7.Goodness of fit
a)Binomial b)Poisson c)Normal 6
8.Parametric tests
a) t-test for one-mean b) t-test for two means c) paired t-test d) F-test 6
3
7. EVALUATION METHODS :
PO1 PO2 PO3 PO4 PO5 PO6 PO7 PO8 PO9 PO10 PO11 PO12
CO1 3 3 3
CO2 2 2 2
CO3 3 3 2
CO4 3 2 3
4
SOLUTIONS
# Median ----
# Reads a numeric vector from the console, prints it in ascending order and
# prints its median: the middle value for an odd count, or the average of the
# two middle values for an even count.
print("Enter values in a vector")
x <- scan()
# sort() replaces a hand-written exchange sort whose 1:(n - 1) loop counted
# 1, 0 for a single value and then failed on a missing comparison.
x <- sort(x)
n <- length(x)
if (n == 0) {
  stop("no values were entered", call. = FALSE)
}
print(x)
if (n %% 2 == 0) {
  med <- (x[n %/% 2] + x[n %/% 2 + 1]) / 2
} else {
  med <- x[(n + 1) %/% 2]
}
print(med)
# Mode ----
# Reads a numeric vector from the console and prints its mode (the most
# frequent value; the largest such value when several are tied), first from
# hand-computed counts and then with base-R helpers.
print("Enter values in a vector")
x <- scan()
n <- length(x)
if (n == 0) {
  stop("no values were entered", call. = FALSE)
}
# counts[i] is the number of times x[i] occurs from position i onwards, so the
# first occurrence of every value carries that value's full frequency.
counts <- vapply(
  seq_len(n),
  function(i) sum(x[i:n] == x[i]),
  numeric(1)
)
print(counts)
big <- max(counts)
# Among the positions that reach the highest count, keep the largest value.
pos <- which(counts == big)
pos <- pos[which.max(x[pos])]
print("The MODE using program is")
print(x[pos])
print("The MODE using built in function is")
# Tabulate the distinct values: picking the largest duplicated value would not
# necessarily give the most frequent one.
distinct <- unique(x)
freq <- tabulate(match(x, distinct))
print(max(distinct[freq == max(freq)]))
# Geometric mean ----
# Reads values from the console and prints their geometric mean twice: as the
# n-th root of the running product, and as exp() of the mean logarithm.
print("Enter values in an array")
x <- scan()
n <- length(x)
if (n == 0) {
  stop("no values were entered", call. = FALSE)
}
# The accumulator is not called `prod`, so base::prod() stays unmasked.
product <- 1
for (i in seq_along(x)) {
  product <- product * x[i]
}
gm <- product^(1 / n)
print("Geometric Mean")
print(gm)
print("GM using built in function")
print(exp(mean(log(x))))
# Harmonic mean ----
# Reads values from the console and prints n divided by the sum of their
# reciprocals, once with an explicit loop and once with vectorised sum().
print("Enter values in an array")
x <- scan()
n <- length(x)
if (n == 0) {
  stop("no values were entered", call. = FALSE)
}
# The accumulator is not called `sum`, so base::sum() stays unmasked.
recip_total <- 0
for (i in seq_along(x)) {
  recip_total <- recip_total + 1 / x[i]
}
hm <- n / recip_total
print("Harmonic mean of the given values")
print(hm)
print("Harmonic mean value using built in function")
print(n / sum(1 / x))
6
Experiment No. 2: Measures of dispersion
a)Range b)Quartile deviation c)Mean deviation d)Standard deviation e)Coeff. of Variation
SOLUTION :
# Range ----
# Reads a numeric vector from the console and prints its range (largest minus
# smallest value) three ways: from the sorted data, with base-R helpers, and
# from a single pass that tracks the running maximum and minimum.
print("Enter values in a vector")
x <- scan()
x <- sort(x)
n <- length(x)
if (n == 0) {
  stop("no values were entered", call. = FALSE)
}
print(x)
# Not named `range`, so base::range() stays unmasked.
data_range <- x[n] - x[1]
print(" range of the given data")
print(data_range)
print(" range of the given data using built in ")
# range() returns c(min, max); diff() turns that pair into the spread.
print(diff(range(x)))
# another way
big <- x[1]
small <- x[1]
# seq_len(n)[-1] is empty for a single value, where 2:n would count 2, 1.
for (i in seq_len(n)[-1]) {
  if (big < x[i]) big <- x[i]
  if (small > x[i]) small <- x[i]
}
print(big)
print(small)
rang <- big - small
print(rang)
# Quartile deviation ----
# Reads a numeric vector from the console, sorts it, and prints the third
# quartile, the first quartile and the quartile deviation (Q3 - Q1) / 2.
#
# Position rule used for the sorted data:
#   * even n:                 Q3 is item 3n %/% 4, Q1 is item n %/% 4
#   * odd n, 4 divides n + 1: Q3 is item 3(n + 1) / 4, Q1 is item (n + 1) / 4
#   * any other odd n:        each quartile is the average of the item at the
#                             truncated position and the item after it
print("Enter values in a vector")
x <- scan()
# sort() replaces a hand-written exchange sort that failed on a single value.
x <- sort(x)
n <- length(x)
if (n < 3) {
  # With fewer than three values the rule above indexes item 0 or an item
  # past the end of the data.
  stop("at least three values are required", call. = FALSE)
}
print(x)
if (n %% 2 == 0) {
  q3 <- x[(3 * n) %/% 4]
  print(q3)
  q1 <- x[n %/% 4]
  print(q1)
} else if ((n + 1) %% 4 == 0) {
  q3 <- x[(3 * (n + 1)) %/% 4]
  print(q3)
  q1 <- x[(n + 1) %/% 4]
  print(q1)
} else {
  q3_pos <- (3 * (n + 1)) %/% 4
  q1_pos <- (n + 1) %/% 4
  q3 <- (x[q3_pos] + x[q3_pos + 1]) / 2
  print(q3)
  q1 <- (x[q1_pos] + x[q1_pos + 1]) / 2
  print(q1)
}
qd <- (q3 - q1) / 2
print(" Quartile deviation using program:")
print(qd)
# Quartile deviation taken from the medians of the lower and upper halves of
# the sorted data; for an odd count the middle observation is in neither half.
# Prints the third quartile, then the first quartile, and returns the
# quartile deviation (Q3 - Q1) / 2 invisibly.
quart <- function(x) {
  ordered <- sort(x)
  size <- length(ordered)
  half <- size %/% 2
  upper_q <- median(ordered[(size - half + 1):size])
  print(upper_q)
  lower_q <- median(ordered[1:half])
  print(lower_q)
  invisible((upper_q - lower_q) / 2)
}
# Print the quartile deviation returned by quart() for the current data `x`;
# quart() itself also prints the third and first quartiles it finds.
print(" Quartile deviation using built ins:")
print(quart(x))
# Mean deviation ----
# Reads values from the console and prints their mean, the total absolute
# deviation from the mean, and the mean deviation (that total divided by n).
print("Enter values in an array")
x <- scan()
n <- length(x)
if (n == 0) {
  stop("no values were entered", call. = FALSE)
}
sumx <- sum(x)
# Not named `mean`, so base::mean() stays unmasked.
avg <- sumx / n
print(avg)
sumdev <- sum(abs(x - avg))
print(sumdev)
md <- sumdev / n
print("Mean deviation")
print(md)
print("md value using built in function")
# mad() is the *median* absolute deviation about the median (scaled by
# 1.4826 by default), so it cannot confirm the value above; the mean absolute
# deviation about the mean is built from mean() and abs() instead.
result <- mean(abs(x - mean(x)))
print(result)
# Standard deviation ----
# Reads values from the console and prints the population standard deviation
# sqrt(sum(x^2) / n - mean^2), followed by the same quantity obtained from
# the built-in sd().
print("Enter values in an array")
x <- scan()
n <- length(x)
if (n == 0) {
  stop("no values were entered", call. = FALSE)
}
sumx <- sum(x)
sumxx <- sum(x * x)
# Not named `mean`, so base::mean() stays unmasked.
avg <- sumx / n
standev <- sqrt((sumxx / n) - (avg * avg))
print("Standard deviation")
print(standev)
print("SD value using built in function")
# sd() divides by n - 1; rescaling by sqrt((n - 1) / n) gives the divide-by-n
# form computed above, so the two printed values agree.
print(sd(x) * sqrt((n - 1) / n))
# Coefficient of variation ----
# Reads values from the console and prints the population standard deviation,
# the mean, and the coefficient of variation 100 * sd / mean, followed by the
# same coefficient obtained from the built-in sd() and mean().
print("Enter values in an array")
x <- scan()
n <- length(x)
if (n == 0) {
  stop("no values were entered", call. = FALSE)
}
sumx <- sum(x)
# `avg` and `std_dev` are used instead of `mean` and `sd` so the base
# functions of those names stay unmasked.
avg <- sumx / n
sumsqdev <- sum((x - avg)^2)
std_dev <- sqrt(sumsqdev / n)
print("Standard deviation is")
print(std_dev)
print("mean value is ")
print(avg)
cv <- (std_dev / avg) * 100
print("Coefficient of variation")
print(cv)
print("Coefficient of variation using built-ins")
# sd() divides by n - 1; rescale it to the divide-by-n form used above so the
# two printed coefficients agree.
print((sd(x) * sqrt((n - 1) / n) / mean(x)) * 100)
9
Experiment No. 3: Correlation & Regression
a)Correlation coefficient b)Regression lines c)Rank Correlation
d)Multiple correlation coefficient e)Multiple linear regression
SOLUTION:
# Correlation coefficient ----
# Builds the working table (x, y, xy, x^2, y^2 with a final row of column
# totals) and computes Pearson's r as cov(x, y) / (sd(x) * sd(y)) using the
# divide-by-n forms, then checks it against cor() and draws a scatter plot.
x <- c(7, 9, 4, 10, 6, 7, 8, 8, 5, 6)
y <- c(6, 8, 6, 10, 8, 5, 10, 7, 7, 8)
n <- length(x)
xy <- x * y
xx <- x * x
yy <- y * y
mydata <- data.frame(x, y, xy, xx, yy)
# The totals are appended as one extra row, matched to the columns by position.
sums <- list(sum(x), sum(y), sum(xy), sum(xx), sum(yy))
mydata <- rbind(mydata, sums)
print(mydata, row.names = FALSE)
meanx <- sum(x) / n
print(meanx)
meany <- sum(y) / n
print(meany)
cov <- (sum(xy) / n) - (meanx * meany)
print(cov)
sdx <- sqrt((sum(xx) / n) - (meanx^2))
print(sdx)
sdy <- sqrt((sum(yy) / n) - (meany^2))
# Report the standard deviation of y (not sdx a second time).
print(sdy)
corcof <- cov / sdx / sdy
print(" Correlation coefficient using program")
print(round(corcof, digits = 4))
print(" Correlation coefficient using built in")
print(cor(x, y))
plot(x, y)
# Regression lines ----
# Prints the working table (with a final row of column totals), fits the
# regression line of x on y and of y on x with lm(), and uses each line for
# one prediction.
x <- c(7, 9, 4, 10, 6, 7, 8, 8, 5, 6)
y <- c(6, 8, 6, 10, 8, 5, 10, 7, 7, 8)
n <- length(x)
xy <- x * y
xx <- x * x
yy <- y * y
mydata <- data.frame(x, y, xy, xx, yy)
sums <- list(sum(x), sum(y), sum(xy), sum(xx), sum(yy))
mydata <- rbind(mydata, sums)
print(mydata, row.names = FALSE)
print("regression line x on y")
result1 <- lm(x ~ y)
print(result1)
print("regression line y on x")
result2 <- lm(y ~ x)
print(result2)
# The predictions get their own names so the data vectors x and y stay intact;
# unname() drops the "(Intercept)" label inherited from coef().
# to find the value of x when y=23
x_at_y23 <- unname(coef(result1)[1] + coef(result1)[2] * 23)
print(x_at_y23)
# to find the value of y when x=45
y_at_x45 <- unname(coef(result2)[1] + coef(result2)[2] * 45)
print(y_at_x45)
# Partial and multiple correlation coefficients ----
# Computes the three pairwise correlation coefficients of x, y and z and
# combines them into the partial correlation of x and y given z and the
# multiple correlation of x on y and z.
x <- c(7, 9, 4, 10, 6, 7, 8, 8, 5, 6)
y <- c(6, 8, 6, 10, 8, 5, 10, 7, 7, 8)
z <- c(1, 2, 3, 4, 5, 6, 9, 7, 8, 9)
n <- length(x)

# Pearson correlation coefficient of two equal-length numeric vectors.
# Prints the working table (x, y, xy, x^2, y^2 plus a row of column totals)
# and the coefficient, and returns the coefficient rounded to 4 decimals
# (invisibly). The sample size is taken from `x` itself rather than from a
# global variable, so the function works for data of any length.
corcof <- function(x, y) {
  size <- length(x)
  xy <- x * y
  xx <- x * x
  yy <- y * y
  mydata <- data.frame(x, y, xy, xx, yy)
  sums <- list(sum(x), sum(y), sum(xy), sum(xx), sum(yy))
  mydata <- rbind(mydata, sums)
  cat("\n")
  print(mydata, row.names = FALSE)
  meanx <- sum(x) / size
  meany <- sum(y) / size
  cov_xy <- (sum(xy) / size) - (meanx * meany)
  sdx <- sqrt((sum(xx) / size) - (meanx^2))
  sdy <- sqrt((sum(yy) / size) - (meany^2))
  r <- round(cov_xy / sdx / sdy, digits = 4)
  print(" Correlation coefficient using program")
  print(r)
  invisible(r)
}

r12 <- corcof(x, y)
r23 <- corcof(y, z)
r13 <- corcof(x, z)
cat("\n\n\n")
print("partial correlation coefficient")
pcof <- (r12 - (r13 * r23)) / (sqrt(1 - (r13^2)) * sqrt(1 - (r23^2)))
print(pcof)
print("multiple correlation coefficient")
mcof <- sqrt((r12^2 + r13^2 - 2 * r12 * r13 * r23) / (1 - r23^2))
print(mcof)
# Multiple linear regression of x3 on x1 and x2 ----
# Prints the table of products needed for the normal equations (with a final
# row of column totals) and fits x3 = b0 + b1 * x1 + b2 * x2 with lm().

# Alternative data set (uncomment to use it instead of the one below):
# x1 <- c(3, 5, 6, 8, 12, 14)
# x2 <- c(16, 10, 7, 4, 3, 2)
# x3 <- c(90, 72, 54, 42, 30, 12)
x1 <- c(37, 45, 38, 42, 31)
x2 <- c(4, 0, 5, 2, 4)
x3 <- c(71200, 66800, 75000, 70300, 65400)
# The sample size comes from this section's own data.
n <- length(x1)
x1x2 <- x1 * x2
x1x3 <- x1 * x3
x2x3 <- x2 * x3
x1x1 <- x1 * x1
x2x2 <- x2 * x2
mydata <- data.frame(x1, x2, x3, x1x2, x1x3, x2x3, x1x1, x2x2)
print(mydata)
sums <- list(
  sum(x1), sum(x2), sum(x3), sum(x1x2),
  sum(x1x3), sum(x2x3), sum(x1x1), sum(x2x2)
)
mydata <- rbind(mydata, sums)
print(mydata, row.names = FALSE)
result1 <- lm(x3 ~ x1 + x2)
print(result1)
12
Experiment No. 4: Curve fitting
a)Straight line b)Parabola c) $Y = aX^{b}$ d) $Y = ab^{X}$ e) $Y = ae^{bX}$
SOLUTION:
# Parabola fit: y = a + b * x + c * x^2 ----
# Prints the table of sums for the normal equations (with a final row of
# column totals), fits the parabola with lm(), prints the fitted values and
# draws the fitted curve over the scatter plot.
x <- c(0, 1, 2, 3, 4)
y <- c(1, 1.8, 1.3, 2.5, 6.3)
n <- length(x)
xy <- x * y
xx <- x * x
xxx <- x^3
xxxx <- x^4
xxy <- x^2 * y
mydata <- data.frame(x, y, xy, xx, xxx, xxxx, xxy)
print(mydata)
sums <- list(sum(x), sum(y), sum(xy), sum(xx), sum(xxx), sum(xxxx), sum(xxy))
mydata <- rbind(mydata, sums)
print(mydata, row.names = FALSE)
parabola <- lm(y ~ x + I(x^2))
print(parabola)
b <- unname(coef(parabola))
f <- b[1] + (b[2] * x) + (b[3] * x * x)
print(f)
plot(x, y)
# `to` sets the right-hand end of the curve; curve()'s `n` argument is the
# number of evaluation points, so it is left at its default.
curve(b[1] + (b[2] * x) + (b[3] * x * x),
  from = min(x), to = max(x), add = TRUE
)
# The same curve obtained through predict().
curve(predict(parabola, newdata = data.frame(x = x)),
  from = min(x), to = max(x), add = TRUE
)
13
# Power curve fit: y = a * x^b ----
# Taking log10 of both sides gives the straight line
# log10(y) = log10(a) + b * log10(x), which is fitted with lm(). Prints the
# working table (rounded to 4 decimals, with a final row of totals), the
# estimates of a and b, and the fitted values, then draws the fitted curve.
x <- c(1, 2, 3, 4, 6, 8)
y <- c(2.4, 3, 3.6, 4, 5, 6)
n <- length(x)
logx <- round(log10(x), digits = 4)
logy <- round(log10(y), digits = 4)
logxlogy <- round(logx * logy, digits = 4)
logxlogx <- round(logx * logx, digits = 4)
mydata <- data.frame(logx, logy, logxlogy, logxlogx)
colnames(mydata) <- c("X=logx", "Y=logy", "XY", "XX")
print(mydata)
sums <- list(
  sum(logx),
  sum(logy),
  round(sum(logx * logy), digits = 4),
  round(sum(logx * logx), digits = 4)
)
mydata <- rbind(mydata, sums)
print(mydata, row.names = FALSE)
power <- lm(log10(y) ~ log10(x))
print(power)
# Back-transform the intercept; the slope is the exponent b itself.
alpha <- 10^(coef(power)[1])
beta <- coef(power)[2]
print(round(alpha, digits = 4))
print(round(beta, digits = 4))
f <- alpha * (x^beta)
print(f)
plot(x, y)
# `to` sets the right-hand end of the curve; curve()'s `n` argument is the
# number of evaluation points, so it is left at its default.
curve(alpha * (x^beta), from = min(x), to = max(x), add = TRUE)
# Exponential curve fit: y = a * b^x ----
# Taking log10 of both sides gives the straight line
# log10(y) = log10(a) + x * log10(b), which is fitted with lm(). Prints the
# working table (with a final row of totals), the estimates of a and b, and
# the fitted values, then draws the fitted curve.
x <- c(1, 1.5, 2, 2.5, 3, 3.5, 4)
y <- c(1, 1.3, 1.6, 2, 2.7, 3.4, 4.1)
n <- length(x)
logy <- round(log10(y), digits = 4)
xlogy <- round(x * logy, digits = 4)
xx <- x * x
mydata <- data.frame(x, logy, xlogy, xx)
colnames(mydata) <- c("X=x", "Y=logy", "XY", "XX")
sums <- list(sum(x), sum(logy), sum(x * logy), sum(xx))
mydata <- rbind(mydata, sums)
print(mydata, row.names = FALSE)
power <- lm(log10(y) ~ x)
print(power)
# Both coefficients are on the log10 scale, so both are back-transformed.
alpha <- 10^(coef(power)[1])
beta <- 10^(coef(power)[2])
print(alpha)
print(beta)
f <- alpha * (beta^x)
print(f)
plot(x, y)
# `to` sets the right-hand end of the curve; curve()'s `n` argument is the
# number of evaluation points, so it is left at its default.
curve(alpha * (beta^x), from = min(x), to = max(x), add = TRUE)
14
# Exponential curve fit: y = a * e^(b * x) ----
# Taking log10 of both sides gives the straight line
# log10(y) = log10(a) + (b * log10(e)) * x, which is fitted with lm(). Prints
# the working table (with a final row of totals), the estimates of a and b,
# and the fitted values, then draws the fitted curve.
x <- c(1, 2, 3, 4, 5)
y <- c(1.8, 5.1, 8.9, 14.1, 19.8)
n <- length(x)
logy <- round(log10(y), digits = 4)
xlogy <- round(x * logy, digits = 4)
xx <- x * x
mydata <- data.frame(x, logy, xlogy, xx)
colnames(mydata) <- c("X=x", "Y=logy", "XY", "XX")
print(mydata)
sums <- list(sum(x), sum(logy), sum(x * logy), sum(xx))
mydata <- rbind(mydata, sums)
print(mydata, row.names = FALSE)
power <- lm(log10(y) ~ x)
print(power)
alpha <- 10^(coef(power)[1])
# slope = b * log10(e), so b = slope / log10(e) = slope * log(10); this is the
# exact form of dividing by the rounded constant 0.4343.
beta <- coef(power)[2] * log(10)
print(alpha)
print(beta)
# The model is a * exp(b * x): the exponent is the product b * x.
f <- alpha * exp(beta * x)
print(f)
plot(x, y)
# `to` sets the right-hand end of the curve; curve()'s `n` argument is the
# number of evaluation points, so it is left at its default.
curve(alpha * exp(beta * x), from = min(x), to = max(x), add = TRUE)
15
Experiment No. 5: ANOVA
a)one-way classification b)two-way classification
SOLUTION:
16
Experiment No. 6: Time series
a)Moving averages b)ARIMA
SOLUTION:
17
# Moving averages ----
# Smooths a yearly series with a moving average whose length is read from the
# console, prints the table of averages together with the lagged and current
# trend values, and plots the series with the trend line in blue.
# An odd length gives an average centred on a year directly; an even length
# gives averages that fall between two years, so each neighbouring pair of
# averages is averaged again to centre the result on a year.

# Data for the five-yearly example (uncomment to use instead of the data below):
# year <- c(1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976,
#           1977, 1978, 1979, 1980, 1981, 1982)
# timeseries <- c(19.3, 20.9, 17.8, 16.1, 17.6, 17.8, 18.3, 17.3, 21.4, 19.3,
#                 18.1, 19.5, 19.2, 22.2, 20.9, 21.5, 21.9)

# Data for the four-yearly example:
year <- c(1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982)
timeseries <- c(2204, 2500, 2360, 2680, 2424, 2634, 2904, 3098, 3172, 2952, 3248, 3172)
print("enter the length of moving average")
n <- scan()
if (length(n) != 1 || n < 1) {
  stop("enter a single positive length for the moving average", call. = FALSE)
}
# stats::filter() is named explicitly so that a filter() from an attached
# package cannot mask it.
ma <- stats::filter(timeseries, rep(1 / n, n), sides = 2)
if (n %% 2 == 0) {
  # For an even length, ma[i] averages observations i - 1 ... i + n - 2 and so
  # sits half a step after year i. Averaging ma[i - 1] and ma[i] (sides = 1)
  # centres the value on year i.
  adjma <- stats::filter(ma, rep(1 / 2, 2), sides = 1)
  mydata <- data.frame(year, timeseries, ma, adjma)
  trend <- adjma
} else {
  mydata <- data.frame(year, timeseries, ma)
  trend <- ma
}
print(mydata)
u <- trend[!is.na(trend)]
l <- length(u)
# Lagged trend values: every available value except the last.
utminus1 <- u[seq_len(l - 1)]
print(utminus1)
# Current trend values: every available value except the first.
ut <- u[-1]
print(ut)
# Axis limits are derived from the data so that either data set fits.
plot(year, timeseries,
  type = "l", col = 1,
  xlim = range(year) + c(-1, 1), ylim = range(pretty(timeseries)),
  main = "Moving averages", xlab = "year", ylab = "values"
)
lines(year, as.numeric(trend), col = "blue")
18
Experiment No. 7: Goodness of fit
a)Binomial b)Poisson c)Normal d)Contingency table
SOLUTION:
19
# Goodness of fit for a normal distribution ----
# x and y hold the lower and upper limits of each class and f the observed
# frequencies. The mean and standard deviation are estimated from the grouped
# data, the expected frequencies are taken from the fitted normal curve, and
# observed and expected frequencies are compared with a chi-squared test.
x <- seq(60, 100, by = 5)
y <- seq(65, 105, by = 5)
f <- c(0, 3, 21, 150, 335, 326, 135, 26, 4)
xi <- (x + y) / 2 # class mid-points
fx <- f * xi
fxx <- fx * xi
sumfx <- sum(fx)
print(sumfx)
sumfxx <- sum(fxx)
sumf <- sum(f)
print(sumf)
m <- sumfx / sumf
print(m)
sd <- sqrt((sumfxx / sumf) - (m * m))
print(sd)
# Class probability = normal CDF at the upper limit minus the CDF at the lower
# limit; all three columns are rounded to 5 decimals before further use.
u <- pnorm(y, mean = m, sd = sd)
l <- pnorm(x, mean = m, sd = sd)
pr <- round(u - l, digits = 5)
u <- round(u, digits = 5)
l <- round(l, digits = 5)
fee <- pr * sumf
fe <- round(fee, digits = 0)
mydata <- data.frame(x, y, xi, f, fx, fxx, u, l, pr, fee, fe)
print(mydata)
# Totals row: only the frequency-based columns have a meaningful total.
sums <- list(NA, NA, NA, sumf, sumfx, sumfxx, NA, NA, NA, NA, sum(fe))
mydata <- rbind(mydata, sums)
print(mydata, row.names = FALSE)
# NOTE(review): chisq.test() is called on all k classes with k - 1 degrees of
# freedom; m and sd were estimated from the same data and some expected counts
# are small - confirm whether pooling classes or k - 3 degrees of freedom is
# intended.
result <- chisq.test(f, p = pr, rescale.p = TRUE)
print(result)
20
# Chi-squared test on a contingency table ----
# Rows are employee categories and columns are bonus schemes. Prints the
# table, the table with column totals, the table with row totals, the
# chi-squared test and the expected counts.
m <- as.table(rbind(
  c(190, 243, 197),
  c(82, 44, 44),
  c(23, 78, 34),
  c(5, 12, 8)
))
# Each label is written on one line so that no line break ends up inside it.
dimnames(m) <- list(
  Empcategory = c("Labour", "Clerks", "Technicians", "Executives"),
  BonusSchemes = c("Type1", "Type2", "Type3")
)
print(m)
csum <- colSums(m)
rsum <- rowSums(m)
mytable <- rbind(m, csum)
print(mytable)
mytable <- cbind(m, rsum)
print(mytable)
test <- chisq.test(m)
print(test)
print(test$expected, digits = 3)
21
Experiment No. 8: Parametric tests
a) t-test for one-mean b) t-test for two means c) paired t-test d) F-test
SOLUTION:
# t-test for the difference of two means (pooled standard deviation) ----
# Prints both sample means, the combined standard deviation, the calculated t
# and the table values of t for two-tailed and one-tailed tests at the level
# of significance read from the console.

# Alternative data set:
# x <- c(8260, 8130, 8350, 8070, 8340)
# y <- c(7950, 7890, 7900, 8140, 7920, 7840)
x <- c(59, 68, 44, 71, 63, 46, 69, 54, 48)
y <- c(50, 36, 62, 52, 70, 41)
print("enter the level of significance")
alpha <- scan()
n1 <- length(x)
n2 <- length(y)
dof <- n1 + n2 - 2
# Pooled sum of squares: each sample variance weighted by its own n - 1.
pooled_ss <- ((n1 - 1) * sd(x)^2) + (n2 - 1) * sd(y)^2
pooled_sd <- sqrt(pooled_ss / dof)
std_error <- pooled_sd * sqrt((1 / n1) + (1 / n2))
tvalue <- (mean(x) - mean(y)) / std_error
print("mean of x:")
print(mean(x))
print("mean of y:")
print(mean(y))
print("Combined SD")
print(pooled_sd)
print("Calculated value of t:")
print(round(tvalue, digits = 4))
print("Table value for two-tailed test:")
tablevalue <- qt(1 - alpha / 2, df = dof)
print(round(tablevalue, digits = 3))
print("Table value for one-tailed test:")
tablevalue <- qt(1 - alpha, df = dof)
print(round(tablevalue, digits = 3))
22
# Paired t-test for two means ----
# Prints the mean and standard deviation of the pairwise differences, the
# result of the paired t.test(), and the table values of t for two-tailed and
# one-tailed tests at the level of significance read from the console.
x <- c(45, 73, 46, 124, 33, 57, 83, 34, 26, 17)
y <- c(36, 60, 44, 119, 35, 51, 77, 29, 24, 11)
print("enter the level of significance")
alpha <- scan()
d <- x - y
n <- length(d)
dof <- n - 1
dbar <- mean(d)
std <- sd(d)
values <- c(dbar, std)
print("values of dbar and standard deviation of differences")
print(round(values, digits = 4))
paired_result <- t.test(x, y, paired = TRUE)
print(paired_result)
print("Table value for two-tailed test:")
tablevalue <- qt(1 - alpha / 2, df = dof)
print(round(tablevalue, digits = 4))
print("Table value for one-tailed test:")
tablevalue <- qt(1 - alpha, df = dof)
print(round(tablevalue, digits = 3))
23
Experiment No. 9: Graphical representation of data
a) Bar plot b)Frequency polygon c)Histogram d)Pie chart e) scatter plot
SOLUTION
# Graphical representation of data ----
# Every title and label is written on one line so that no line break ends up
# inside the text drawn on the plot.

# Bar plot
H <- c(5, 15, 17, 18, 16, 15)
M <- c(1980, 1981, 1982, 1983, 1984, 1985)
barplot(H,
  xlab = "Year", ylab = "Profit", ylim = c(0, 20),
  col = rainbow(6), names.arg = M,
  main = "RVRJC PHARMACEUTICAL FIRM", border = "red"
)

# Frequency polygon
v <- c(15, 35, 20, 10, 5, 15, 20, 15, 12, 13)
plot(v,
  type = "o", xlab = "Year", ylab = "Profit",
  xlim = c(1, 10), ylim = c(0, 40),
  col = "green", main = "RVRJC PHARMACEUTICAL FIRM"
)

# Histogram
v <- c(3, 5, 6, 19, 9, 18, 23, 67, 11, 10, 44, 45, 54, 37, 26, 8, 5, 1)
# Eight of the values are at most 10, so the tallest bar has height 8 and the
# y axis has to reach beyond 5 to show it in full.
hist(v,
  main = "STUDENTS MARKS", xlab = "Weight", xlim = c(0, 70),
  ylab = "no.of branches", ylim = c(0, 10), col = rainbow(10)
)

# Pie chart
x <- c(5086, 3179, 1429, 152, 257, 69)
lbls <- c(
  "Total Income", "Interest Paid", "Salaries", "Rent",
  "Others", "Profit"
)
pie(x,
  labels = lbls, col = rainbow(6),
  main = "Income and Expenditure of a Bank"
)

# Scatter plot
x <- c(65, 68, 69, 70, 75, 65, 87, 56, 54)
y <- c(75, 76, 68, 79, 77, 69, 80, 60, 65)
plot(x, y, xlab = "Weight", ylab = "height", main = "Weight vs Heigts")
24