0% found this document useful (0 votes)
23 views18 pages

IntroR 2

Uploaded by

moad77181
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
23 views18 pages

IntroR 2

Uploaded by

moad77181
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 18

> x=9

> x1=11

> 1x=12
Error: unexpected symbol in "1x"

> x2="Rahmath"

> x2
[1] "Rahmath"

> 11+24
[1] 35

> 7*9
[1] 63

> y=6

> x+y
[1] 15

> x^2 + y^2


[1] 117

> x-y
[1] 3

> sqrt(x)
[1] 3

> log(x)
[1] 2.197225

> ?log #help for log

> log10(x)
[1] 0.9542425

> #This is a comment line

> x1=c(1,3,5,7,9)

> gender=c("male","female")

> 2:7
[1] 2 3 4 5 6 7
> seq(from=1,to=7,by=2)
[1] 1 3 5 7

> rep(1,time=5)
[1] 1 1 1 1 1

> rep("rahma",times=3)
[1] "rahma" "rahma" "rahma"

> rep(1:3,time=3)
[1] 1 2 3 1 2 3 1 2 3

> x1
[1] 1 3 5 7 9

> x1[3]
[1] 5

#Clear Console (Ctrl L)

> mat=matrix(c(1,2,3,4,5,6,7,8,9),nrow=3,byrow=TRUE)
> mat
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 4 5 6
[3,] 7 8 9

> mat=matrix(c(1,2,3,4,5,6,7,8,9),nrow=3,byrow=FALSE)
> mat
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9

> mat*3
[,1] [,2] [,3]
[1,] 3 12 21
[2,] 6 15 24
[3,] 9 18 27

> mat-3
[,1] [,2] [,3]
[1,] -2 1 4
[2,] -1 2 5
[3,] 0 3 6

> mat*mat
[,1] [,2] [,3]
[1,] 1 16 49
[2,] 4 25 64
[3,] 9 36 81

> mat/6
[,1] [,2] [,3]
[1,] 0.1666667 0.6666667 1.166667
[2,] 0.3333333 0.8333333 1.333333
[3,] 0.5000000 1.0000000 1.500000

> m=mat

>m
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9

> m[2,]
[1] 2 5 8

#create an Excel file of the data and save it in csv format (comma separated values)

>x1=read.csv(file.choose(),header=T) # to import the excel csv file

>x2=read.table(file.choose(),header=T,sep=",")

#create an Excel file of the data and save it in txt format (tab delimited file)

> x1=read.delim(file.choose(),header=T)

> x1=read.table(file.choose(),header=T,sep="\t")

#To export data from R to other formats

> write.table(Book3_1_,file = "bee.csv", sep=",")


# Book3_1_ is the data object in R; file = "bee.csv" is the destination file name; sep = "," makes the output comma separated

> names(LungCapData)
[1] "LungCap" "Age" "Height" "Smoke" "Gender" "Caesarean"

> rm(x1) #To remove file or data from R workspace

> dim(LungCapData) #dimension of data file


[1] 10 6

> head(x2) # head of file….first 6 rows


>head(x2,4) # head of file: first 4 rows only (the six rows printed below are the output of head(x2))
LungCap Age Height Smoke Gender Caesarean
1 6.475 6 62.1 no male no
2 10.125 18 74.7 yes female no
3 9.550 16 69.7 no female yes
4 11.125 14 71.0 no male no
5 4.800 5 56.9 no male no
6 6.225 11 58.7 no female no

> tail(x2) # Tail of file….last 6 rows

>sample(x2,5) #x2 is a data frame, so this picks 5 of its columns at random; for 5 random rows use x2[sample(nrow(x2),5),]

>table(x2$Height)

> names(x2) # names of data


[1] "LungCap" "Age" "Height" "Smoke" "Gender" "Caesarean"

> x2[-(4:722),] # all data except rows 4 to 722


LungCap Age Height Smoke Gender Caesarean
1 6.475 6 62.1 no male no
2 10.125 18 74.7 yes female no
3 9.550 16 69.7 no female yes
723 3.850 11 60.5 yes female no
724 9.825 15 64.9 no female no
725 7.100 10 67.7 no male no

> x2[5:9,] #data from 5 to 9 rows and all columns


LungCap Age Height Smoke Gender Caesarean
5 4.800 5 56.9 no male no
6 6.225 11 58.7 no female no
7 4.950 8 63.3 no male yes
8 7.325 11 70.4 no male no
9 8.875 15 70.5 no male no

> mean(x$Height) #mean of height column in file x


[1] 64.83628

> attach(x) # makes the columns of x accessible by name without the x$ prefix; detach(x) undoes this
> mean(Height)
[1] 64.83628

> class(Age)
[1] "integer"

> class(Smoke)
[1] "character"
> class(Height)
[1] "numeric"
>x$Gender=as.factor(x$Gender) # to convert the character data to factor

> f=x[Gender=="female",] #separates the rows where Gender is female and stores them in f
> m=x[Gender=="male",] #separates the rows where Gender is male and stores them in m

> maleover15=x[Gender=='male'& Age>15,] #separates the rows where Gender is male
#and Age is over 15, and stores them in maleover15

> temp=Age>15 #logical statements


> temp[1:5]
[1] FALSE TRUE TRUE FALSE FALSE

> temp1=as.numeric(Age>15)
> temp1[1:5]
[1] 0 1 1 0 0
> malesmoke=Gender=="male" & Smoke=="yes"
> malesmoke[1:5]
[1] FALSE FALSE FALSE FALSE FALSE

> malesmoke=as.numeric(Gender=="male" & Smoke=="yes")


> malesmoke[1:5]
[1] 0 0 0 0 0

> moredata=cbind(x,malesmoke) #binds column-wise data in file x and in file malesmoke


> View(moredata)

> getwd() #get the current working directory


[1] "/Users/rahmathullabaig"

> Rwd="/Users/rahmathullabaig/TheR" #giving a name to the path of working directory


> setwd(Rwd)
> getwd()
[1] "/Users/rahmathullabaig/TheR"

> save.image("first.Rdata") # saves workspace in current directory under file name first
#can also be done using>>session>>save workspace As…
> load("first.Rdata") #loads workspace data saved in file first.Rdata
> load(file.choose())
#for R Studio preferences

# Using the 'APPLY' function in R


# read in the "StockExample.csv" data, and attach it
>StockData <- read.table(file="~/TheR/StockExample.csv", sep=",",
header=T,row.names=1)
# check the data
>StockData

# get the help menu


>?apply

# calculate the mean price of each stock


>apply(X=StockData, MARGIN=2, FUN=mean)

# calculate the mean price of each stock, removing any NAs


>apply(X=StockData, MARGIN=2, FUN=mean, na.rm=TRUE)

# store the mean in an object called AVG


>AVG <- apply(X=StockData, MARGIN=2, FUN=mean, na.rm=TRUE)
>AVG

# notice that we don't need to include "MARGIN", etc, as long


# as we enter info in the specified order
>apply(StockData, 2, mean, na.rm=TRUE)

# do the same, but using the colMeans command


>colMeans(StockData, na.rm=TRUE)

# find the MAXIMUM stock price, for each stock


>apply(X=StockData, MARGIN=2, FUN=max, na.rm=TRUE)

# find the 20th and 80th PERCENTILE, for each stock


>apply(X=StockData, MARGIN=2, FUN=quantile, probs=c(0.2, .80), na.rm=TRUE)

# create a plot of each column, using a "line"


>apply(X=StockData, MARGIN=2, FUN=plot, type="l")

# we can also send the plot function more arguments, such as


# titles, axes labels, and so forth...
>apply(X=StockData, MARGIN=2, FUN=plot, type="l", main="stock", ylab="Price",
xlab="Day")
# now let's calculate the SUM of each row (MARGIN=1)
>apply(X=StockData, MARGIN=1, FUN=sum, na.rm=TRUE)

# do the same, but with the rowSums command


>rowSums(StockData, na.rm=TRUE)

# make a nice plot of these...


>plot(apply(X=StockData, MARGIN=1, FUN=sum, na.rm=TRUE), type="l"
,ylab="Total Market Value", xlab="Day", main="Market Trend")
# and add in some nice coloured points...
>points(apply(X=StockData, MARGIN=1, FUN=sum, na.rm=TRUE),
pch=16, col="blue")

#BarPlot and Pie Chart


>load("~/TheR/first.Rdata")
> ?barplot
> attach(x)
> count=table(Gender)
> count
Gender
female male
358 367
> percent=table(Gender)/725
> percent
Gender
female male
0.4937931 0.5062069
> barplot(count)
> barplot(count, main="Title", xlab = "Gender", ylab = "Count")
> barplot(percent)
> barplot(percent, main="Title", xlab = "Gender", ylab = "Count")
> barplot(percent, main="Title", xlab = "Gender", ylab = "Count", las=1)
> barplot(percent, main="Title", xlab = "Gender", ylab = "Count", las=1, names.arg =
c("Female", "Male"))
> barplot(percent, main="Title", xlab = "Gender", ylab = "Count", las=1, names.arg =
c("Female", "Male"), horiz = T)
> pie(count,main = "Title")
> pie(count,main = "Title", names.arg=c("Female","Male")) # NOTE: pie() has no names.arg argument; slice labels are set with labels=c("Female","Male")

#Box Plot

> boxplot(LungCap)
> quantile(LungCap,probs = c(0,0.25,0.5,0.75,1))
0% 25% 50% 75% 100%
0.507 6.150 8.000 9.800 14.675
> boxplot(LungCap,main="Boxplot",ylab="Lung Capacity", las=1)
> boxplot(LungCap,main="Boxplot",ylab="Lung Capacity", ylim=c(0,16), las=1)
> boxplot(LungCap~Gender)
> boxplot(LungCap[Gender=="female"], LungCap[Gender=="male"])
> AgeGroup=cut(Age,breaks = c(0,13,15,17,25),labels=c("<13","14/15","16/17","18+"))
# cut() divides a numeric vector into intervals; breaks gives the cut points

> boxplot(LungCap,ylab="Lung Capacity",main="BoxPlot",las=1)


> boxplot(LungCap~Smoke,ylab="Lung Capacity",main="BoxPlot",las=1)
> boxplot(LungCap[Age>=18]~Smoke[Age>=18],ylab="Lung
Capacity",main="BoxPlot",las=1)

> boxplot(LungCap~Smoke*AgeGroup,ylab="Lung Capacity",main="BoxPlot",las=2)

> boxplot(LungCap~Smoke*AgeGroup,ylab="Lung
Capacity",main="BoxPlot",las=2,col=c(4,2))
#colour of the box plots: blue (4) and red (2)

> hist(LungCap)
> hist(LungCap,freq=F)
> hist(LungCap,prob=T)
> hist(LungCap,prob=T,ylim = c(0,0.2))
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = 14)
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = 7)
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = c(0,2,4,6,8,10,12,14,16))
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = seq(from=0,to=16,by=1.5))
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = seq(from=0,to=16,by=1.5),main =
"Histogram", xlab = "Lung Capacity")
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = seq(from=0,to=16,by=1.5),main =
"Histogram", xlab = "Lung Capacity",las=1)

> lines(density(LungCap))
> lines(density(LungCap),col=2,lwd=3)

#Stem plot
> femaleLungCap=LungCap[Gender=="female"]
> stem(femaleLungCap)

> stem(femaleLungCap,scale=2)

#Scatter Plot
> cor(Age,Height)
[1] 0.8357368

> plot(Age,Height,main="ScatterPlot",xlab = "AGE",ylab = "HEIGHT",xlim =


c(0,20),pch=8, col=2,las=1)
>?text
>plot(Age,LungCap,main="Scatter Plot", las=1)
> text(x=5,y=12,label="Correlation=0.82") # text at x-axis “5” and y-axis “12” it writes label.
> text(x=5,y=12,label="Correlation=0.82",adj=0) # Label….begin at x-axis “5”
> text(x=5,y=12,label="Correlation=0.82",adj=1) # Label….ends at x-axis “5”
> text(x=5,y=12,label="Corr=0.82",adj=0,cex=0.5, col=4,font=4) #cex=0.5 draws the text at 50% of
#the default size, col=4 is blue, font=4 is bold italic
>abline(h=mean(LungCap),col=2, lwd=2) # adds a horizontal line with col=2 red color with
lwd=2 line width is 2
>mtext(text="r=0.82", side=4, adj=1) # adds a text at Margins side=4 and at the end of
margin
>mtext(text="r=0.82", side=3, adj=1,las=1, col=2, font=4, cex=1.80)
# adds text in the top margin of the plot (side=3): col=2 is red, font=4 is bold italic,
# adj=1 right-aligns the text (top right corner), cex=1.8 makes the text 80% larger

#Legends
>plot(Age[Smoke=="no"], LungCap[Smoke=="no"],col=4, xlab = "Age", ylab = "LungCap")
# plots Age Vs LungCap for non smokers with blue color, adding a xlab and ylab
>points(Age[Smoke=="yes"], LungCap[Smoke=="yes"],col=2)
#adds points to existing plot for Age Vs LungCap for smokers with red color
>legend(x=3.5,y=12,legend=c("NON-SMOKE","SMOKE"),fill=c(4,2))
# adds a legend which begins x=3.5 and y=12 NON-SMOKE and SMOKE fill it 4 blue color
and 2 red color
>plot(Age[Smoke=="no"], LungCap[Smoke=="no"],col=4, xlab = "Age", ylab = "LungCap",
pch=16) #pch=16 plotting character 16 which is solid circle, 17 is triangle
> points(Age[Smoke=="yes"], LungCap[Smoke=="yes"],col=2,pch=17)
> legend(x=3.5,y=12,legend=c("NON-SMOKE","SMOKE"),col=c(4,2),pch=c(16,17))

>lines(smooth.spline(Age[Smoke=="no"],LungCap[Smoke=="no"]),col=4,lwd=3)
#adds lines with smooth spline for Age Vs LungCap for non-smokers with col=4 blue col and
lwd=3 line width 3 times
>lines(smooth.spline(Age[Smoke=="yes"],LungCap[Smoke=="yes"]),col=2,lwd=3,lty=2)
#adds lines with smooth spline for Age Vs LungCap for smokers with col=2 red col and
lwd=3 line width 3 times, lty=2 line type dashed line
>legend(x=3.5,y=12,legend=c("NON-SMOKE","SMOKE"),col=c(4,2),lty = c(1,2))

#Binomial Distribution

>dbinom(x=3,size=20,p=1/6) # probability of exactly 3 successes, P(X=3)


>dbinom(x=0:3,size=20,p=1/6) # probabilities of exactly 0, 1, 2 and 3 successes: P(X=0), P(X=1), P(X=2), P(X=3)
>pbinom(q=3,size=20,p=1/6,lower.tail=T) #P(x<=3)=P(x=0)+ P(x=1)+ P(x=2)+ P(x=3)

#Poisson Distribution Lambda=7


>dpois(x=4, lambda=7) # probability of exactly 4 occurrences, P(X=4)
>dpois(x=0:4, lambda=7) #probabilities of exactly 0, 1, 2, 3 and 4 occurrences: P(X=0), ..., P(X=4)
>ppois(q=4, lambda=7,lower.tail=T) #P(x<=4)=P(x=0)+ P(x=1)+ P(x=2)+ P(x=3)+P(x=4)
>ppois(q=12, lambda=7,lower.tail=F) #P(X>12), i.e. P(X>=13); for P(X>=12) use q=11

#Normal Distribution

>pnorm(q=70, mean=75, sd=5, lower.tail=T) #P(x<=70)


>pnorm(q=85, mean=75, sd=5, lower.tail=F) #P(x>=85)
>qnorm(p=0.25, mean=75, sd=5, lower.tail=T) #Find Q1=First quartile
#plotting density
>x=seq(from=55,to=95,by=0.25)
>x1=dnorm(x,mean=75,sd=5)
>plot(x,x1)
>plot(x,x1,type = "l")
#random sample from normal distribution
>x2=rnorm(n=40, mean=75,sd=5)
>hist(x2)

#Correlation
#load the lung capacity data and attach
>plot(Age,LungCap,main = "scatter plot",las=1)
> cor(Age,LungCap,method = "pearson")
[1] 0.8196749
> cor(LungCap,Age,method = "pearson") #the order of the variables does not affect the result
[1] 0.8196749
> cor(LungCap,Age, method = "spearman")
[1] 0.8172464
> cor(LungCap,Age, method = "kendall")
[1] 0.639576
> cor.test(LungCap,Age, method = "pearson")

Pearson's product-moment correlation

data: LungCap and Age


t = 38.476, df = 723, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.7942660 0.8422217
sample estimates:
cor
0.8196749
> cor.test(LungCap,Age, method = "pearson",conf.level = 0.99)

Pearson's product-moment correlation

data: LungCap and Age


t = 38.476, df = 723, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
99 percent confidence interval:
0.7856499 0.8487564
sample estimates:
cor
0.8196749

#Regression
>mod=lm(LungCap~Age) #LungCap is Y data and Age is X data
>summary(mod)
Residuals:
Min 1Q Median 3Q Max
-4.7799 -1.0203 -0.0005 0.9789 4.2650

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.14686 0.18353 6.249 7.06e-10 ***
Age 0.54485 0.01416 38.476 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.526 on 723 degrees of freedom #Sqrt(mse)


Multiple R-squared: 0.6719, Adjusted R-squared: 0.6714
F-statistic: 1480 on 1 and 723 DF, p-value: < 2.2e-16
> plot(Age,LungCap)
> abline(mod)
>anova(mod)

> mod=lm(LungCap~Age)
> plot(mod)
> par(mfrow=c(2,2)) #plots the 4 plots in 2 by 2
> plot(mod)

#Cut command
#We will create height categories of A<50, B=50-55, C=55-60, D=60-65, E=65-70, F=70+
> catheight=cut(Height, breaks=c(0,50,55,60,65,70,100), labels=c("A","B","C","D","E","F"))
> Height[1:10]
[1] 62.1 74.7 69.7 71.0 56.9 58.7 63.3 70.4 70.5 59.2
> catheight[1:10]
[1] D F E F C C D F F C
Levels: A B C D E F
> catheight=cut(Height, breaks=c(0,50,55,60,65,70,100), labels=c("A","B","C","D","E","F"),
right = F) #with right = F each interval includes its left end point, so a height of exactly 60 falls in D; by default (right = T) it falls in C

#Multiple Regression Models


> mod2=lm(LungCap~Age+Height)
> summary(mod2)
Call:
lm(formula = LungCap ~ Age + Height)
Residuals:
Min 1Q Median 3Q Max
-3.4080 -0.7097 -0.0078 0.7167 3.1679

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -11.747065 0.476899 -24.632 < 2e-16 ***
#estimated mean Lung Capacity for someone with zero Height and Age
Age 0.126368 0.017851 7.079 3.45e-12 ***
#We associate an increase in one year of age with an increase of 0.126 in lung Capacity
Height 0.278432 0.009926 28.051 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.056 on 722 degrees of freedom


#How far the observed lung capacities are from the predicted (fitted) lung capacities Ŷ:
#gives an idea of the size of the error e = Y - Ŷ
Multiple R-squared: 0.843, Adjusted R-squared: 0.8425
#84% variation in lung capacity can be explained by our model by Age And height
F-statistic: 1938 on 2 and 722 DF, p-value: < 2.2e-16

> mod2=lm(LungCap~Age+Height+Smoke+Gender+Caesarean)
> summary(mod2)
> plot(mod2)
#Dummy indicators
#We will create height categories of A<50, B=50-55, C=55-60, D=60-65, E=65-70, F=70+
> catheight=cut(Height, breaks=c(0,50,55,60,65,70,100), labels=c("A","B","C","D","E","F"))

> mod1=lm(LungCap~catheight)
> summary(mod1)

Call:
lm(formula = LungCap ~ catheight)

Residuals:
Min 1Q Median 3Q Max
-4.0074 -0.7996 -0.0324 0.7935 3.8754

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.1486 0.2944 7.298 7.75e-13 ***
catheightB 1.5329 0.3424 4.476 8.83e-06 ***
catheightC 3.2768 0.3159 10.373 < 2e-16 ***
catheightD 5.0676 0.3102 16.335 < 2e-16 ***
catheightE 6.5837 0.3083 21.358 < 2e-16 ***
catheightF 8.6510 0.3083 28.065 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.249 on 719 degrees of freedom


Multiple R-squared: 0.7814, Adjusted R-squared: 0.7798
F-statistic: 513.9 on 5 and 719 DF, p-value: < 2.2e-16

> mean(LungCap[catheight=="A"])
[1] 2.148611
> mean(LungCap[catheight=="C"]) #Mean=2.15+1.53(0)+3.28(1)+5.07(0)+….=5.43
[1] 5.42542

#More on non-dependent variables like Age and Smoke with correlation of 0.2
> head(Smoke)
[1] "no" "yes" "no" "no" "no" "no"
> r=as.numeric(Smoke=="yes")
> head(r)
[1] 0 1 0 0 0 0
> cor(Age,r)
[1] 0.2112322

> mod1=lm(LungCap~Age+Smoke)
> summary(mod1)
For a smoker For a non-Smoker

For a smoker, the estimated mean lung capacity is lower by 0.649 (coefficient of -0.649) than for a non-smoker of the same age.

#Interaction of multi-regression model


> mod=lm(LungCap~Age+Smoke+Age:Smoke) #Interaction Age:Smoke
> summary(mod)
The interaction is not statistically significant (0.377)

# IF / ELSE statement
# Report the InternetService value stored in row 3 of the data frame `x`.
# Branches are tested in order and only the first match runs; any value that
# is neither "Fiber optic" nor "DSL" falls through to the final else.
if (x$InternetService[3] == "Fiber optic") {
  print("Fiber optic")
} else if (x$InternetService[3] == "DSL") {
  print("DSL")
} else {
  # The original message ("No Internet service is not fiber optic nor DSL")
  # was garbled; state plainly what this branch means.
  print("Internet service is neither fiber optic nor DSL")
}

# SWITCH statement
# Discounted monthly charge for row 6 of `x`, chosen by the gender value:
#   "Male"   -> 80% of the monthly charge (20% discount)
#   "Female" -> 50% of the monthly charge (50% discount)
# Any other gender value matches no branch, so switch() returns NULL invisibly.
switch(
  as.character(x$gender[6]),
  "Male" = x$MonthlyCharges[6] * 0.8,
  "Female" = x$MonthlyCharges[6] * 0.5
)

# FOR statement: count the number of DSL service users by row index
count1 <- 0
# seq_len() yields an empty sequence for a zero-row data frame, whereas
# 1:nrow(x) would iterate over c(1, 0) and index rows that do not exist.
for (i in seq_len(nrow(x))) {
  if (x$InternetService[i] == "DSL") {
    count1 <- count1 + 1
  }
}
# Print the final total once, after the loop has finished (printing inside
# the loop would emit the running count for every row).
print(count1)

# FOR statement (variant 1): count the number of DSL service users by
# iterating over the column values directly instead of over row indices
count1 <- 0
for (val in x$InternetService) {
  if (val == "DSL") {
    count1 <- count1 + 1
  }
}
# Show the total once the loop is done
print(count1)

# WHILE statement: count the rows of `x` whose tenure equals 2
# (the result is left in count12; nothing is printed)
count12 <- 0
i <- 1
while (i <= nrow(x)) {
  if (x$tenure[i] == 2) {
    count12 <- count12 + 1
  }
  # Advance to the next row; without this the loop would never terminate
  i <- i + 1
}

##### Forecast accuracy
# Fits four one-step-ahead forecasting models to the first column of `x` and
# prints a summary and accuracy measures for each.
# NOTE(review): rwf(), ses(), holt(), auto.arima() and accuracy() are
# presumably from the forecast package, which is not loaded anywhere in the
# visible script -- confirm it is attached before this section runs.

# Random-walk forecast on the raw column, horizon of one step
fit1 <- rwf(x[, 1], h = 1)
summary(fit1)
accuracy(fit1)

# Put the same column into a time-series object
# NOTE(review): end = 25 is hard-coded, while fit1 above uses the whole
# column -- confirm that x has exactly 25 rows.
dat_ts <- ts(data = x[, 1], start = 1, end = 25, frequency = 1)

# SES model
se_model <- ses(dat_ts, h = 1)
summary(se_model)
accuracy(se_model)

# Holt model
holt_model <- holt(dat_ts, h = 1)
summary(holt_model)
accuracy(holt_model)

# ARIMA model
arima_model <- auto.arima(dat_ts)
summary(arima_model)
accuracy(arima_model)

You might also like