Module-2 String, Date and Time, Data Preparation Example Code
Module-2 String, Date and Time, Data Preparation Example Code
# Build a character vector
c("string1", "string2")
# paste() recycles the shorter argument and joins with a space by default
paste(c("Pine", "Red"), "Apple")
# paste0() joins with no separator
paste0(c("Pine", "Red"), "Apple")
# sep sets the separator placed between the pasted arguments
paste(c("Pine", "Red"), "Apple", sep = "-")
# collapse additionally joins the resulting elements into a single string
paste(c("Pine", "Red"), "Apple", sep = "-", collapse = ",")
# Cubes of 1 to 10
x <- (1:10)^3
x
# toString() joins the elements with ", "
toString(x)
# With a width, a result that is too long is cut and ends in "...."
toString(x, width = 5)
# cat(): writes its arguments separated by spaces, without quotes
cat(c("Red", "Pine"), "Apple")
# noquote(): prints a character vector without the surrounding quotes
a <- c("I", "am", "a", "Data Scientist")
a
noquote(a)
# formatC(): C-style number formatting, returns character strings
h <- c(4.569, 8.981, 27.772)
h
formatC(h)
# digits = 3 keeps three significant digits with the default format
formatC(h, digits = 3)
# width = 5 pads each result on the left to at least five characters
formatC(h, digits = 3, width = 5)
# format = "e" switches to scientific notation
formatC(h, digits = 3, format = "e")
# flag = "+" always prints the sign
formatC(h, digits = 3, flag = "+")
# format(): formats all elements to a common width
format(h)
format(h, digits = 3)
# trim = TRUE drops the padding
format(h, digits = 3, trim = TRUE)
# sprintf(): vectorised over its arguments; %d is filled from x, %f from h
x <- c(1, 2, 3)
sprintf("The number %d in the list =%f", x, h)
# cat() with special characters: \t (tab), \n (newline), \\ (backslash),
# \" and \' (escaped quotes), and quotes nested inside the other quote type.
# fill = TRUE ends each output with a newline.
cat("Black\tBerry", fill = TRUE)
cat("Black\nBerry", fill = TRUE)
cat("Black\\Berry", fill = TRUE)
cat("Black\"Berry", fill = TRUE)
cat("Black\'Berry", fill = TRUE)
cat('Black"Berry', fill = TRUE)
cat("Black'Berry", fill = TRUE)
# toupper() / tolower(): change the case of every character
toupper("The cat is on the wall")
tolower("The cat is on the wall")
# substring() / substr(): extract characters 3 to 10 (positions are 1-based)
substring("The cat is on the wall", first = 3, last = 10)
substr("The cat is on the wall", start = 3, stop = 10)
substring("The cat is on the wall", first = 5, last = 10)
# strsplit(): split a string on a separator; the result is a list
strsplit("I like Banana, Orange and Pineapple", split = " ")
typeof(strsplit("I like Banana, Orange and Pineapple", split = " "))
> x<-1:20
> c("string1","string2")
[1] "string1" "string2"
> paste(c("Pine","Red"),"Apple")
[1] "Pine Apple" "Red Apple"
> paste0(c("Pine","Red"),"Apple")
[1] "PineApple" "RedApple"
> paste(c("Pine","Red"),"Apple",sep="-")
[1] "Pine-Apple" "Red-Apple"
> paste(c("Pine","Red"),"Apple",sep="-",collapse=",")
[1] "Pine-Apple,Red-Apple"
>
> x<-c(1:10)^3
> x
[1] 1 8 27 64 125 216 343 512 729 1000
> toString(x)
[1] "1, 8, 27, 64, 125, 216, 343, 512, 729, 1000"
> toString(x,5)
[1] "1,...."
>
> #cat function
> cat(c("Red", "Pine"),"Apple")
Red Pine Apple>
> #noquote function
> a<-c("I","am","a","Data Scientist")
> a
[1] "I" "am" "a"
[4] "Data Scientist"
> noquote(a)
[1] I am a
[4] Data Scientist
>
> #formatC()
> h<-c(4.569,8.981,27.772)
> h
[1] 4.569 8.981 27.772
> formatC(h)
[1] "4.569" "8.981" "27.77"
> formatC(h,digits=3)
[1] "4.57" "8.98" "27.8"
> formatC(h,digits=3,width=5)
[1] " 4.57" " 8.98" " 27.8"
> formatC(h,digits=3,format="e")
[1] "4.569e+00" "8.981e+00" "2.777e+01"
> formatC(h,digits=3,flag="+")
[1] "+4.57" "+8.98" "+27.8"
>
> #format function
> format(h)
[1] " 4.569" " 8.981" "27.772"
> format(h,digits=3)
[1] " 4.57" " 8.98" "27.77"
> format(h,digits=3,trim=TRUE)
[1] "4.57" "8.98" "27.77"
>
> #sprintf() function
> x<-c(1,2,3)
> sprintf("The number %d in the list =%f",x,h)
[1] "The number 1 in the list =4.569000"
[2] "The number 2 in the list =8.981000"
[3] "The number 3 in the list =27.772000"
> #cat()function with special characters such as \t,\n.\\,\\",\,'
> cat("Black\tBerry",fill=TRUE)
Black Berry
> cat("Black\nBerry",fill=TRUE)
Black
Berry
> cat("Black\\Berry",fill=TRUE)
Black\Berry
> cat("Black\"Berry",fill=TRUE)
Black"Berry
> cat("Black\'Berry",fill=TRUE)
Black'Berry
> cat('Black"Berry',fill=TRUE)
Black"Berry
> cat("Black'Berry",fill=TRUE)
Black'Berry
>
> #toupper(),tolower(),substring()
> toupper("The cat is on the wall")
[1] "THE CAT IS ON THE WALL"
> tolower("The cat is on the wall")
[1] "the cat is on the wall"
> substring("The cat is on the wall",3,10)
[1] "e cat is"
> substr("The cat is on the wall",3,10)
[1] "e cat is"
> substring("The cat is on the wall",5,10)
[1] "cat is"
>
> #strsplit() function
> strsplit("I like Banana, Orange and Pineapple"," ")
[[1]]
[1] "I" "like" "Banana," "Orange"
[5] "and" "Pineapple"
> Sys.timezone()
[1] "Asia/Calcutta"
> Sys.getlocale("LC_TIME")
[1] "English_United States.1252"
> strftime(Sys.time(),tz="UTC")
[1] "2024-01-11 04:39:03"
> strftime(Sys.time(),tz="IST")
[1] "2024-01-11 04:39:32"
Warning message:
In as.POSIXlt.POSIXct(x, tz = tz) : unknown timezone 'IST'
> strftime(Sys.time(),tz="UTC-5")
[1] "2024-01-11 09:40:07"
Warning message:
In as.POSIXlt.POSIXct(x, tz = tz) : unknown timezone 'UTC-5'
> dt1<-as.Date("10/10/2020","%d/%m/%Y")
> dt2<-as.Date("11/01/2024","%d/%m/%Y")
> dt1
[1] "2020-10-10"
> dt2
[1] "2024-01-11"
> dif<-dt2-dt1
> dif
Time difference of 1188 days
> dt1+dt2
Error in `+.Date`(dt1, dt2) : binary + is not defined for "Date" objects
> difftime(dt2,dt1)
Time difference of 1188 days
> difftime(dt2,dt1,units="mins")
Time difference of 1710720 mins
> difftime(dt2,dt1,units="sec")
Time difference of 102643200 secs
> difftime(dt2,dt1,units="hours")
Time difference of 28512 hours
> difftime(dt2,dt1,units="days")
Time difference of 1188 days
> difftime(dt2,dt1,units="weeks")
Time difference of 169.7143 weeks
seq() function
> install.packages("lubridate")
> library(lubridate)
> ymd("2000/09/25","2000-9-25","2000*9.25")
[1] "2000-09-25" "2000-09-25" "2000-09-25"
> y<-dyears(1:5)
> y
[1] "31557600s (~1 years)" "63115200s (~2 years)"
[3] "94672800s (~3 years)" "126230400s (~4 years)"
[5] "157788000s (~5 years)"
> w<-dweeks(1:4)
> w
[1] "604800s (~1 weeks)" "1209600s (~2 weeks)" "1814400s (~3 weeks)"
[4] "2419200s (~4 weeks)"
> d<-ddays(1:10)
> d
[1] "86400s (~1 days)" "172800s (~2 days)"
[3] "259200s (~3 days)" "345600s (~4 days)"
[5] "432000s (~5 days)" "518400s (~6 days)"
[7] "604800s (~1 weeks)" "691200s (~1.14 weeks)"
[9] "777600s (~1.29 weeks)" "864000s (~1.43 weeks)"
> today()
[1] "2024-01-11"
> today()+y
[1] "2025-01-10 06:00:00 UTC" "2026-01-10 12:00:00 UTC"
[3] "2027-01-10 18:00:00 UTC" "2028-01-11 00:00:00 UTC"
[5] "2029-01-10 06:00:00 UTC"
> y<-years(1:7)
> y
[1] "1y 0m 0d 0H 0M 0S" "2y 0m 0d 0H 0M 0S" "3y 0m 0d 0H 0M 0S"
[4] "4y 0m 0d 0H 0M 0S" "5y 0m 0d 0H 0M 0S" "6y 0m 0d 0H 0M 0S"
[7] "7y 0m 0d 0H 0M 0S"
> today()+y
[1] "2025-01-11" "2026-01-11" "2027-01-11" "2028-01-11" "2029-01-11"
[6] "2030-01-11" "2031-01-11"
> yr<-dyears(5)
> yr
[1] "157788000s (~5 years)"
> as.period(yr)
[1] "5y 0m 0d 0H 0M 0S"
# Data Preparation
# Load the "mammals" data set from the MASS package into the workspace
data("mammals", package = "MASS")
# Show the first rows of the data set
head(mammals)
body brain
Arctic fox 3.385 44.5
Owl monkey 0.480 15.5
Mountain beaver 1.350 8.1
Cow 465.000 423.0
Grey wolf 36.330 119.5
Goat 27.660 115.0
# Read the text file into a character vector, one element per line.
# NOTE(review): the absolute Windows path is machine-specific; adjust it before running.
test1<-readLines("C:/SPB_DATA/my_txt.txt")
> test1
[1] "the first line" "the second line" "the third line"
# Three equal-length vectors that become the columns of the data frame
a <- c(1, 2, 3)
b <- c("a", "b", "c")
d <- c("Y", "N", "Y")
# data.frame() takes the column names from the variable names
df1 <- data.frame(a, b, d)
df1
# Convert a vector of 'Y'/'N' flags into a logical vector.
#
# x: a vector whose elements are compared against 'Y' and 'N'.
# Returns a logical vector of the same length as x: TRUE where x is 'Y',
# FALSE where x is 'N', and NA for every other value.
#
# The body must be wrapped in braces: without them only the first statement
# belongs to the function and the remaining lines run at top level.
convt <- function(x) {
  Y <- rep.int(NA, length(x))
  Y[x == 'Y'] <- TRUE
  Y[x == 'N'] <- FALSE
  # Return the converted vector (an assignment alone would return its
  # right-hand side, not Y)
  Y
}
# Replace the 'Y'/'N' column d with the result of convt()
df1$d<-convt(df1$d)
# Print the data frame after the conversion
df1
a b d
1 1 a Y
2 2 b N
3 3 c Y
a b d
1 1 a TRUE
2 2 b FALSE
3 3 c TRUE
# grep(): indices of the elements that contain the pattern
grep(pattern = "my", x = "This is my pen")
# grepl(): logical vector, TRUE where the pattern is found
grepl(pattern = "my", x = "This is my pen")
# sub(): replace the first match only
sub(pattern = "my", replacement = "your", x = "This is my pen")
# gsub(): replace every match
gsub(pattern = "my", replacement = "your", x = "This is my pen")
with(),within(),sort(),order()
> name<-c("Jhon","Peter","Mark")
> start_date<-c("1980-10-10","1999-12-12","1900-04-05")
> end_date<-c("1989-03-08","2004-04-20","2000-09-25")
> service<-data.frame(name,start_date,end_date)
> service
name start_date end_date
1 Jhon 1980-10-10 1989-03-08
2 Peter 1999-12-12 2004-04-20
3 Mark 1900-04-05 2000-09-25
> service$period<-as.Date(service$end_date)-as.Date(service$start_date)
> service
# within(): evaluate the block with the columns of `service` in scope and
# store the variables it creates back into the data frame
service <- within(service, {
  # length of service as a time difference in days
  period <- as.Date(end_date) - as.Date(start_date)
  # flag periods longer than 2000 days
  highperiod <- period > 2000
})
service
Output
name start_date end_date period highperiod
1 Jhon 1980-10-10 1989-03-08 3071 days TRUE
2 Peter 1999-12-12 2004-04-20 1591 days FALSE
3 Mark 1900-04-05 2000-09-25 36698 days TRUE
# sort(): returns the sorted values
x <- c(5, 10, 3, 15, 6, 8)
sort(x)
sort(x, decreasing = TRUE)
# character vectors are sorted alphabetically
y <- c("X", "AB", "Deer", "For", "Moon")
sort(y)
sort(y, decreasing = TRUE)
# order(): returns the index permutation that sorts x
order(x)
# indexing with that permutation gives the same result as sort()
x[order(x)]
identical(sort(x), x[order(x)])
Output
> x<-c(5,10,3,15,6,8)
> sort(x)
[1] 3 5 6 8 10 15
> sort(x,decreasing = TRUE)
[1] 15 10 8 6 5 3
> y<-c("X","AB","Deer","For","Moon")
> sort(y)
[1] "AB" "Deer" "For" "Moon" "X"
> sort(y,decreasing = TRUE)
[1] "X" "Moon" "For" "Deer" "AB"
> order(x)
[1] 3 1 5 6 2 4
> x[order(x)]
[1] 3 5 6 8 10 15
> identical(sort(x),x[order(x)])
[1] TRUE
The order() function is more useful than sort() when working with data frames: it returns the
index permutation that sorts a vector, so it can be used to reorder the rows of a whole data frame.
# Employee names with the start and end dates of their service
name <- c("Jhon", "Peter", "Mark")
start_date <- c("1980-10-10", "1999-12-12", "1900-04-05")
end_date <- c("1989-03-08", "2004-04-20", "2000-09-25")
service <- data.frame(name, start_date, end_date)
service
# Row indices that put start_date in ascending order
startdt <- order(service$start_date)
# Reorder the rows of the data frame with those indices
service.ordered <- service[startdt, ]
service.ordered
output
> name<-c("Jhon","Peter","Mark")
> start_date<-c("1980-10-10","1999-12-12","1900-04-05")
> end_date<-c("1989-03-08","2004-04-20","2000-09-25")
> service<-data.frame(name,start_date,end_date)
> service
name start_date end_date
1 Jhon 1980-10-10 1989-03-08
2 Peter 1999-12-12 2004-04-20
3 Mark 1900-04-05 2000-09-25
> startdt<-order(service$start_date)
> service.ordered<-service[startdt,]
> service.ordered
name start_date end_date
3 Mark 1900-04-05 2000-09-25
1 Jhon 1980-10-10 1989-03-08
2 Peter 1999-12-12 2004-04-20
> city<-c('Mandya','Mysore','Chennai')
> state<-c('KA','KA','TN')
> zipcode<-c('571401','570001','600001')
> address<-cbind(city,state,zipcode)
> address
city state zipcode
[1,] "Mandya" "KA" "571401"
[2,] "Mysore" "KA" "570001"
[3,] "Chennai" "TN" "600001"
>
> new.address<-data.frame(
+ city=c('Banglore','Coimbatore'),
+ state=c('KA','TN'),
+ zipcode=c('530068','631027'),
+ stringsAsFactors = FALSE)
> print(new.address)
city state zipcode
1 Banglore KA 530068
2 Coimbatore TN 631027
> all.address<-rbind(address,new.address)
> all.address
city state zipcode
1 Mandya KA 571401
2 Mysore KA 570001
3 Chennai TN 600001
4 Banglore KA 530068
5 Coimbatore TN 631027
Grouping Function
apply(),lapply(),sapply(),vapply(),mapply(),rapply(),tapply()
# 4 x 4 matrix filled column by column with 1..16
M <- matrix(seq(1, 16), nrow = 4, ncol = 4)
M
# MARGIN = 1 applies the function to each row
apply(M, MARGIN = 1, FUN = min)
# MARGIN = 2 applies the function to each column
apply(M, MARGIN = 2, FUN = max)
# 4 x 4 x 2 array filled with 1..32
M <- array(seq(32), dim = c(4, 4, 2))
M
# sum over everything that shares the same first index
apply(M, MARGIN = 1, FUN = sum)
# sum over the third dimension for every (row, column) pair
apply(M, MARGIN = c(1, 2), FUN = sum)
output
> M<-matrix(seq(1,16),4,4)
> M
[,1] [,2] [,3] [,4]
[1,] 1 5 9 13
[2,] 2 6 10 14
[3,] 3 7 11 15
[4,] 4 8 12 16
> apply(M,1,min)
[1] 1 2 3 4
> apply(M,2,max)
[1] 4 8 12 16
> M<-array(seq(32),dim=c(4,4,2))
> M
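, , 1
[,1] [,2] [,3] [,4]
[1,] 1 5 9 13
[2,] 2 6 10 14
[3,] 3 7 11 15
[4,] 4 8 12 16
, , 2
[,1] [,2] [,3] [,4]
[1,] 17 21 25 29
[2,] 18 22 26 30
[3,] 19 23 27 31
[4,] 20 24 28 32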
> apply(M,1,sum)
[1] 120 128 136 144
> apply(M,c(1,2),sum)
[,1] [,2] [,3] [,4]
[1,] 18 26 34 42
[2,] 20 28 36 44
[3,] 22 30 38 46
[4,] 24 32 40 48
lapply()
# lapply(): apply a function to each element of a list; the result is a list
x <- list(a = 1, b = 1:3, c = 10:100)
x
# number of values in each element
lapply(x, length)
# total of each element
lapply(x, sum)
> #lapply()
> x<-list(a=1,b=1:3,c=10:100)
> x
$a
[1] 1
$b
[1] 1 2 3
$c
[1] 10 11 12 13 14 15 16 17 18 19 20 21
[13] 22 23 24 25 26 27 28 29 30 31 32 33
[25] 34 35 36 37 38 39 40 41 42 43 44 45
[37] 46 47 48 49 50 51 52 53 54 55 56 57
[49] 58 59 60 61 62 63 64 65 66 67 68 69
[61] 70 71 72 73 74 75 76 77 78 79 80 81
[73] 82 83 84 85 86 87 88 89 90 91 92 93
[85] 94 95 96 97 98 99 100
> lapply(x, FUN=length)
$a
[1] 1
$b
[1] 3
$c
[1] 91
> lapply(x,FUN=sum)
$a
[1] 1
$b
[1] 6
$c
[1] 5005
sapply() function
# sapply(): like lapply(), but the result is simplified to a vector when possible
x <- list(a = 1, b = 1:3, c = 10:100)
x
# number of values in each element
sapply(x, length)
# total of each element
sapply(x, sum)
> #sapply()
> x<-list(a=1,b=1:3,c=10:100)
> x
$a
[1] 1
$b
[1] 1 2 3
$c
[1] 10 11 12 13 14 15 16 17 18 19 20 21
[13] 22 23 24 25 26 27 28 29 30 31 32 33
[25] 34 35 36 37 38 39 40 41 42 43 44 45
[37] 46 47 48 49 50 51 52 53 54 55 56 57
[49] 58 59 60 61 62 63 64 65 66 67 68 69
[61] 70 71 72 73 74 75 76 77 78 79 80 81
[73] 82 83 84 85 86 87 88 89 90 91 92 93
[85] 94 95 96 97 98 99 100
> sapply(x, FUN=length)
a b c
1 3 91
> sapply(x,FUN=sum)
a b c
1 6 5005
vapply()
# vapply(): like sapply(), but the type and length of each result are declared
x <- list(a = 1, b = 1:3, c = 10:100)
x
# FUN.VALUE is a template: every call must return a single integer
vapply(x, FUN = length, FUN.VALUE = integer(1))
> #vapply()
> x<-list(a=1,b=1:3,c=10:100)
> x
$a
[1] 1
$b
[1] 1 2 3
$c
[1] 10 11 12 13 14 15 16 17 18 19 20 21
[13] 22 23 24 25 26 27 28 29 30 31 32 33
[25] 34 35 36 37 38 39 40 41 42 43 44 45
[37] 46 47 48 49 50 51 52 53 54 55 56 57
[49] 58 59 60 61 62 63 64 65 66 67 68 69
[61] 70 71 72 73 74 75 76 77 78 79 80 81
[73] 82 83 84 85 86 87 88 89 90 91 92 93
[85] 94 95 96 97 98 99 100
> vapply(x, FUN=length,FUN.VALUE=0L)
a b c
1 3 91
mapply() function
> mapply(sum,1:5,1:5,1:5)
[1] 3 6 9 12 15
> mapply(rep,1:4,4:1)
[[1]]
[1] 1 1 1 1
[[2]]
[1] 2 2 2
[[3]]
[1] 3 3
[[4]]
[1] 4
rapply()
# Helper for the rapply() example.
# Character input gets " !" appended; any other input is incremented by 1.
myFun <- function(x) {
  # guard clause: everything that is not character is incremented
  if (!is.character(x)) {
    return(x + 1)
  }
  paste(x, "!", sep = " ")
}
# Nested list mixing character and numeric leaves
l <- list(
  a = list(a1 = "Boo", b1 = 2, c1 = "Eeeks"),
  b = 3,
  c = "Yikes",
  d = list(a2 = 1, b2 = list(a3 = "Hey", b3 = 5))
)
# rapply(): apply myFun recursively to every leaf of the nested list
rapply(l, myFun)
output
a.a1 a.b1 a.c1 b c d.a2 d.b2.a3 d.b2.b3
"Boo !" "3" "Eeeks !" "4" "Yikes !" "2" "Hey !" "6"
> rapply(l,myFun,how='replace')
$a
$a$a1
[1] "Boo !"
$a$b1
[1] 3
$a$c1
[1] "Eeeks !"
$b
[1] 4
$c
[1] "Yikes !"
$d
$d$a2
[1] 2
$d$b2
$d$b2$a3
[1] "Hey !"
$d$b2$b3
[1] 6
tapply() function
# Values 1..20
x <- 1:20
x
# Grouping factor: five levels (a to e), four consecutive values each
y <- factor(rep(letters[1:5], each = 4))
y
# tapply(): apply sum to the values of x within each level of y
tapply(x, INDEX = y, FUN = sum)
output
> x<-1:20
> x
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
> y<-factor(rep(letters[1:5],each=4))
> y
[1] a a a a b b b b c c c c d d d d e e e e
Levels: a b c d e
> tapply(x,y,sum)
a b c d e
10 26 42 58 74
by() function
> cta<-tapply(iris$Sepal.Width,iris$Species,summary)
> cba<-by(iris$Sepal.Width,iris$Species,summary)
> cta
$setosa
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.300 3.200 3.400 3.428 3.675 4.400
$versicolor
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 2.525 2.800 2.770 3.000 3.400
$virginica
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.200 2.800 3.000 2.974 3.175 3.800
> cba
iris$Species: setosa
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.300 3.200 3.400 3.428 3.675 4.400
------------------------------------------------------------
iris$Species: versicolor
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 2.525 2.800 2.770 3.000 3.400
------------------------------------------------------------
iris$Species: virginica
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.200 2.800 3.000 2.974 3.175 3.800
aggregate() function
# Mean sepal length per species, computed two ways:
# tapply() returns a named array ...
att <- tapply(iris$Sepal.Length, INDEX = iris$Species, FUN = mean)
# ... while aggregate() returns a data frame with one row per group
agt <- aggregate(iris$Sepal.Length, by = list(iris$Species), FUN = mean)
att
agt
output
> att<-tapply(iris$Sepal.Length,iris$Species,mean)
> agt<-aggregate(iris$Sepal.Length,list(iris$Species),mean)
> att
setosa versicolor virginica
5.006 5.936 6.588
> agt
Group.1 x
1 setosa 5.006
2 versicolor 5.936
3 virginica 6.588