0% found this document useful (0 votes)
11 views18 pages

Module-2 String, Date and Time, Data Preparation Example Code

Uploaded by

sreelakshmikn33
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
11 views18 pages

Module-2 String, Date and Time, Data Preparation Example Code

Uploaded by

sreelakshmikn33
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 18

# String ----

# A character vector is created with c().
c("string1", "string2")

# paste() joins its arguments element-wise, recycling the shorter one.
# `sep` is placed between the pieces; `collapse` merges the results into a
# single string. paste0() joins without any separator.
paste(c("Pine", "Red"), "Apple")
paste0(c("Pine", "Red"), "Apple")
paste(c("Pine", "Red"), "Apple", sep = "-")
paste(c("Pine", "Red"), "Apple", sep = "-", collapse = ",")

# toString() turns a vector into one string.
x <- c(1:10)^3
x
toString(x)
toString(x, 5)

# cat() writes its arguments to the console without quotes or index prefix.
cat(c("Red", "Pine"), "Apple")

# noquote() prints a character vector without the surrounding quotes.
a <- c("I", "am", "a", "Data Scientist")
a
noquote(a)

# formatC() formats numbers C-style; digits, width, format and flag are
# varied one at a time below.
h <- c(4.569, 8.981, 27.772)
h
formatC(h)
formatC(h, digits = 3)
formatC(h, digits = 3, width = 5)
formatC(h, digits = 3, format = "e")
formatC(h, digits = 3, flag = "+")

# format() formats the whole vector to a common width unless trim = TRUE.
format(h)
format(h, digits = 3)
format(h, digits = 3, trim = TRUE)

# sprintf() is vectorised: one output string per (x, h) pair.
x <- c(1, 2, 3)
sprintf("The number %d in the list =%f", x, h)

# cat() with escape sequences: \t (tab), \n (newline), \\ (backslash),
# \" and \' (quotes inside a string).
cat("Black\tBerry", fill = TRUE)
cat("Black\nBerry", fill = TRUE)
cat("Black\\Berry", fill = TRUE)
cat("Black\"Berry", fill = TRUE)
cat("Black\'Berry", fill = TRUE)
cat('Black"Berry', fill = TRUE)
cat("Black'Berry", fill = TRUE)

# Case conversion and substring extraction by start/stop position.
toupper("The cat is on the wall")
tolower("The cat is on the wall")
substring("The cat is on the wall", 3, 10)
substr("The cat is on the wall", 3, 10)
substring("The cat is on the wall", 5, 10)

# strsplit() splits on the given separator and returns a list.
strsplit("I like Banana, Orange and Pineapple", " ")
typeof(strsplit("I like Banana, Orange and Pineapple", " "))

#String code and Output

> x<-1:20
> c("string1","string2")
[1] "string1" "string2"
> paste(c("Pine","Red"),"Apple")
[1] "Pine Apple" "Red Apple"
> paste0(c("Pine","Red"),"Apple")
[1] "PineApple" "RedApple"
> paste(c("Pine","Red"),"Apple",sep="-")
[1] "Pine-Apple" "Red-Apple"
> paste(c("Pine","Red"),"Apple",sep="-",collapse=",")
[1] "Pine-Apple,Red-Apple"
>
> x<-c(1:10)^3
> x
[1] 1 8 27 64 125 216 343 512 729 1000
> toString(x)
[1] "1, 8, 27, 64, 125, 216, 343, 512, 729, 1000"
> toString(x,5)
[1] "1,...."
>
> #cat function
> cat(c("Red", "Pine"),"Apple")
Red Pine Apple>
> #noquote function
> a<-c("I","am","a","Data Scientist")
> a
[1] "I" "am" "a"
[4] "Data Scientist"
> noquote(a)
[1] I am a
[4] Data Scientist
>
> #formatC()
> h<-c(4.569,8.981,27.772)
> h
[1] 4.569 8.981 27.772
> formatC(h)
[1] "4.569" "8.981" "27.77"
> formatC(h,digits=3)
[1] "4.57" "8.98" "27.8"
> formatC(h,digits=3,width=5)
[1] " 4.57" " 8.98" " 27.8"
> formatC(h,digits=3,format="e")
[1] "4.569e+00" "8.981e+00" "2.777e+01"
> formatC(h,digits=3,flag="+")
[1] "+4.57" "+8.98" "+27.8"
>
> #format function
> format(h)
[1] " 4.569" " 8.981" "27.772"
> format(h,digits=3)
[1] " 4.57" " 8.98" "27.77"
> format(h,digits=3,trim=TRUE)
[1] "4.57" "8.98" "27.77"
>
> #sprintf() function
> x<-c(1,2,3)
> sprintf("The number %d in the list =%f",x,h)
[1] "The number 1 in the list =4.569000"
[2] "The number 2 in the list =8.981000"
[3] "The number 3 in the list =27.772000"
> #cat()function with special characters such as \t,\n.\\,\\",\,'
> cat("Black\tBerry",fill=TRUE)
Black Berry
> cat("Black\nBerry",fill=TRUE)
Black
Berry
> cat("Black\\Berry",fill=TRUE)
Black\Berry
> cat("Black\"Berry",fill=TRUE)
Black"Berry
> cat("Black\'Berry",fill=TRUE)
Black'Berry
> cat('Black"Berry',fill=TRUE)
Black"Berry
> cat("Black'Berry",fill=TRUE)
Black'Berry
>
> #toupper(),tolower(),substring()
> toupper("The cat is on the wall")
[1] "THE CAT IS ON THE WALL"
> tolower("The cat is on the wall")
[1] "the cat is on the wall"
> substring("The cat is on the wall",3,10)
[1] "e cat is"
> substr("The cat is on the wall",3,10)
[1] "e cat is"
> substring("The cat is on the wall",5,10)
[1] "cat is"
>
> #strsplit() function
> strsplit("I like Banana, Orange and Pineapple"," ")
[[1]]
[1] "I" "like" "Banana," "Orange"
[5] "and" "Pineapple"

> typeof(strsplit("I like Banana, Orange and Pineapple"," "))


[1] "list"
#Date and Time
> date1<-strptime("22:15:45 22/08/2015", "%H:%M:%S %d/%m/%Y")
> date1
[1] "2015-08-22 22:15:45 IST"
> date1<-strptime("22:15:45 22/08/2015", "%H:%M:%S %d-%m/%Y")
> date1
[1] NA

> strftime(Sys.Date(),"It's %l:%M:%p on %A %d %B, %Y.")


[1] "It's 12:00:AM on Thursday 11 January, 2024."

> Sys.timezone()
[1] "Asia/Calcutta"

> Sys.getlocale("LC_TIME")
[1] "English_United States.1252"

> strftime(Sys.time(),tz="UTC")
[1] "2024-01-11 04:39:03"
> strftime(Sys.time(),tz="IST")
[1] "2024-01-11 04:39:32"
Warning message:
In as.POSIXlt.POSIXct(x, tz = tz) : unknown timezone 'IST'
> strftime(Sys.time(),tz="UTC-5")
[1] "2024-01-11 09:40:07"
Warning message:
In as.POSIXlt.POSIXct(x, tz = tz) : unknown timezone 'UTC-5'

Calculation with Dates


> ct<-as.POSIXct(Sys.time())
> lt<-as.POSIXlt(Sys.time())
> dt<-as.Date(Sys.time())
> ct
[1] "2024-01-11 10:14:56 IST"
> ct+2500
[1] "2024-01-11 10:56:36 IST"
> lt
[1] "2024-01-11 10:15:17 IST"
> lt+20
[1] "2024-01-11 10:15:37 IST"
> dt
[1] "2024-01-11"
> dt+7
[1] "2024-01-18"

Difference between two times

> dt1<-as.Date("10/10/2020","%d/%m/%Y")
> dt2<-as.Date("11/01/2024","%d/%m/%Y")

> dt1
[1] "2020-10-10"

> dt2
[1] "2024-01-11"

> dif<-dt2-dt1
> dif
Time difference of 1188 days

> dt1+dt2
Error in `+.Date`(dt1, dt2) : binary + is not defined for "Date" objects

> difftime(dt2,dt1)
Time difference of 1188 days

> difftime(dt2,dt1,units="mins")
Time difference of 1710720 mins

> difftime(dt2,dt1,units="sec")
Time difference of 102643200 secs

> difftime(dt2,dt1,units="hours")
Time difference of 28512 hours

> difftime(dt2,dt1,units="days")
Time difference of 1188 days
> difftime(dt2,dt1,units="weeks")
Time difference of 169.7143 weeks

Seq() function

> seq(dt1,dt2,by="1 year")


[1] "2020-10-10" "2021-10-10" "2022-10-10" "2023-10-10"

> seq(dt1,dt2,by="500 days")


[1] "2020-10-10" "2022-02-22" "2023-07-07"

> mean(seq(dt1,dt2,by="1 year"))


[1] "2022-04-10"

> summary(seq(dt1,dt2,by="1 year"))


Min. 1st Qu. Median Mean 3rd Qu.
"2020-10-10" "2021-07-10" "2022-04-10" "2022-04-10" "2023-01-09"
Max.
"2023-10-10"

> install.packages("lubridate")
> library(lubridate)
> ymd("2000/09/25","2000-9-25","2000*9.25")
[1] "2000-09-25" "2000-09-25" "2000-09-25"

> dt_format<-stamp("I purchased on sunday,the 10th of october 2023 at 6:00:00pm")
> dt1<-strptime("2024-01-11 12:15:00","%Y-%m-%d %H:%M:%S")
> dt1
[1] "2024-01-11 12:15:00 IST"
> dt_format(dt1)
[1] "I purchased on Thursday,the 11th of January 2024 at 12:15:00PM"

> y<-dyears(1:5)
> y
[1] "31557600s (~1 years)" "63115200s (~2 years)"
[3] "94672800s (~3 years)" "126230400s (~4 years)"
[5] "157788000s (~5 years)"

> w<-dweeks(1:4)
> w
[1] "604800s (~1 weeks)" "1209600s (~2 weeks)" "1814400s (~3 weeks)"
[4] "2419200s (~4 weeks)"
> d<-ddays(1:10)
> d
[1] "86400s (~1 days)" "172800s (~2 days)"
[3] "259200s (~3 days)" "345600s (~4 days)"
[5] "432000s (~5 days)" "518400s (~6 days)"
[7] "604800s (~1 weeks)" "691200s (~1.14 weeks)"
[9] "777600s (~1.29 weeks)" "864000s (~1.43 weeks)"
> today()
[1] "2024-01-11"

> today()+y
[1] "2025-01-10 06:00:00 UTC" "2026-01-10 12:00:00 UTC"
[3] "2027-01-10 18:00:00 UTC" "2028-01-11 00:00:00 UTC"
[5] "2029-01-10 06:00:00 UTC"

> y<-years(1:7)
> y
[1] "1y 0m 0d 0H 0M 0S" "2y 0m 0d 0H 0M 0S" "3y 0m 0d 0H 0M 0S"
[4] "4y 0m 0d 0H 0M 0S" "5y 0m 0d 0H 0M 0S" "6y 0m 0d 0H 0M 0S"
[7] "7y 0m 0d 0H 0M 0S"

> today()+y
[1] "2025-01-11" "2026-01-11" "2027-01-11" "2028-01-11" "2029-01-11"
[6] "2030-01-11" "2031-01-11"

> yr<-dyears(5)
> yr
[1] "157788000s (~5 years)"

> as.period(yr)
[1] "5y 0m 0d 0H 0M 0S"
# Data Preparation
# Load the `mammals` dataset from the MASS package into the workspace.
data("mammals", package = "MASS")

# Show the first rows of the dataset.
head(mammals)

body brain
Arctic fox 3.385 44.5
Owl monkey 0.480 15.5
Mountain beaver 1.350 8.1
Cow 465.000 423.0
Grey wolf 36.330 119.5
Goat 27.660 115.0

# Read the text file into a character vector, one element per line.
test1 <- readLines(con = "C:/SPB_DATA/my_txt.txt")
> test1
[1] "the first line" "the second line" "the third line"

# Write a single line of text to a new file.
writeLines(text = "the third line", con = "C:/SPB_DATA/my_txt1.txt")

# Build a small data frame whose `d` column holds "Y"/"N" flags.
a <- c(1, 2, 3)
b <- c("a", "b", "c")
d <- c("Y", "N", "Y")
df1 <- data.frame(a, b, d)
df1

# Convert a vector of "Y"/"N" flags to logical values.
#   x: vector of flags (character or factor).
# Returns a logical vector of the same length: TRUE for "Y", FALSE for "N",
# and NA for anything else.
# The braces are required: without them only the first statement becomes the
# function body and the remaining assignments run at top level.
convt <- function(x) {
  y <- rep(NA, length(x))
  y[x == "Y"] <- TRUE
  y[x == "N"] <- FALSE
  y
}

# Replace the flag column with its logical equivalent.
df1$d <- convt(df1$d)
df1

a b d
1 1 a Y
2 2 b N
3 3 c Y

a b d
1 1 a TRUE
2 2 b FALSE
3 3 c TRUE

# Pattern matching on a single string.
# grep() returns the indices of matching elements, grepl() a logical vector.
grep(pattern = "my", x = "This is my pen")
grepl(pattern = "my", x = "This is my pen")

# sub() replaces the first match, gsub() replaces every match.
sub(pattern = "my", replacement = "your", x = "This is my pen")
gsub(pattern = "my", replacement = "your", x = "This is my pen")

> grep('my','This is my pen')


[1] 1
> grepl('my','This is my pen')
[1] TRUE
> gsub('my','your','This is my pen')
[1] "This is your pen"
Manipulating Data Frame

with(),within(),sort(),order()
> name<-c("Jhon","Peter","Mark")
> start_date<-c("1980-10-10","1999-12-12","1900-04-05")
> end_date<-c("1989-03-08","2004-04-20","2000-09-25")
> service<-data.frame(name,start_date,end_date)
> service
name start_date end_date
1 Jhon 1980-10-10 1989-03-08
2 Peter 1999-12-12 2004-04-20
3 Mark 1900-04-05 2000-09-25

> service$period<-as.Date(service$end_date)-as.Date(service$start_date)
> service

name start_date end_date period


1 Jhon 1980-10-10 1989-03-08 3071 days
2 Peter 1999-12-12 2004-04-20 1591 days
3 Mark 1900-04-05 2000-09-25 36698 days

The same can be achieved using the function with()


> service$period<-with(service,as.Date(end_date)-as.Date(start_date))
> service
name start_date end_date period
1 Jhon 1980-10-10 1989-03-08 3071 days
2 Peter 1999-12-12 2004-04-20 1591 days
3 Mark 1900-04-05 2000-09-25 36698 days

Within() function can be used to add multiple columns to the dataframe.

# Derive two columns in a single within() call: the length of service and a
# flag marking lengths above 2000.
service <- within(service, {
  period <- as.Date(end_date) - as.Date(start_date)
  highperiod <- period > 2000
})
service

Output
name start_date end_date period highperiod
1 Jhon 1980-10-10 1989-03-08 3071 days TRUE
2 Peter 1999-12-12 2004-04-20 1591 days FALSE
3 Mark 1900-04-05 2000-09-25 36698 days TRUE

Sort() and order()function

# sort() returns the sorted values, in either direction.
x <- c(5, 10, 3, 15, 6, 8)
sort(x)
sort(x, decreasing = TRUE)

# Character vectors are sorted as well.
y <- c("X", "AB", "Deer", "For", "Moon")
sort(y)
sort(y, decreasing = TRUE)

# order() returns the index permutation; indexing with it reproduces sort().
order(x)
x[order(x)]
identical(sort(x), x[order(x)])
Output
> x<-c(5,10,3,15,6,8)
> sort(x)
[1] 3 5 6 8 10 15
> sort(x,decreasing = TRUE)
[1] 15 10 8 6 5 3
> y<-c("X","AB","Deer","For","Moon")
> sort(y)
[1] "AB" "Deer" "For" "Moon" "X"
> sort(y,decreasing = TRUE)
[1] "X" "Moon" "For" "Deer" "AB"
> order(x)
[1] 3 1 5 6 2 4
> x[order(x)]
[1] 3 5 6 8 10 15
> identical(sort(x),x[order(x)])
[1] TRUE

Order() function is more useful than the sort() function as it can be used to manipulate the data frame
easily.
# Build the service table from three parallel vectors.
name <- c("Jhon", "Peter", "Mark")
start_date <- c("1980-10-10", "1999-12-12", "1900-04-05")
end_date <- c("1989-03-08", "2004-04-20", "2000-09-25")
service <- data.frame(name, start_date, end_date)
service

# order() gives the row permutation for the start dates; indexing the rows
# with it yields the data frame sorted by start date.
startdt <- order(service$start_date)
service.ordered <- service[startdt, ]
service.ordered

output

> name<-c("Jhon","Peter","Mark")
> start_date<-c("1980-10-10","1999-12-12","1900-04-05")
> end_date<-c("1989-03-08","2004-04-20","2000-09-25")
> service<-data.frame(name,start_date,end_date)
> service
name start_date end_date
1 Jhon 1980-10-10 1989-03-08
2 Peter 1999-12-12 2004-04-20
3 Mark 1900-04-05 2000-09-25
> startdt<-order(service$start_date)
> service.ordered<-service[startdt,]
> service.ordered
name start_date end_date
3 Mark 1900-04-05 2000-09-25
1 Jhon 1980-10-10 1989-03-08
2 Peter 1999-12-12 2004-04-20

Data Reshaping-Cbind() and rbind()


# cbind() on equal-length character vectors gives a character matrix with
# one column per vector.
city <- c("Mandya", "Mysore", "Chennai")
state <- c("KA", "KA", "TN")
zipcode <- c("571401", "570001", "600001")
address <- cbind(city, state, zipcode)
address

# Additional rows, held in a data frame with the same column names.
new.address <- data.frame(
  city = c("Banglore", "Coimbatore"),
  state = c("KA", "TN"),
  zipcode = c("530068", "631027"),
  stringsAsFactors = FALSE
)
print(new.address)

# rbind() stacks the new rows under the existing ones.
all.address <- rbind(address, new.address)
all.address

> city<-c('Mandya','Mysore','Chennai')
> state<-c('KA','KA','TN')
> zipcode<-c('571401','570001','600001')
> address<-cbind(city,state,zipcode)
> address
city state zipcode
[1,] "Mandya" "KA" "571401"
[2,] "Mysore" "KA" "570001"
[3,] "Chennai" "TN" "600001"
>
> new.address<-data.frame(
+ city=c('Banglore','Coimbatore'),
+ state=c('KA','TN'),
+ zipcode=c('530068','631027'),
+ stringsAsFactors = FALSE)
> print(new.address)
city state zipcode
1 Banglore KA 530068
2 Coimbatore TN 631027
> all.address<-rbind(address,new.address)
> all.address
city state zipcode
1 Mandya KA 571401
2 Mysore KA 570001
3 Chennai TN 600001
4 Banglore KA 530068
5 Coimbatore TN 631027

Grouping Function
apply(),lapply(),sapply(),vapply(),mapply(),rapply(),tapply()
# apply() on a 4 x 4 matrix: margin 1 works over rows, margin 2 over columns.
M <- matrix(seq(1, 16), nrow = 4, ncol = 4)
M
apply(M, 1, min)
apply(M, 2, max)

# apply() on a 4 x 4 x 2 array: margin 1 sums each row across both slices,
# margin c(1, 2) sums the two slices cell by cell.
M <- array(seq(32), dim = c(4, 4, 2))
M
apply(M, 1, sum)
apply(M, c(1, 2), sum)

output
> M<-matrix(seq(1,16),4,4)
> M
[,1] [,2] [,3] [,4]
[1,] 1 5 9 13
[2,] 2 6 10 14
[3,] 3 7 11 15
[4,] 4 8 12 16
> apply(M,1,min)
[1] 1 2 3 4
> apply(M,2,max)
[1] 4 8 12 16
> M<-array(seq(32),dim=c(4,4,2))
> M
, , 1

[,1] [,2] [,3] [,4]


[1,] 1 5 9 13
[2,] 2 6 10 14
[3,] 3 7 11 15
[4,] 4 8 12 16

, , 2

[,1] [,2] [,3] [,4]


[1,] 17 21 25 29
[2,] 18 22 26 30
[3,] 19 23 27 31
[4,] 20 24 28 32

> apply(M,1,sum)
[1] 120 128 136 144

[(1+5+9+13+17+21+25+29) (2+6+10+14+18+22+26+30) (3+7+11+15+19+23+27+31) (4+8+12+16+20+24+28+32)]

> apply(M,c(1,2),sum)
[,1] [,2] [,3] [,4]
[1,] 18 26 34 42
[2,] 20 28 36 44
[3,] 22 30 38 46
[4,] 24 32 40 48

[17+1, 21+5,25+9,29+13, ……..]

lapply()
# lapply() calls the function on every element of the list and returns a
# list with the same names.
x <- list(a = 1, b = 1:3, c = 10:100)
x
lapply(x, FUN = length)
lapply(x, FUN = sum)

> #lapply()
> x<-list(a=1,b=1:3,c=10:100)
> x
$a
[1] 1

$b
[1] 1 2 3

$c
[1] 10 11 12 13 14 15 16 17 18 19 20 21
[13] 22 23 24 25 26 27 28 29 30 31 32 33
[25] 34 35 36 37 38 39 40 41 42 43 44 45
[37] 46 47 48 49 50 51 52 53 54 55 56 57
[49] 58 59 60 61 62 63 64 65 66 67 68 69
[61] 70 71 72 73 74 75 76 77 78 79 80 81
[73] 82 83 84 85 86 87 88 89 90 91 92 93
[85] 94 95 96 97 98 99 100

> lapply(x, FUN=length)


$a
[1] 1

$b
[1] 3

$c
[1] 91

> lapply(x,FUN=sum)
$a
[1] 1

$b
[1] 6

$c
[1] 5005

sapply() function

# sapply() works like lapply() but simplifies the result to a named vector
# when every call returns a single value.
x <- list(a = 1, b = 1:3, c = 10:100)
x
sapply(x, FUN = length)
sapply(x, FUN = sum)

> #sapply()
> x<-list(a=1,b=1:3,c=10:100)
> x
$a
[1] 1

$b
[1] 1 2 3

$c
[1] 10 11 12 13 14 15 16 17 18 19 20 21
[13] 22 23 24 25 26 27 28 29 30 31 32 33
[25] 34 35 36 37 38 39 40 41 42 43 44 45
[37] 46 47 48 49 50 51 52 53 54 55 56 57
[49] 58 59 60 61 62 63 64 65 66 67 68 69
[61] 70 71 72 73 74 75 76 77 78 79 80 81
[73] 82 83 84 85 86 87 88 89 90 91 92 93
[85] 94 95 96 97 98 99 100

> sapply(x, FUN=length)


a b c
1 3 91
> sapply(x,FUN=sum)
a b c
1 6 5005

vapply()
# vapply() is sapply() with a declared result type: every call must return
# exactly one integer here.
x <- list(a = 1, b = 1:3, c = 10:100)
x
vapply(x, FUN = length, FUN.VALUE = integer(1))

> #vapply()
> x<-list(a=1,b=1:3,c=10:100)
> x
$a
[1] 1

$b
[1] 1 2 3

$c
[1] 10 11 12 13 14 15 16 17 18 19 20 21
[13] 22 23 24 25 26 27 28 29 30 31 32 33
[25] 34 35 36 37 38 39 40 41 42 43 44 45
[37] 46 47 48 49 50 51 52 53 54 55 56 57
[49] 58 59 60 61 62 63 64 65 66 67 68 69
[61] 70 71 72 73 74 75 76 77 78 79 80 81
[73] 82 83 84 85 86 87 88 89 90 91 92 93
[85] 94 95 96 97 98 99 100
> vapply(x, FUN=length,FUN.VALUE=0L)
a b c
1 3 91

mapply() function
> mapply(sum,1:5,1:5,1:5)
[1] 3 6 9 12 15

[1+1+1 2+2+2 3+3+3 4+4+4 5+5+5]

> mapply(rep,1:4,4:1)
[[1]]
[1] 1 1 1 1

[[2]]
[1] 2 2 2

[[3]]
[1] 3 3

[[4]]
[1] 4

rapply()
# Leaf transformer for rapply(): a character value gets " !" appended,
# any other value is incremented by one.
myFun <- function(x) {
  if (!is.character(x)) {
    return(x + 1)
  }
  paste(x, "!", sep = " ")
}
# Nested list mixing character and numeric leaves at several depths.
l <- list(
  a = list(a1 = "Boo", b1 = 2, c1 = "Eeeks"),
  b = 3,
  c = "Yikes",
  d = list(a2 = 1, b2 = list(a3 = "Hey", b3 = 5))
)
# Apply myFun to every leaf of the nested list.
rapply(l, myFun)

output
a.a1 a.b1 a.c1 b c
"Boo !" "3" "Eeeks !" "4" "Yikes !"

d.a2 d.b2.a3 d.b2.b3


"2" "Hey !" "6"

> rapply(l,myFun,how='replace')
$a
$a$a1
[1] "Boo !"

$a$b1
[1] 3

$a$c1
[1] "Eeeks !"

$b
[1] 4

$c
[1] "Yikes !"

$d
$d$a2
[1] 2

$d$b2
$d$b2$a3
[1] "Hey !"
$d$b2$b3
[1] 6

tapply() function
# tapply() applies a function to the values of `x` grouped by the factor `y`:
# twenty numbers, five groups of four, summed per group.
x <- 1:20
x
y <- factor(rep(letters[1:5], each = 4))
y
tapply(x, y, sum)

output
> x<-1:20

> x
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20

> y<-factor(rep(letters[1:5],each=4))

> y
[1] a a a a b b b b c c c c d d d d e e e e
Levels: a b c d e

> tapply(x,y,sum)
a b c d e
10 26 42 58 74

by() function

> cta<-tapply(iris$Sepal.Width,iris$Species,summary)
> cba<-by(iris$Sepal.Width,iris$Species,summary)
> cta
$setosa
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.300 3.200 3.400 3.428 3.675 4.400

$versicolor
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 2.525 2.800 2.770 3.000 3.400

$virginica
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.200 2.800 3.000 2.974 3.175 3.800

> cba
iris$Species: setosa
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.300 3.200 3.400 3.428 3.675 4.400
------------------------------------------------------------
iris$Species: versicolor
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 2.525 2.800 2.770 3.000 3.400
------------------------------------------------------------
iris$Species: virginica
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.200 2.800 3.000 2.974 3.175 3.800

aggregate() function

# Mean sepal length per species, computed two ways: tapply() returns a named
# vector, aggregate() returns a data frame with one row per group.
att <- tapply(iris$Sepal.Length, iris$Species, mean)
agt <- aggregate(iris$Sepal.Length, list(iris$Species), mean)
att
agt

output
> att<-tapply(iris$Sepal.Length,iris$Species,mean)
> agt<-aggregate(iris$Sepal.Length,list(iris$Species),mean)
> att
setosa versicolor virginica
5.006 5.936 6.588
> agt
Group.1 x
1 setosa 5.006
2 versicolor 5.936
3 virginica 6.588

You might also like