0% found this document useful (0 votes)
27 views

R Functions

This document provides an introduction to working with data in R, including reading, cleaning, manipulating, and writing different types of data. It covers: 1) Reading different types of data like CSV files, text files, and built-in datasets into data frames and vectors. It demonstrates subsetting, aggregating and summarizing data. 2) Working with dates and times, converting between classes, and extracting components using formats. 3) Cleaning data by handling missing values, NAs, NaNs and Infs. 4) Writing data out in formats like CSV, text files and RDS. It also introduces dplyr and data.table packages for filtering, slicing and subsetting data

Uploaded by

Owais Shaikh
Copyright
© All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
27 views

R Functions

This document provides an introduction to working with data in R, including reading, cleaning, manipulating, and writing different types of data. It covers: 1) Reading different types of data like CSV files, text files, and built-in datasets into data frames and vectors. It demonstrates subsetting, aggregating and summarizing data. 2) Working with dates and times, converting between classes, and extracting components using formats. 3) Cleaning data by handling missing values, NAs, NaNs and Infs. 4) Writing data out in formats like CSV, text files and RDS. It also introduces dplyr and data.table packages for filtering, slicing and subsetting data

Uploaded by

Owais Shaikh
Copyright
© All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 8

Session-1 (Intro & Quick start)

# working dir
# relative paths used later in the script (e.g. "./data/...") resolve against this folder
setwd("D:/R-BA")
# clear environment & history
# remove every object from the current environment
rm(list = ls())
# clear the command history: write a scratch file holding an empty string,
# load it as the history, then delete the scratch file
write("", file=".blank")
loadhistory(".blank")
unlink(".blank")
# Inspection functions introduced in this session. Evaluating a bare function
# name (no parentheses) prints the function's definition; it does not call it.
print
class
length
is.numeric
is.integer
is.character
is.vector
# vector of integers (the L suffix marks an integer literal)
c1 <- c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L)
# vector of numeric / real numbers
c2 <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)
# vector of numeric / real numbers
c3 <- c(1.1, 2.2, 3.3, 4.4, 5.5)
# vector of integers built with the : operator
c4 <- c(1:20)
# vector of strings
c5 <- c("aaa", "bbb", "ccc", "ddd", "xxx", "yyy", "zzz")
# type-conversion functions (a bare name prints the function's definition)
as.numeric
as.logical
as.character
# vectorised arithmetic: the scalar on the right is applied to every element of c1
c3 <- c1 + 2
c3 <- c1 - 2
c3 <- c1 * 2
c3 <- c1 / 2
# recycling rule - if two vectors are of unequal length, the shorter
# vector will be recycled in order to match the longer vector
# list
# NOTE(review): x and y are not defined anywhere above this line in the visible
# script (x is first assigned in the Subsetting section) - confirm where they
# should come from, otherwise this call fails with "object not found".
lst <- list(c1,c2,x,y)
is.list
# factors
# labels are matched to the sorted distinct values: 0 -> "private", 1 -> "public"
vct <- c(1,0,1,0,0,0,0,1,1,1)
vct.f <- factor(vct, labels = c("private", "public"))

is.factor
# Matrices
m <- matrix(1:6, nrow = 2, ncol = 3)

# how matrix is populated using a vector & dim():
# assigning dim() to a plain vector reshapes it, filling column by column.
# The vector must hold exactly nrow * ncol values, so a 10-element vector is
# used here (the 6-element matrix above cannot be given dim c(2, 5)).
m <- 1:10
dim(m) <- c(2, 5)

# NOTE(review): m1 was never defined in the original listing; it is assumed to
# be the 2 x 5 matrix built above - confirm.
m1 <- m

# arithmetic between a matrix and a scalar is element-wise
i <- 2L
m2 <- m1 + i
m2 <- m1 - i
m2 <- m1 * i
m2 <- m1 / i

# Matrix multiplication needs ncol(left) == nrow(right). m1 and m2 are both
# 2 x 5, so m2 is transposed to make the operands conformable (result is 2 x 2).
m3 <- m1 %*% t(m2)
# %/% is element-wise integer division, not a matrix operation; R has no
# matrix "division" operator (an inverse is obtained with solve()).
m3 <- m1 %/% m2
# data frames
dfr <- data.frame(foo = 1:4, bar = c(TRUE, TRUE, FALSE, FALSE))
class(dfr)
names(dfr)
row.names(dfr)
# attributes() gives information about the whole table: $names, $row.names, $class
attributes(dfr)
nrow(dfr)
ncol(dfr)
# Subsetting
x <- c("a", "b", "c", "d", "e", "f", "a")
x[1]
x[2]
x[1:4]
# logical subsetting: keep the elements that sort after "c"
x[x > "c"]
dfr <- data.frame(
  foo = 1:9,
  bar = c(TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, NA),
  buf = rep("string", 9)
)
rep("string", 9)

# subsetting dataframes cols
# show column foo
dfr$foo
# show column bar
dfr$bar
# get the first col
dfr[, 1]
# get the second col
dfr[, 2]
# get last col as ncol will give the total no of col
dfr[, ncol(dfr)]

# subsetting dataframes row
# get the first row
dfr[1, ]
# get the third row
dfr[3, ]
# get last row
dfr[nrow(dfr), ]
# get the row with name '1'
dfr["1", ]
# get the row with name '2'
dfr["2", ]
# get the rows 1 to 2
dfr[1:2, ]
# get all rows except 1
dfr[-1, ]
# drop rows 3 to 7 (keeps rows 1 to 2 and 8 to 9)
dfr[-3:-7, ]
# get the cols 1 to 2
dfr[, 1:2]
# get all cols except 1
dfr[, -1]
# get all cols except 2 to 3 (a single remaining column is returned as a vector)
dfr[, -2:-3]
# get all cols except 3
dfr[, -3:-3]

# 10 random draws from a uniform distribution between 0 and 10
x <- runif(10, min=0, max=10)


x
length(x)
# basic descriptive statistics of x
summary(x)
sum(x)
min(x)
max(x)
median(x)
# NOTE: mode() reports the storage mode of x (e.g. "numeric"), not the statistical mode
mode(x)
sd(x)
# rounding helpers
round(x)
floor(x)
ceiling(x)
trunc(x)

Session-2 (Reading & Checking data)


# check NA, NaN, Inf
numX <- NA
is.na(numX)
# check NaN
numX <- 0 / 0
# is.na() is also TRUE for NaN; is.nan() is TRUE only for NaN
is.na(numX)
is.nan(numX)
# check Inf
numX <- 1 / 0
# an infinite value is not missing, so this is FALSE
is.na(numX)
is.infinite(numX)
# cleaning NAs from single vector
# Sample input, used only when vciX / vcsY have not already been defined
# (the original listing never defines them).
if (!exists("vciX")) {
  vciX <- c(1L, 2L, NA, 4L, NA, 5L)
}
if (!exists("vcsY")) {
  vcsY <- c("a", "b", NA, "d", NA, "f")
}
# logical mask: TRUE where vciX is NA
vcbBad <- is.na(vciX)
# keep only the non-missing values
vciX[!vcbBad]
# logical mask: TRUE at positions where both vectors have a value
vcbGood <- complete.cases(vciX, vcsY)
# elements of each vector at the positions that are complete in both
vciX[vcbGood]
vcsY[vcbGood]
# Basic Functions with Data Frames
names(dfr)
head(dfr)
tail(dfr)
# length() of a data frame is its number of columns;
# length() of a single column is the number of rows
length(dfr)
length(dfr$foo)
nrow(dfr)
ncol(dfr)
attributes(dfr)
# NOTE(review): if dfr is still the 9-row frame whose bar column contains an NA,
# these aggregates return NA unless na.rm = TRUE is passed - confirm intent.
sum(dfr$bar)
min(dfr$bar)
max(dfr$bar)
median(dfr$bar)
# mode() reports the storage mode of the column, not the statistical mode
mode(dfr$bar)
sd(dfr$bar)
summary(dfr)
# one-row data frame holding the standard deviation of each column
# NOTE(review): buf holds text, so sd() has no meaningful value for it - confirm
data.frame(foo.sd=sd(dfr$foo),bar.sd=sd(dfr$bar),buf.sd=sd(dfr$buf))
# Ready To Use R Datasets
library(datasets)
# list the datasets that are available
data()
# use airquality dataset
airquality
head(airquality)
nrow(airquality)

# get dataset of complete rows
# logical vector: TRUE for every row that has no missing value in any column
vcbGoodAir <- complete.cases(airquality)
# use the logical vector as a row index to keep only the complete rows
dfrGoodAir <- airquality[vcbGoodAir, ]
# use read.csv
dfrNifty <- read.csv("./data/nifty-data.csv", header = TRUE, stringsAsFactors = FALSE)
head(dfrNifty)
View(dfrNifty)
attributes(dfrNifty)
summary(dfrNifty)

nrow(dfrNifty)
ncol(dfrNifty)
# readLines: returns a character vector with one element per line of the file
vcsUNProfile <- readLines("./data/un-profile.txt")
head(vcsUNProfile)
length(vcsUNProfile)
# view files
file.show("session-1.r")
file.show("./data/un-profile.txt")
file.show("./data/nifty-data.csv")
# readLines to read URL
# (the URL in the original listing had a link-rewriter host spliced into it)
conGoogle <- url("http://www.google.com/", "r")
# close the connection even when reading fails
vcsGoogle <- tryCatch(readLines(conGoogle), finally = close(conGoogle))
length(vcsGoogle)
head(vcsGoogle, 10)
vcsGoogle[7]
#
#
#
#
#
#
#
#
#

Control Structures
Control Structures if, else (Testing a condition)
Control Structures for (execute a loop fixed number of times)
Control Structures while (execute a loop while a condition is true)
Control Structures repeat (execute an infinite loop)
Control Structures break (break the execution of the loop)
Control Structures next (skip an iteration of a loop)
Control Structures return (exit a function)
User Defined Function

# simple function without return statement


# Adds numA and numB. The value of the last evaluated expression is what the
# function returns, so no explicit return() call is needed.
addNumbers <- function(numA, numB) {
  numA + numB
}
addNumbers(1, 2)
# simple function with two return statement
# Adds numA and numB (numB defaults to 0).
# Returns NA when both operands are zero, otherwise their sum.
addNumbers <- function(numA, numB = 0) {
  # first exit: both operands are zero
  bothZero <- (numA == 0) && (numB == 0)
  if (bothZero) {
    return(NA)
  }
  # second exit: the regular sum
  return(numA + numB)
}
addNumbers(0)
addNumbers(1)
addNumbers(1, 2)

Session-3 (Writing data / Date-time functions)


# write csv
write.csv(dfr, "filename.csv", row.names = FALSE)
file.show("filename.csv")
# write lines after reading text file
# relative path, resolved against the working directory like the earlier read
# of this same file, instead of a machine-specific absolute path
vcsUNProfile <- readLines("./data/un-profile.txt")
# keep only the first 50 characters of every line
vcsUNProfile <- substr(vcsUNProfile, 1, 50)
writeLines(vcsUNProfile, "filename.txt")
file.show("filename.txt")

# write rds (R Data Set)
# every column is generated with 100 values, so nothing is silently recycled
# and no stray variable is assigned inside the data.frame() call
dfr <- data.frame(
  foo = 1:100,
  bar = runif(100, min = 0, max = 10),
  buf = rnorm(100)
)
saveRDS(dfr, "filename.rds")
# an .rds file is a binary serialization, so read it back and inspect the
# object rather than paging the file as text
str(readRDS("filename.rds"))
# date yyyy-mm-dd hh:mm:ss
# (a Date holds the calendar date only; the time part of the string is not kept)
datX <- as.Date("2015-10-01 13:45:32")
class(datX)
# POSIXlt yyyy-mm-dd hh:mm:ss
xltX <- as.POSIXlt("2015-10-01 13:45:32")
class(xltX)
# POSIXct yyyy-mm-dd hh:mm:ss
xctX <- as.POSIXct("2015-10-01 13:45:32")
class(xctX)
# current date-time and its class
Sys.time()
class(Sys.time())
# extract date / time using format()
# standard date format
format(Sys.time(), "%c")
# mm/dd/yy format
format(Sys.time(), "%D")
# yyyy-mm-dd - iso 8601 format
format(Sys.time(), "%F")
# day of week (abbreviated, then full name)
format(Sys.time(), "%a")
format(Sys.time(), "%A")
# day of month
format(Sys.time(), "%d")
# month (number, abbreviated name, full name)
format(Sys.time(), "%m")
format(Sys.time(), "%b")
format(Sys.time(), "%B")
# year (two digits, then four digits)
format(Sys.time(), "%y")
format(Sys.time(), "%Y")
# full date
format(Sys.time(), "%a %d-%b-%Y")
format(Sys.time(), "%a %d-%m-%Y")

# extract the time part using format()


# locale-specific time
format(Sys.time(), "%X")
# time to second accuracy
format(Sys.time(), "%H:%M:%S")
# time to sub-second accuracy (if supported by the OS)
format(Sys.time(), "%H:%M:%OS3")
# locale-specific version of date / time
format(Sys.time(), "%a %b %d %Y %X %Z")

Session-4 (Data manipulation using dplyr)

# Install the packages only when they are missing, then attach the two that
# provide the functions used below. Without library(dplyr), filter() resolves
# to stats::filter(), and data.table() / slice() are not found at all.
for (pkg in c("dplyr", "tidyr", "data.table")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(data.table)  # data.table()
library(dplyr)       # filter(), slice()

# relative path, resolved against the working directory like the earlier read
# of this same file, instead of a machine-specific absolute path
dfrNifty <- read.csv("./data/nifty-data.csv", header = TRUE, stringsAsFactors = FALSE)
dfrNifty <- data.table(dfrNifty)
# filter
dfrNifty.ACC <- filter(dfrNifty, Symbol == "ACC")
nrow(dfrNifty.ACC)
View(dfrNifty.ACC)
# subset of rows based on AND condition
# (columns are referred to by bare name inside filter())
dfrNifty.Filt <- filter(dfrNifty, DateDate >= "2014-12-01" & DateDate <= "2014-12-05")
# subset of rows based on OR condition
dfrNifty.Filt <- filter(dfrNifty, DateDate == "2014-12-01" | DateDate == "2014-12-05")
# subset of rows based on "starts-with" condition ... start search string with ^
dfrNifty.Tmp <- slice(dfrNifty, 1:50)
dfrNifty.Filt <- filter(dfrNifty.Tmp, grepl("^TATA", NameOfTheSecurityInNse))
# subset of rows based on "ends-with" condition ... end search string with $
dfrNifty.Tmp <- slice(dfrNifty, 1:50)
dfrNifty.Filt <- filter(dfrNifty.Tmp, grepl("LTD$", NameOfTheSecurityInNse))
# subset of rows based on a negated "contains" condition: the leading ! keeps
# the rows whose Symbol does NOT match the pattern
# NOTE(review): the original comment said "contains" - confirm whether the
# negation is intended.
dfrNifty.Tmp <- slice(dfrNifty, 1:50)
dfrNifty.Filt <- filter(dfrNifty.Tmp, !grepl("BANK+", Symbol))
# subset of rows by position or row-range ... last 10
to <- nrow(dfrNifty)
# clamp the start at 1 so a table with fewer than 10 rows does not produce
# a zero or negative index
fr <- max(1L, to - 9L)
dfrNifty.Slcd <- slice(dfrNifty, fr:to)

You might also like