0% found this document useful (0 votes)

60 views13 pages

R Studio

This document summarizes key R programming concepts including vectors, matrices, data frames, and lists. It shows how to create, manipulate, and perform operations on various data types in R. Functions like c(), vector(), matrix(), data.frame() are used to construct different data structures. Operations like subsetting, binding, arithmetic, and coercion between different types are demonstrated.

Uploaded by

N K

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

60 views13 pages

R Studio

Uploaded by

N K

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 13

rm(list=ls()) ##create vectors x <- 1:20

#always creates an integer vector

#Assign a variable with an x
integer value a <- 10L a #class and length of a
is.integer(a) #to check whether the value is vector class(x)
integer or not length(x)

#using c() x <- c(0.1,0.2)

#character type
##numeric vector x <-
str <- 'R
c(TRUE,FALSE) ##logical
programming'
vector x <- c(T,F) ##logical
str s <-
vector x <- c("A","B","C")
"cse3505 -"
##character vector x <-
s class(s)
c(1L,2L,15L,27L) ##integer
#some useful functions vector x x <- c(1+2i,3)
paste(s,str) sprintf("%s has ##complex vector
scored %d marks","Sita",90)
#using
substr(str,start=5,stop=10)
vector() x
sub("e","C",str) <- vector()
str x length(x)
print(str) class(x)

#complex type x <-

cmp <- 21+10i vector("character",length
sqrt(-1) sqrt(-1+0i) = 10) x
sqrt(as.complex(-1)) #explicit type #Implicit type coercion -
conversion mixed objects y <- c(1.5,"a")
#character y y <- c(1.5,TRUE)
#numeric y

#logical type lg
<- TRUE y <- c(TRUE,"a")
p=TRUE;q=FAL #character y
SE
p&q;p|q;!p #Explicit type
coercion x <- 2.5
#Obtain the class and type of the variable class(x)
as.integer(x) x
class(a)
typeof(a) x <- -1:5 x
class(str) class(x)
typeof(str) as.numeric
class(cmp) (x)
typeof(cmp) as.logical(x)
class(lg) as.characte
typeof(lg) r(x)
as.complex
#special number Inf representing infinity
(x)
1/0
1/Inf log(0) #find
natural log.
#Non-sensical coercion results in NAs
#you can represent base value as 2nd x <- c('a','b','c')
argument log(10,2) #base 2 log(10,10) x as.
#base 10 numeric(x)
as.logical(x)
#NaN ("Not a Number") represents an undefined value (is.na() also reports it as missing)
0/0 #vector
arithmetics x
<- c(1,3,5) y <-
c(2,4,6)
x+y #missing values x
x-y x*y x/y <-
help(options c(1,2,NA,5,NaN,6)
) ?options is.na(x) is.nan(x)
options(digit
# Data frame ----------------------------------------------
s=2)
-------------------rm(list=ls())
#recycling
# table with the same type within a column and different types between columns #
rule y <-
defined with a data.frame() function id=c(1,2,3) name=c("a","b","c") marks = c(50, 0, 25)
c(2,4,6,8,10)
sample_df=data.frame(id,name,marks) sample_df
x+y
my_df <- data.frame(id = c(1, 2, 3),
#create
name = c("Ramu","Raju","Ravi"),
matrices m <- marks = c(50, 40, 25))
matrix() my_df
m
#dimension of the data frame
m <- matrix(nrow=3,ncol=2) dim(my_df)
m
attributes(m) dim(m) m <- matrix() m <- #columns of the data frame
matrix(1:6,nrow=3,ncol=2) #constructed column-wise m names(my_df)
<- matrix(1:6,nrow=3,ncol=2,byrow = TRUE)
#constructed row-wise #structure of the data frame
m str(my_df)

#summary statistics of the data frame

#constructing from summary(my_df)
vector m <- 1:6
head(my_df) #top 6 rows in the data
dim(m) <- c(3,2) frame tail(my_df) #bottom 6 rows in the
m data frame

#constructing using ################ ADDING/Removing columns

column-binding x <- 1:3 x y # Ways to add a column
<- 10:12 y cbind(x,y)
my_df
#constructing using row- #initialize with 0
binding rbind(x,y) my_df$name
#matrix multiplication x <- my_df$perf <- 0
matrix(c(1,2,3,4),nrow=2,ncol=2) my_df
y <-
matrix(c(10,10,10,10),nrow=2,n my_df$perf <- c("very good","good","needs
col=2) x y x*y #does element- to improve") my_df
wise multiplication x%*%y
#can use [[]],[],[,] my_df[["perf"]] <-c("very
#does matrix multiplication
good","good","needs to improve")
#note: x/y does element-wise division and x%/%y does element-wise integer division; R has no matrix-division operator (multiply by solve(y) instead when y is invertible) my_df["perf"] <- c("very
good","good","needs to improve")
x t(x) #transpose of a my_df[,"perf"] <- c("very
matrix solve(x) #inverse good","good","needs to improve") my_df[5]
of a matrix det(x) # <- 0 my_df
determinant of a matrix

#creating a List x <- # Ways to remove the column

list(1,'a',TRUE,1+3i,6.7,c(10,20,
my_df[5] <- NULL
30)) x
my_df$V5 <- NULL
my_df my_df$perf
#factors x <- <- NULL
factor(c("male","female")) x x <- my_df[["perf"]] <-
factor(c("low","medium","high", NULL
"low")) table(x) unclass(x) my_df["perf"] <-
NULL my_df[5] <- boolv <-
NULL my_df$V5 <- stu_temp["mark2"]>10
NULL #subsetting boolv
row.names(stu_temp)[bo
df1 <- subset(my_df, olv]
select=c(id,marks)) df1 df1 <- #---------------------------------------------------------------
-----
subset(my_df, select=-marks)
library(help=datas
df1 View(df1) ets)

my_df$mark2 <- data(mtcars) # Loading mtcars

c(30,20,10) my_df data set cars <-mtcars # Save the
data into workspace
#sum of all marks
sum(my_df$mark2) # Viewing data set mtcars
# Total data set in console
#rowsum View(mtcars) # Viewing dataset in
my_df$total <- spreadsheet
rowSums(my_df[c(3,5)]) my_df
head(mtcars) # Viewing top-6 observations (default:
#max
top-6) tail(mtcars) # Viewing bottom 6
max(my_df$total)
observations str(mtcars) # Viewing data
#index at which max value is present
which.max(my_df$total) dictionary names(mtcars) # Viewing column
names v1 <- mtcars$mpg # Assigning single variable
#name of the student who got the from mtcars data to v1 v2 <- mtcars$cyl v3 <- mtcars$disp
max mark v4 <- mtcars$hp newvar <- mtcars$disp + mtcars$hp
my_df[["name"]][which.max(my_d
f$total)] mtcars1<-rbind(v1,v2,v3,v4) # Combined as rows
my_df$name[which.max(my_df$to #Horizontal joins mtcars1 mtcars2<-
tal)] cbind(v1,v2,v3,v4) # Combined as columns # Vertical
my_df[which.max(my_df$total),2] joins mtcars2

my_df my_df <- #create a variable obs_subset and have rows 4

rbind(my_df,data.frame(id=4,name="avgscore",marks=mean(my_df$marks),perf="meanperf",mark2=mean(my_df$mark to 10 in mtcars obs_subset <- mtcars[4:10,]
2),total=mean(my_df$total))) getwd() obs_subset

write.csv(my_df,"marks1.csv") #create a variable var_subset and have only the

write.csv(my_df,"marks.csv",row.names = FALSE) columns 1,5,9 var_subset <- mtcars[,c(1,5,9)]
var_subset
#R datasets
#subsetting
stu_marks <- read.csv("marks.csv") #create a variable subset1 and have only mpg and cyl variables of mtcars
str(stu_marks) #using indexing subset1 <-
mtcars[,c(1,2)] head(subset1,3)
stu_marks
#using subset() subset2 <-
stu_marks$mark2
subset(mtcars,select=c(mpg,cyl))
stu_marks[4]
subset2
stu_marks[3,3]
stu_marks[3,5] #create a variable subset3 and have only the rows
stu_marks where mpg>18 subset3 <- subset(mtcars,mpg>18)
stu_marks[c(1,3),c( subset3
2,5)]
#create a variable subset4 and have only the rows where
stu_temp <- stu_marks[c(-2,-4)] mpg>18 and cyl>5 subset4 <- subset(mtcars, mpg>18
stu_temp &cyl>5) subset4

row.names(stu_temp) #exclude mpg and cyl columns subset4 <-

row.names(stu_temp) <- subset(mtcars, mpg>18 &cyl>5, select=c(-mpg,-
stu_marks$name stu_temp cyl)) subset4
#install.packages("M #tbl_df(cars)
ASS") library(MASS) as_tibble(cars)
data("survey")
glimpse(cars)
#clear workspace
rm(list=ls()) ############ Subsetting Rows (Observations) #################

#filtering based on single

loan <- read.csv("loans data.csv") loan <- condition filter(cars, mpg>25)
read.table("loans data.csv",header = TRUE,sep
= ",") #filtering based on multiple
condition filter(cars, mpg>25&
dim(loan) hp >90)
str(loan)
head(loan,3) #Remove duplicate rows
tail(loan,2) distinct(cars)
summary(loan)
any(is.na(loan)) #Randomly select fraction of rows
sum(is.na(loan)) sample_frac(cars,0.2)

loan_cln <- #Randomly select no. of rows

na.omit(loan) sample_n(cars,5)
nrow(loan)
nrow(loan_cln) #selecting rows by position
slice(cars,11:15)
loan_cln2 <-
loan[complete.cases(loan),] slice_sample(cars,n=5)
nrow(loan_cln2)
temp <-
#Loading the dplyr package
library(dplyr) filter(cars,mpg>25)

#loading slice_sample(temp,n
data
data("mtca =2)
rs") cars <-
mtcars cars %>%
filter(mpg>25) %>%
#dimension of the data
slice_sample(n=2)
dim(cars)
#unique values in a column
#structure of the data unique(cars$cyl)
str(cars)
#no. of values under each unique category
#is.na(cars) #NA or NaN table(cars$cyl)
#checking for missing
#grouping
values any(is.na(cars))
cars %>%
sum(is.na(cars))
group_by(cyl)%
#################### Viewing data ########################
>%
#fetching top 6 rows slice_sample(n
head(cars) =2)

#fetching last 6 rows

############ Subsetting Columns (variables) #################
tail(cars)
#selecting single column
#viewing data
dplyr::select(cars,mpg)
View(cars)
cars %>%
#summary
dplyr::select(mpg)%>%
summary(cars) head(3)
#slice_sample(n=3)
cars
#selecting multiple columns dplyr::select(mpg,disp,hp,newv
dplyr::select(cars,mpg,cyl,gear) ar2)%>% sample_n(2)
dplyr::select(cars,c("mpg","cyl","ge
ar")) ############ summarizing data
names(cars) #################
#select all columns between a range of columns #group_by is typically used together with summarise; it is applied to a categorical variable
(inclusive) dplyr::select(cars,hp:am) cars %>% group_by(cyl) %>%
summarize(cnt=n()) #count of unique
cyl values
#combining filter and select- using pipe operator table(cars$cyl)
cars %>%
filter(mpg>18)%>% #computing max, min and standard dev cars %>% group_by(cyl) %>%
dplyr::select(mpg,cyl)%>% summarize(mx_mpg=max(mpg),mi_mpg=min(mpg),std_mpg=sd(mpg),mn=mean(
head(3) mpg),md=median(mpg))

names(cars) #clear workspace

#selecting columns starting with 'd' rm(list=ls())
dplyr::select(cars,starts_with('d'))
loan <- read.csv("loans data.csv") loan <-
#selecting columns ending with 't'
read.table("loans data.csv",header = TRUE,sep
dplyr::select(cars,ends_with('t'))
= ",")
#selecting columns
dim(loan)
containing 'g'
str(loan)
dplyr::select(cars,contains('g
head(loan,3)
')) #selecting columns
matching regular expression tail(loan,2)
dplyr::select(cars,matches('.. summary(loan)
a.')) any(is.na(loan))
sum(is.na(loan))
#Excluding certain columns
select(cars,c(-mpg,-cyl)) loan_cln <-
na.omit(loan)
############ Arranging data nrow(loan)
################# nrow(loan_cln)

#arrange the data in ascending order of mpg loan_cln2 <-

arrange(cars,mpg) loan[complete.cases(loan),]
nrow(loan_cln2)
#arrange the data in descending order of mpg
arrange(cars,desc(mpg)) #Loading the dplyr package
library(dplyr)
#arrange the data in order based on more than
one column arrange(cars, mpg,disp) #loading
data
arrange(cars,mpg,desc(disp)) data("mtca
rs") cars <-
############ Making new variables ################# mtcars

#creating a new column #dimension of the data

mutate(cars,newvar=disp-hp ) dim(cars)

#combining functions
#create a new variable that sums up disp and hp #structure of the data
and filter only str(cars)
#the rows where mpg>25 & #is.na(cars) #NA or NaN
disp>90 #and select only mpg, #checking for missing values
disp, hp, newvar cars %>%
mutate(newvar2=disp+hp)%>% any(is.na(cars))
filter(mpg>25,disp>90)%>% sum(is.na(cars))
#################### Viewing data ########################
#fetching top 6 rows slice_sample(n
head(cars) =2)

#fetching last 6 rows

tail(cars) ############ Subsetting Columns (variables) #################

#viewing data #selecting single column

View(cars) dplyr::select(cars,mpg)

#summary cars %>%

summary(cars) dplyr::select(mpg)%>%
head(3)
cars #slice_sample(n=3)

#tbl_df(cars) #selecting multiple columns

as_tibble(cars) dplyr::select(cars,mpg,cyl,gear)
dplyr::select(cars,c("mpg","cyl","ge
glimpse(cars) ar"))
############ Subsetting Rows (Observations) ################# names(cars)
#filtering based on single #select all columns between a range of columns
condition filter(cars, mpg>25) (inclusive) dplyr::select(cars,hp:am)

#filtering based on multiple

condition filter(cars, mpg>25& #combining filter and select- using pipe operator
hp >90)
cars %>%
#Remove duplicate rows filter(mpg>18)%>%
distinct(cars) dplyr::select(mpg,cyl)%>%
head(3)
#Randomly select fraction of rows names(cars)
sample_frac(cars,0.2)
#selecting columns starting with 'd'
#Randomly select no. of rows dplyr::select(cars,starts_with('d'))
sample_n(cars,5)
#selecting columns ending with 't'
#selecting rows by position dplyr::select(cars,ends_with('t'))
slice(cars,11:15)
#selecting columns containing 'g'
slice_sample(cars,n=5) dplyr::select(cars,contains('g'))

#selecting columns matching regular expression

temp <-
dplyr::select(cars,matches('..a.'))
filter(cars,mpg>25) #Excluding certain columns
select(cars,c(-mpg,-cyl))
slice_sample(temp,n
############ Arranging data
=2) #################

cars %>% #arrange the data in ascending order of mpg

filter(mpg>25) %>% arrange(cars,mpg)
slice_sample(n=2) #arrange the data in descending order of mpg
#unique values in a column arrange(cars,desc(mpg))
unique(cars$cyl) #arrange the data in order based on more than
#no. of values under each unique category one column arrange(cars, mpg,disp)
table(cars$cyl) arrange(cars,mpg,desc(disp))
#grouping
cars %>% ############ Making new variables #################
group_by(cyl)%
>%
#creating a new column M1 <- A1[,,1]
mutate(cars,newvar=disp-hp ) M2 <- A1[,,2]
M3 <- M1+M2
#combining functions
M3
#create a new variable that sums up disp and hp
and filter only M1
#the rows where mpg>25 & #Aggregation on array
disp>90 #and select only mpg, elements apply(M1,1,sum)
disp, hp, newvar cars %>% #1- along row
mutate(newvar2=disp+hp)%>% apply(M2,2,sum) #2 -along
filter(mpg>25,disp>90)%>% column A1
dplyr::select(mpg,disp,hp,newv apply(A1,1,sum)
ar2)%>% sample_n(2) apply(A1,2,mean)

rm(list=ls())
############ summarizing data
################# #To create date / To
#group_by is typically used together with summarise; it is applied to a categorical variable represent date d <- date()
cars %>% group_by(cyl) %>% d class(d)
summarize(cnt=n()) #count of unique
#as.Date(d)
cyl values
#to convert date string to date class d <-
table(cars$cyl) as.Date("2022-8-25") #default format -year-
month-day class(d)
#computing max, min and standard dev cars %>% group_by(cyl) %>% d as.Date("2022-8-25
summarize(mx_mpg=max(mpg),mi_mpg=min(mpg),std_mpg=sd(mpg),mn=mean( 10:44:22")
mpg),md=median(mpg)) as.Date("2022-8-25
21:15")
rm(list=ls())

#creating array from #to see the internal representation

vectors v1 <- c(1,2,3) unclass(d)
v2 <- c(4,5,6,7,8,9)
#to represent both date and
A1 <- array(c(v1,v2),dim = c(3,3,2)) time as.POSIXct("2022-8-25")
A1 pd <- as.POSIXct("2022-8-25
21:15") pd
#naming columns and rows class(pd)
rname <- c("r1","r2","r3") cname unclass(pd)
<- c("c1","c2","c3") mname <-
pd <- as.POSIXlt("2022-8-
c("mat1","mat2") dimnames(A1)
25") pd
<- list(rname,cname,mname)
class(pd)
A1 <- array(c(v1,v2),dim = c(3,3,2),dimnames = list(rname,cname,mname))
A1 #getting meta using
unclass() unclass(pd)
#printing the second row of second matrix names(unclass(pd))
A1[2,,2]
pd <- as.POSIXlt("2022-8-17
A1["r2",,"mat2"]
21:15:30") pd$sec pd$hour
#printing the second column of first matrix
pd$min pd$mday pd$year
A1[,2,1] unlist(pd)
A1[,"c2","mat1"] #if format is different
#printing the element in the 2nd row and 3rd column of second matrix as.Date("25/8/2022",format="%d/%
A1[2,3,2] m/%Y") date() as.Date("August
25,2022",format="%B %d,%Y")
as.Date("25Aug22",format="%d%b%
#printing the second matrix y")
A1[,,2]
A1[,,"mat2"] #Checking the class
class(as.Date("2022-8-25 21:15"))
#Manipulating array elements class(as.POSIXct("2022-8-25
21:15")) class(as.POSIXlt("2022-8- # dir.create("data")
25 21:15"))
#fileurl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
#download.file(fileurl,destfile = "E:/sweetlin-personal/coursera/data/camera.csv")
#Getting date, time and #list.files("E:/sweetlin-personal/coursera/data")
zone p <- Sys.Date() #only
current date class(p) #dateofdownload <- date()
Sys.time() #current date, time and timezone #dateofdownload
Sys.timezone() rm(list=ls())
#Reading flat file using read.table() loan <-
#difference in dates Sys.Date()-
read.table("loans data.csv",header = TRUE,sep
as.Date("1979-03-21")
= ",") str(loan) head(loan,2)
difftime(Sys.Date(),as.Date("1979-03-
21"),units = "weeks") #getting weekdays and #Reading flat file using read.csv() loan1 <-
basic arithmetic d <- as.Date("2022-8-17") d
#to find weekday of the date read.csv("loans data.csv") str(loan1) df <-
weekdays(d) read.table("tabsepfile.txt",header =

#add or subtract to create new date(s) FALSE,sep = "\t") str(df)

d+1 d+1:5
weekdays(d+ df <- read.table("slashsepfile.txt",header = FALSE,sep="/",strip.white = TRUE,na.strings = "EMPTY")
1:5) str(df)

#check for seq #Reading Excel file

and rep #using #you need to import xlsx package
sequence d #install.packages("xlsx")
dt <- seq(d,by="2 #library(xlsx)
months",length.out = 6) dt #loan <- read.xlsx("loan.xls",sheetIndex=1,
header=TRUE)
#getting month and
quarter months(d) #install.packages("XLConnect")
months(dt) #library(XLConnect)
quarters(dt)
library(readxl)
#lubridate::today() #lubridate package #excel_sheets('E:/sweetlin-official/FALL 2020 -2021/CSE3505/R
#ISOdate(2021,8,25) programs/loans data.xlsx') excel_sheets("loans data.xlsx") df <-
read_excel("loans data.xlsx",sheet="sample")
?strptime
help("strptime") str(df)

datestring<-"August 17, 2022 04:20"

convertedForm<- #XLConnect, XLSX, readxl
strptime(datestring,"%B %d, %Y %H:%M")
class(convertedForm) convertedForm #Reading XML file
#You need to install XML
x <- as.Date("2020-01-01") y <- strptime("25 Aug package
2020 09:00:00", "%d %b %Y %H:%M:%S") x-y install.packages("XML")
class(x) library(XML)
#library(methods)
class(y)
#install.packages("RCurl") #library(RCurl)
x <- as.POSIXlt(x) library(httr) fileurl <-
x-y "https://www.w3schools.com/xml/simpl
e.xml" xmldata <- GET(fileurl) doc <-
#different time zones x <- xmlTreeParse(xmldata,useInternal=TRUE)
as.POSIXct("2021-08-25
08:00:00") x root <-
xmlRoot(doc)
xgmt<-as.POSIXct("2021-08-25 08:00:00", root
tz="GMT") xgmt xmlName(root)
names(root)
xgmt-x
#Accessing parts of xml file in the same
#if(!file.exists("data")) way as list root[[1]] #accessing 1st
food root[[1]][[1]] #accessing name of #view of the data in a table
the 1st food View(loan)

#Extracting parts of XML file- value of all nodes

xmlSApply(root,xmlValue) #fetching top 6 rows
head(loan)
root <- xmlSApply(root,function(x) xmlSApply(x,xmlValue))
#fetching last 6 rows
root tail(loan)
#Extracting individual nodes of #summary of the data
XML file summary(loan)
xpathSApply(root,"//name",xmlVal
ue) ############ Cleaning data
xpathSApply(root,"//price",xmlVal #################
ue) #checking for missing values in the data
any(is.na(loan)) #NA NaN
xml_df <- data.frame(t(root),row.names =
NULL) str(xml_df)
#checking for the total no. of missing values in
the data sum(is.na(loan))
#Reading JSON file #cleaning NA values
#Loading jsonlite loan_clean <-
package na.omit(loan)
library(jsonlite) jdata <-
fromJSON("https://api.github.com/users/jtleek sum(is.na(loan_clean)) str(loan_clean)
/repos") names(jdata) loan_clean1 <- loan[complete.cases(loan),]
class(jdata) str(jdata) #boolean indexing sum(is.na(loan_clean1))
head(jdata,2)
#Extracting nested #imputation - filling the missing values
objects #cleaning Amount.Requested Column
names(jdata$owner) #checking for the total no. of missing values in a particular column
jdata$owner$login
sum(is.na(loan$Amount.Requested)
)

unique(loan$Amount.Requested)
#writing to json file
data(iris) str(iris) #changing to numeric types
head(iris,2) jfile <- loan$Amount.Requested <-
toJSON(iris,pretty = TRUE) as.integer(loan$Amount.Requested)
cat(jfile) str(loan)

#reading json file #unique values in a column

irisdata <- unique(loan$Amount.Requested)
fromJSON(jfile)
mean(loan$Amount.Requested,na.rm = TRUE)
head(irisdata)
median(loan$Amount.Requested,na.rm = TRUE)
#clear workspace
#library(dplyr)
rm(list=ls())
library(tidyverse)
############ Reading data #Decide whether to impute with mean or median loan %>%
################# summarize(avg=mean(Amount.Requested,na.rm =
#using read.table() loan_data <- read.table("loans TRUE),med=median(Amount.Requested,na.rm = TRUE))
data.csv",header = TRUE,sep = ",") loan <- loan <- loan %>%
read.csv("loans data.csv")
mutate(Amount.Requested=replace(Amount.Requested,is.na(Amount.Requested),median(Amount.Req
#dimension of the data uested,na.rm = TRUE)))
dim(loan)
sum(is.na(loan$Amount.Requested)
#structure of the data )
str(loan)
#Rename a column loan <- loan%>%
loan %>% filter(is.na(Loan.Length))
rename(Amt_Req=Amount.Re
quested) names(loan) #drop the rows with
str(loan) NA values loan <-
#cleaning Amount.Funded.By.Investors column loan%>%
sum(is.na(loan$Amount.Funded.By.Investors)) drop_na(Loan.Length)

unique(loan$Amount.Funded.By.In #checking
vestors) str(loan) sum(is.na(loan$Loan.Length))
loan <- loan%>%
unique(loan$Loan.Length)
rename(Amt_fund=Amount.Funded.By.Investors)
#convert the type to numeric
loan$Amt_fund <- #cleaning Employment.Length column
as.numeric(loan$Amt_fund) sum(is.na(loan$Employment.Length))

#checking for NA values unique(loan$Employment.Length)

sum(is.na(loan$Amt_fund))
loan <- loan %>% mutate(Employment.Length=gsub(" year|
#check impute with mean or years|< |\\+","",Employment.Length))
median loan%>%
loan$Employment.Length <- as.integer(loan$Employment.Length)
summarize(avg=mean(loan$Amt_f
und,na.rm = #checking
TRUE),md=median(loan$Amt_fund, unique(loan$Employment.Length)
na.rm = TRUE)) sum(is.na(loan$Employment.Len
gth))
loan <- loan%>%
mutate(Amt_fund=replace(Amt_fund,is.na(Amt_fund),median(Amt_fund, table(loan$Employment.Length)
na.rm = TRUE))) mean(table(loan$Employment.Len
gth))
sum(is.na(loan$Amt_fund))
loan <- loan%>%
str(loan)
mutate(Employment.Length=replace(Employment.Length,is.na(Employment.Len
#cleaning Interest.Rate column
gth),2))
sum(is.na(loan$Interest.Rate))
#checking
#cleaning unwanted substring in a chr
sum(is.na(loan$Employment.Lengt
column loan <- loan %>%
h))
mutate(Interest.Rate=gsub("%","",Int
unique(loan$Employment.Length)
erest.Rate))
head(loan$Interest.Rate,2) #cleaning FICO.Range column
head(loan$FICO.Range,2) loan <-
loan$Interest.Rate <- as.numeric(loan$Interest.Rate)
loan %>%
head(loan$Interest.Rate,2)
separate(FICO.Range,c("fico-
loan$Interest.Rate <- low","fico-high")) str(loan)
as.numeric(loan$Interest.Rate) str(loan) names(loan)

loan$`fico-high` <-
#cleaning Loan.Length column as.integer(loan$`fico-high`)
sum(is.na(loan$Loan.Length)) loan$`fico-low` <-
as.integer(loan$`fico-low`) str(loan)
unique(loan$Loan.Length)
sum(is.na(loan$`fico-high`))
loan <- loan %>% sum(is.na(loan$`fico-low`))
mutate(Loan.Length=gsub(" unique(loan$`fico-high`)
months","",Loan.Length)) unique(loan$`fico-low`)
#statistical analysis - Numerical measure
loan$Loan.Length <- as.integer(loan$Loan.Length) str(faithful) #faithful - built-in data
head(faithful)
sum(is.na(loan$Loan.Length))
#Central tendency measure
unique(loan$Loan.Length)
mean(faithful$eruptions)
#filtering the rows with NA values #median
median(faithful$eruptions) #relative frequency relfreq
<-
#Measure of dispersion Interval_freq/nrow(faithful)
range(faithful$eruptions) old=options(digits = 2)
max(faithful$eruptions)- cbind(Interval_freq,relfreq)
min(faithful$eruptions)
#quartile #cumulative frequency
quantile(faithful$eruption cumfreq <-
s) cumsum(table(interval))
cumfreq cbind(cumfreq)
#Inter-quartile range
IQR(faithful$eruptions) rm(list=ls())
library(help=graphi
#percentile cs)
quantile(faithful$eruptions,c(.27,.3 data("airquality")
5,.65)) str(airquality)
#variance #to set the margin
var(faithful$eruptions) par(mar=c(2,2,2,2))
#standard deviation #1D scatter plot
sd(faithful$eruptions) plot(airquality$Ozone)
#covariance #2D scatter plot
cov(faithful$eruptions,faithful$wai
ting) plot(airquality$Ozone,airquality$W
ind)
#correlation
cor(faithful$eruptions,faithful$wai ?plot
ting)
#type argument in plot
#moment -third central moment plot(airquality$Ozone,type="l")
# the second central moment of a population
#title and axis labels arguments
is its variance library(e1071)
plot(airquality$Ozone,main = "ozone levels",xlab =
moment(faithful$eruptions,3, center = TRUE)
"index",ylab = "ozone")
#skewness
skewness(faithful$eruptions)
#histogram
#kurtosis hist(airquality$Solar.R)
kurtosis(faithful$eruptions)
#boxplot
#frequency summary(airquality$Ozone)
distributio #step1 - boxplot(airquality$Ozone)
find range
range(faithful$erupti #multiple boxplot
ons) boxplot(airquality[,1:4],main="multiple
box plots")
#step2 - Break the range into non-overlapping sub-intervals by defining a sequence of
equal distance break points. breaks <- seq(1.5,5.5,by=0.5) breaks
#pie chart
#step3- Classify the eruption durations according to the half-unit-length sub- unique(airquality$Wind)
intervals with cut. interval <- cut(faithful$eruptions,breaks,right=FALSE) table(airquality$Wind)

#step 4 - Compute the frequency of eruptions in each sub-interval with the table function. wind_freq <- table(airquality$Wind)
Interval_freq = wind_above8 <- wind_freq>8
table(interval) wind_freq wind_above8
Interval_freq wind_above8data <-
cbind(Interval_freq) wind_freq[wind_above8]
wind_above8data
table(wind_above8)
pie(wind_above8data,radius=1) #scatter plot - multiple variables through both color and shape
par(mar=c(1,1,1,1)) ggplot(mtcars,aes(x=wt,y=mpg,shape=gear_factor))+geom_point(aes(color=cyl_factor,size=4))+geom_point(color="grey",
size=1.5)
#grid of charts
par(mfrow=c(2,3),mar=c(2,2,2,1),las=0, bty="n") ggplot(mtcars,aes(x=wt,y=mpg,shape=gear_factor))+geom_point(aes(color=cyl_factor,size=4))+geom_point(color="grey",
plot(airquality$Ozone) size=1.5)
plot(airquality$Ozone,airquality$Wind)
plot(airquality$Ozone,type ='l') #scatter plot- adding best fit line ggplot(mtcars,
barplot(airquality$Ozone, main = 'Ozone levels', ylab aes(x=wt,y=mpg))+geom_point()+geom_smooth(method="
= 'ozone value') hist(airquality$Solar.R) lm")
boxplot(airquality$Ozone)
###########bar plot ########### ggplot(mtcars,
aes(x=gear_factor))+geom_bar() ggplot(mtcars,
#lattice graph aes(x=gear_factor,fill=gear_factor,color="red"))+geom_bar() +ggtitle("frquency
library(lattice) plot of gear")
#density plot #flipping the bar direction ggplot(mtcars,
densityplot(airquality$Ozone) aes(x=gear_factor))+geom_bar()+coord_flip()
#scatter plot matrix #bar plot for 2 variables ggplot(mtcars,
splom(airquality[c(1,3,4)]) aes(x=cyl_factor,fill=gear_factor))+geom_bar(position='sta
ck')
#scatter plot depicting the combination
#################### pie chart ############ ggplot(mtcars,
of 2 variables data("mtcars") df <-
aes(x="",y=mpg,fill=cyl_factor))+geom_bar(width =
mtcars
1,stat='identity')+coord_polar("y",start = 0)
str(df)
par(mar=c(4,4.5,1, #################### histogram ###########
1)) ggplot(mtcars,aes(x=hp))+geom_histogram()+labs(title = "Distribution of
plot(df$wt,df$mpg) hp",y='frequency')

unique(df$cyl) cyl_factor <- factor(df$cyl,levels = #setting bin size ggplot(mtcars,aes(x=hp))+geom_histogram(bins =

c(4,6,8),labels = c("4cyl","6cyl","8cyl")) 3)+labs(title = "Distribution of hp",y='frequency')

unique(df$gear) gear_factor <- factor(df$gear,levels = #setting bin width ggplot(mtcars,aes(x=hp))+geom_histogram(binwidth =

c(3,4,5),labels = c("3 gears","4 gears", "5 gears")) 30)+labs(title = "Distribution of hp",y='frequency')

xyplot(df$mpg~df$wt|cyl_factor*gear_factor,main="scatter plots: Cylinders and Gears",xlab = "weight of #with border and fill color ggplot(mtcars,aes(x=hp))+geom_histogram(binwidth =
car",ylab = "miles per gallon") gear_factor 30,color='green',fill='yellow')+labs(title = "Distribution of hp",y='frequency')

freq_gear <- table(gear_factor) freq_gear #facets ggplot(mtcars,aes(x=hp))+geom_histogram(color="white",fill="blue")+labs(title = "Distribution of

barplot(freq_gear,col=c("red","green","blue")) hp",y='frequency')+facet_wrap(cyl_factor,ncol=1)
pie(freq_gear,labels=c("3 gears","4 gears", "5
gears"),col=c("red","green","blue"),radius=1)
################ Kernel density curve ############ ggplot(mtcars,
rm(list=ls()) aes(x=hp))+geom_density()+labs(title="Distribution of hp",x="horse
data("mtcars") power",y='density')

#install.packages("ggplot2") library(ggplot2) #with fill color ggplot(mtcars,

head(mtcars,2) #scatter plot ggplot(data=mtcars, aes(x=hp))+geom_density(fill='blue',color='red')+labs(title="Distribution of
mapping=aes(x=wt,y=mpg))+geom_point() hp",x="horse power",y='density')
unique(mtcars$cyl) cyl_factor <-
############## Line plot ###############
factor(mtcars$cyl,levels = c(4,6,8),labels =
library(dplyr) d <-
c("4cyl","6cyl","8cyl"))
sample_n(mtcars,10)
unique(mtcars$gear) gear_factor <- factor(mtcars$gear,levels = ggplot(d,aes(x=wt,y=drat))+ge
c(3,4,5),labels = c("3 gears","4 gears", "5 gears")) om_line() d

#scatter plot - multiple variables through color #with varied thickness and color points
ggplot(mtcars,aes(x=wt,y=mpg,color=gear_factor))+geom_point() ggplot(d,aes(x=wt,y=drat))+geom_line(aes(size=2,color='red'))+geom_point(aes(size=2,color='blue'))

#scatter plot - multiple variables through size ################# box plot

ggplot(mtcars,aes(x=wt,y=mpg,size=qsec))+geo ################ ggplot(mtcars,
m_point() aes(x=mpg))+geom_boxplot()
#multiple box plots ggplot(mtcars,
aes(x=cyl_factor,y=mpg))+geom_boxplot()

Introduction To NFL Analytics With R (Bradley J. Congelio) (Z-Library)
No ratings yet
Introduction To NFL Analytics With R (Bradley J. Congelio) (Z-Library)
383 pages
R Graphics Essentials For Great Data Visualization 9781979748100 C
No ratings yet
R Graphics Essentials For Great Data Visualization 9781979748100 C
257 pages
Alese Wooditch - Nicole J. Johnson - Reka Solymosi - Juanjo Medina Ariza - Samuel Langton - A Beginner's Guide To Statistics For Criminology and Criminal Justice Using R-Springer Nature (2021)
No ratings yet
Alese Wooditch - Nicole J. Johnson - Reka Solymosi - Juanjo Medina Ariza - Samuel Langton - A Beginner's Guide To Statistics For Criminology and Criminal Justice Using R-Springer Nature (2021)
446 pages
Dplyr - Grammar of Data Manipulation
No ratings yet
Dplyr - Grammar of Data Manipulation
3 pages
R-Programming Record - Odd Sem 21-22
No ratings yet
R-Programming Record - Odd Sem 21-22
35 pages
Tidyverse - Tidyr and Dplyr
No ratings yet
Tidyverse - Tidyr and Dplyr
33 pages
R Programming Cheat Sheet
No ratings yet
R Programming Cheat Sheet
15 pages
205 R Prog MCQ
100% (1)
205 R Prog MCQ
48 pages
CH 03
No ratings yet
CH 03
42 pages
R Programming Cheatsheet
100% (2)
R Programming Cheatsheet
6 pages
Data Transformation With Dplyr - Cheatsheet
100% (1)
Data Transformation With Dplyr - Cheatsheet
2 pages
R For Machine Learning Lab Practical Work: Master of Business Administration in Business Analytics
0% (1)
R For Machine Learning Lab Practical Work: Master of Business Administration in Business Analytics
9 pages
Exploratory Data Analysis With R
No ratings yet
Exploratory Data Analysis With R
218 pages
Eda Lab Manual
No ratings yet
Eda Lab Manual
69 pages
R-Web-Appendix of Foundations of Statistics For Data Scientists
No ratings yet
R-Web-Appendix of Foundations of Statistics For Data Scientists
122 pages
R Programming Materials
No ratings yet
R Programming Materials
51 pages
Introduction To R
No ratings yet
Introduction To R
91 pages
List Matrix
No ratings yet
List Matrix
41 pages
Unit 4
No ratings yet
Unit 4
27 pages
Arunav Da Prac
No ratings yet
Arunav Da Prac
55 pages
Lab4 Instructions
No ratings yet
Lab4 Instructions
52 pages
Armillia Karenna - TP060327 - Pfda
No ratings yet
Armillia Karenna - TP060327 - Pfda
65 pages
Introduction To R
No ratings yet
Introduction To R
74 pages
Basic R Tutorial
No ratings yet
Basic R Tutorial
56 pages
DV Lab
No ratings yet
DV Lab
52 pages
R Program
No ratings yet
R Program
22 pages
R File Code
No ratings yet
R File Code
16 pages
DSR LAB MANUAL - 10 Programs
No ratings yet
DSR LAB MANUAL - 10 Programs
34 pages
KrutikaKolhe 862467252 HW5
No ratings yet
KrutikaKolhe 862467252 HW5
18 pages
Basic R Programming
No ratings yet
Basic R Programming
37 pages
Data Transformation 1 Reviewed
No ratings yet
Data Transformation 1 Reviewed
43 pages
Lab 02 - Compound Data Structures
No ratings yet
Lab 02 - Compound Data Structures
12 pages
RSTUDIO
No ratings yet
RSTUDIO
44 pages
R-Script 2
No ratings yet
R-Script 2
10 pages
My First Script.r
No ratings yet
My First Script.r
32 pages
R Tutorial #1: Applied Econometrics (Econ3005)
No ratings yet
R Tutorial #1: Applied Econometrics (Econ3005)
21 pages
Tutorial 1
No ratings yet
Tutorial 1
29 pages
A Short List of Some Useful R Commands: Input and Display
No ratings yet
A Short List of Some Useful R Commands: Input and Display
2 pages
Practical 1 - Basics of R
No ratings yet
Practical 1 - Basics of R
8 pages
Unit - 3 Learning Notes
No ratings yet
Unit - 3 Learning Notes
8 pages
Base R
No ratings yet
Base R
9 pages
R Programs 2024-2025
No ratings yet
R Programs 2024-2025
13 pages
R
No ratings yet
R
13 pages
Matrix, Dataframes, List
No ratings yet
Matrix, Dataframes, List
8 pages
Parsing Dates With Lubridate: Charlo e Wickham
No ratings yet
Parsing Dates With Lubridate: Charlo e Wickham
23 pages
Unit - 2: Data Manipulation With R & Data Visualization in Watson Studio
No ratings yet
Unit - 2: Data Manipulation With R & Data Visualization in Watson Studio
58 pages
Codes - Part 1
No ratings yet
Codes - Part 1
7 pages
R Study Material I
No ratings yet
R Study Material I
8 pages
Intro To Data Science Lecture 4
No ratings yet
Intro To Data Science Lecture 4
13 pages
Statistic and R Programming Lab Exercise
No ratings yet
Statistic and R Programming Lab Exercise
8 pages
Econ6067 R (I) 2022
No ratings yet
Econ6067 R (I) 2022
22 pages
R Reference Card
No ratings yet
R Reference Card
6 pages
Mini Project: Nutrition Calculator Calculate Nutrition For Recipes
No ratings yet
Mini Project: Nutrition Calculator Calculate Nutrition For Recipes
20 pages
R
No ratings yet
R
15 pages
DMPA Codes
No ratings yet
DMPA Codes
16 pages
Siv2010 Mathematics in Biology: Revision (Quiz 1) - R
No ratings yet
Siv2010 Mathematics in Biology: Revision (Quiz 1) - R
17 pages
X - 15 x-1 2. Print ('Hello Word!') ## (1) "Hello Word!" 3. X - 4 y - 5 Z - X+y Print (Z) 4. X - 4 y - 5 Cat ('The Sum of X and y Is', X+y)
No ratings yet
X - 15 x-1 2. Print ('Hello Word!') ## (1) "Hello Word!" 3. X - 4 y - 5 Z - X+y Print (Z) 4. X - 4 y - 5 Cat ('The Sum of X and y Is', X+y)
15 pages
Introduction To R: Nihan Acar-Denizli, Pau Fonseca
No ratings yet
Introduction To R: Nihan Acar-Denizli, Pau Fonseca
50 pages
COMP2501 - Assignment - 1 - Questions - RMD 2
No ratings yet
COMP2501 - Assignment - 1 - Questions - RMD 2
7 pages
(Tutorial) The 10 Most Important Packages in R For Data Science - DataCamp
No ratings yet
(Tutorial) The 10 Most Important Packages in R For Data Science - DataCamp
8 pages
DS Lab
No ratings yet
DS Lab
31 pages
Data Manipulation in R
No ratings yet
Data Manipulation in R
5 pages
Copy Entire Document Content in R Studio
No ratings yet
Copy Entire Document Content in R Studio
17 pages
R Program Record Book Iba
No ratings yet
R Program Record Book Iba
24 pages
Practical Assignment-10 Mini Project Nutrition Calculator - Calculate Nutrition For Recipes
No ratings yet
Practical Assignment-10 Mini Project Nutrition Calculator - Calculate Nutrition For Recipes
16 pages
Dba Midterm Cheatsheet
No ratings yet
Dba Midterm Cheatsheet
2 pages
R - Tutorial: Matrices Are Vectors
No ratings yet
R - Tutorial: Matrices Are Vectors
13 pages
Lesson 7 - The Data Frame
No ratings yet
Lesson 7 - The Data Frame
7 pages
R Studio Notes
No ratings yet
R Studio Notes
10 pages
Apply Functions With Purrr::: Cheat Sheet
No ratings yet
Apply Functions With Purrr::: Cheat Sheet
2 pages
Cluster R
No ratings yet
Cluster R
1 page
Data Transformation With Dplyr Cheat Sheet
No ratings yet
Data Transformation With Dplyr Cheat Sheet
2 pages
UL2
No ratings yet
UL2
2 pages
Importing The Files
No ratings yet
Importing The Files
14 pages
R Functions
No ratings yet
R Functions
8 pages
Exercise-9..Study and Implementation of Data Visulization With Ggplot
No ratings yet
Exercise-9..Study and Implementation of Data Visulization With Ggplot
1 page
WWWWWW WWWWWW WWWWWW WWWWWW WWWW WWWW WWWWWW: Data Transformation With Dplyr
No ratings yet
WWWWWW WWWWWW WWWWWW WWWWWW WWWW WWWW WWWWWW: Data Transformation With Dplyr
2 pages
Data Visualisation L9+L10 Lab 1 R Basics: Printing Character
No ratings yet
Data Visualisation L9+L10 Lab 1 R Basics: Printing Character
9 pages
A Short List of The Most Useful R Commands
No ratings yet
A Short List of The Most Useful R Commands
11 pages
A Short List of The Most Useful R Commands
No ratings yet
A Short List of The Most Useful R Commands
8 pages
R Reference Card
No ratings yet
R Reference Card
6 pages
Basics: TH TH TH TH TH TH TH
No ratings yet
Basics: TH TH TH TH TH TH TH
3 pages
Simple Tutorial in R
No ratings yet
Simple Tutorial in R
15 pages
R Reference Card
No ratings yet
R Reference Card
1 page
R Programming Cheat Sheet: Ata Tructures
No ratings yet
R Programming Cheat Sheet: Ata Tructures
2 pages
Lisp Interpreter in Rust
From Everand
Lisp Interpreter in Rust
Vishal Patil
1/5 (1)
The Essential R Reference
From Everand
The Essential R Reference
Mark Gardener
No ratings yet
Introduction to PHP, Part 2, Second Edition
From Everand
Introduction to PHP, Part 2, Second Edition
Adam Majczak
No ratings yet
Calculus I Essentials
From Everand
Calculus I Essentials
Editors of REA
1/5 (1)
Computer Engineering Laboratory Solution Primer
From Everand
Computer Engineering Laboratory Solution Primer
Karan Bhandari
No ratings yet

R Studio

Uploaded by

R Studio

Uploaded by

rm(list=ls()) ##create vectors x <- 1:20

#always creates an integer vector

#using c() x <- c(0.1,0.2)

#complex type x <-

#summary statistics of the data frame

#constructing using ################ ADDING/Removing columns

#creating a List x <- # Ways to remove the column

my_df$mark2 <- data(mtcars) # Loading mtcars

my_df my_df <- #create a variable obs_subset and have rows 4

write.csv(my_df,"marks1.csv") #create a variable var_subset and have only the

row.names(stu_temp) #exclude mpg and cyl columns subset4 <-

#filtering based on single

loan_cln <- #Randomly select no. of rows

#fetching last 6 rows

names(cars) #clear workspace

#arrange the data in ascending order of mpg loan_cln2 <-

#creating a new column #dimension of the data

#fetching last 6 rows

#viewing data #selecting single column

#summary cars %>%

#tbl_df(cars) #selecting multiple columns

#filtering based on multiple

#selecting columns matching regular expression

cars %>% #arrange the data in ascending order of mpg

#creating array from #to see the internal representation

#add or subtract to create new date(s) FALSE,sep = "\t") str(df)

#check for seq #Reading Excel file

datestring<-"August 17, 2022 04:20"

#Extracting parts of XML file- value of all nodes

#reading json file #unique values in a column

#checking for NA values unique(loan$Employment.Length)

unique(df$cyl) cyl_factor <- factor(df$cyl,levels = #setting bin size ggplot(mtcars,aes(x=hp))+geom_histogram(bins =

unique(df$gear) gear_factor <- factor(df$gear,levels = #setting bin width ggplot(mtcars,aes(x=hp))+geom_histogram(binwidth =

freq_gear <- table(gear_factor) freq_gear #facets ggplot(mtcars,aes(x=hp))+geom_histogram(color="white",fill="blue")+labs(title = "Distribution of

#install.packages("ggplot2") library(ggplot2) #with fill color ggplot(mtcars,

#scatter plot - multiple variables through size ################# box plot

You might also like