0% found this document useful (0 votes)
60 views13 pages

R Studio

This document summarizes key R programming concepts including vectors, matrices, data frames, and lists. It shows how to create, manipulate, and perform operations on various data types in R. Functions like c(), vector(), matrix(), data.frame() are used to construct different data structures. Operations like subsetting, binding, arithmetic, and coercion between different types are demonstrated.

Uploaded by

N K
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
60 views13 pages

R Studio

This document summarizes key R programming concepts including vectors, matrices, data frames, and lists. It shows how to create, manipulate, and perform operations on various data types in R. Functions like c(), vector(), matrix(), data.frame() are used to construct different data structures. Operations like subsetting, binding, arithmetic, and coercion between different types are demonstrated.

Uploaded by

N K
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 13

rm(list=ls()) ##create vectors x <- 1:20

#always creates an integer vector


#Assign a variable with an x
integer value a <- 10L a #class and length of a
is.integer(a) #to check whether the value is vector class(x)
integer or not length(x)

#using c() x <- c(0.1,0.2)


#character type
##numeric vector x <-
str <- 'R
c(TRUE,FALSE) ##logical
programming'
vector x <- c(T,F) ##logical
str s <-
vector x <- c("A","B","C")
"cse3505 -"
##character vector x <-
s class(s)
c(1L,2L,15L,27L) ##integer
#some useful functions vector x x <- c(1+2i,3)
paste(s,str) sprintf("%s has ##complex vector
scored %d marks","Sita",90)
#using
substr(str,start=5,stop=10)
vector() x
sub("e","C",str) <- vector()
str x length(x)
print(str) class(x)

#complex type x <-


cmp <- 21+10i vector("character",length
sqrt(-1) sqrt(-1+0i) = 10) x
sqrt(as.complex(-1)) #explicit type #Implicit type coercion -
conversion mixed objects y <- c(1.5,"a")
#character y y <- c(1.5,TRUE)
#numeric y

#logical type lg
<- TRUE y <- c(TRUE,"a")
p=TRUE;q=FAL #character y
SE
p&q;p|q;!p #Explicit type
coercion x <- 2.5
#Obtain the class and type of the variable class(x)
as.integer(x) x
class(a)
typeof(a) x <- -1:5 x
class(str) class(x)
typeof(str) as.numeric
class(cmp) (x)
typeof(cmp) as.logical(x)
class(lg) as.characte
typeof(lg) r(x)
as.complex
#special number Inf representing infinity
(x)
1/0
1/Inf log(0) #find
natural log.
#Non-sensical coercion results in NAs
#you can represent base value as 2nd x <- c('a','b','c')
argument log(10,2) #base 2 log(10,10) x as.
#base 10 numeric(x)
as.logical(x)
#NaN represents an undefined value (also indicates a missing value)
0/0 #vector
arithmetics x
<- c(1,3,5) y <-
c(2,4,6)
x+y #missing values x
x-y x*y x/y <-
help(options c(1,2,NA,5,NaN,6)
) ?options is.na(x) is.nan(x)
options(digit
# Data frame ----------------------------------------------
s=2)
-------------------rm(list=ls())
#recycling
# table with the same type within a column and different types between columns #
rule y <-
defined with a data.frame() function id=c(1,2,3) name=c("a","b","c") marks = c(50, 0, 25)
c(2,4,6,8,10)
sample_df=data.frame(id,name,marks) sample_df
x+y
my_df <- data.frame(id = c(1, 2, 3),
#create
name = c("Ramu","Raju","Ravi"),
matrices m <- marks = c(50, 40, 25))
matrix() my_df
m
#dimension of the data frame
m <- matrix(nrow=3,ncol=2) dim(my_df)
m
attributes(m) dim(m) m <- matrix() m <- #columns of the data frame
matrix(1:6,nrow=3,ncol=2) #constructed column-wise m names(my_df)
<- matrix(1:6,nrow=3,ncol=2,byrow = TRUE)
#constructed row-wise #structure of the data frame
m str(my_df)

#summary statistics of the data frame


#constructing from summary(my_df)
vector m <- 1:6
head(my_df) #top 6 rows in the data
dim(m) <- c(3,2) frame tail(my_df) #bottom 6 rows in the
m data frame

#constructing using ################ ADDING/Removing columns


column-binding x <- 1:3 x y # Ways to add a column
<- 10:12 y cbind(x,y)
my_df
#constructing using row- #initialize with 0
binding rbind(x,y) my_df$name
#matrix multiplication x <- my_df$perf <- 0
matrix(c(1,2,3,4),nrow=2,ncol=2) my_df
y <-
matrix(c(10,10,10,10),nrow=2,n my_df$perf <- c("very good","good","needs
col=2) x y x*y #does element- to improve") my_df
wise multiplication x%*%y
#can use [[]],[],[,] my_df[["perf"]] <-c("very
#does matrix multiplication
good","good","needs to improve")
#note: x%/%y is element-wise integer division, not matrix division; to "divide" by an invertible matrix use x %*% solve(y). x/y does element-wise division my_df["perf"] <- c("very
good","good","needs to improve")
x t(x) #transpose of a my_df[,"perf"] <- c("very
matrix solve(x) #inverse good","good","needs to improve") my_df[5]
of a matrix det(x) # <- 0 my_df
determinant of a matrix

#creating a List x <- # Ways to remove the column


list(1,'a',TRUE,1+3i,6.7,c(10,20,
my_df[5] <- NULL
30)) x
my_df$V5 <- NULL
my_df my_df$perf
#factors x <- <- NULL
factor(c("male","female")) x x <- my_df[["perf"]] <-
factor(c("low","medium","high", NULL
"low")) table(x) unclass(x) my_df["perf"] <-
NULL my_df[5] <- boolv <-
NULL my_df$V5 <- stu_temp["mark2"]>10
NULL #subsetting boolv
row.names(stu_temp)[bo
df1 <- subset(my_df, olv]
select=c(id,marks)) df1 df1 <- #---------------------------------------------------------------
-----
subset(my_df, select=-marks)
library(help=datas
df1 View(df1) ets)

my_df$mark2 <- data(mtcars) # Loading mtcars


c(30,20,10) my_df data set cars <-mtcars # Save the
data into workspace
#sum of all marks
sum(my_df$mark2) # Viewing data set mtcars
# Total data set in console
#rowsum View(mtcars) # Viewing dataset in
my_df$total <- spreadsheet
rowSums(my_df[c(3,5)]) my_df
head(mtcars) # Viewing top-6 observations (default:
#max
top-6) tail(mtcars) # Viewing bottom 6
max(my_df$total)
observations str(mtcars) # Viewing data
#index at which max value is present
which.max(my_df$total) dictionary names(mtcars) # Viewing column
names v1 <- mtcars$mpg # Assigning single variable
#name of the student who got the from mtcars data to v1 v2 <- mtcars$cyl v3 <- mtcars$disp
max mark v4 <- mtcars$hp newvar <- mtcars$disp + mtcars$hp
my_df[["name"]][which.max(my_d
f$total)] mtcars1<-rbind(v1,v2,v3,v4) # Combined as rows
my_df$name[which.max(my_df$to #Horizontal joins mtcars1 mtcars2<-
tal)] cbind(v1,v2,v3,v4) # Combined as columns # Vertical
my_df[which.max(my_df$total),2] joins mtcars2

my_df my_df <- #create a variable obs_subset and have rows 4


rbind(my_df,data.frame(id=4,name="avgscore",marks=mean(my_df$marks),perf="meanperf",mark2=mean(my_df$mark to 10 in mtcars obs_subset <- mtcars[4:10,]
2),total=mean(my_df$total))) getwd() obs_subset

write.csv(my_df,"marks1.csv") #create a variable var_subset and have only the


write.csv(my_df,"marks.csv",row.names = FALSE) columns 1,5,9 var_subset <- mtcars[,c(1,5,9)]
var_subset
#R datasets
#subsetting
stu_marks <- read.csv("marks.csv") #create a variable subset1 and have only mpg and cyl variables of mtcars
str(stu_marks) #using indexing subset1 <-
mtcars[,c(1,2)] head(subset1,3)
stu_marks
#using subset() subset2 <-
stu_marks$mark2
subset(mtcars,select=c(mpg,cyl))
stu_marks[4]
subset2
stu_marks[3,3]
stu_marks[3,5] #create a variable subset3 and have only the rows
stu_marks where mpg>18 subset3 <- subset(mtcars,mpg>18)
stu_marks[c(1,3),c( subset3
2,5)]
#create a variable subset4 and have only the rows where
stu_temp <- stu_marks[c(-2,-4)] mpg>18 and cyl>5 subset4 <- subset(mtcars, mpg>18
stu_temp &cyl>5) subset4

row.names(stu_temp) #exclude mpg and cyl columns subset4 <-


row.names(stu_temp) <- subset(mtcars, mpg>18 &cyl>5, select=c(-mpg,-
stu_marks$name stu_temp cyl)) subset4
#install.packages("M #tbl_df(cars)
ASS") library(MASS) as_tibble(cars)
data("survey")
glimpse(cars)
#clear workspace
rm(list=ls()) ############ Subsetting Rows (Observations) #################

#filtering based on single


loan <- read.csv("loans data.csv") loan <- condition filter(cars, mpg>25)
read.table("loans data.csv",header = TRUE,sep
= ",") #filtering based on multiple
condition filter(cars, mpg>25&
dim(loan) hp >90)
str(loan)
head(loan,3) #Remove duplicate rows
tail(loan,2) distinct(cars)
summary(loan)
any(is.na(loan)) #Randomly select fraction of rows
sum(is.na(loan)) sample_frac(cars,0.2)

loan_cln <- #Randomly select no. of rows


na.omit(loan) sample_n(cars,5)
nrow(loan)
nrow(loan_cln) #selecting rows by position
slice(cars,11:15)
loan_cln2 <-
loan[complete.cases(loan),] slice_sample(cars,n=5)
nrow(loan_cln2)
temp <-
#Loading the dplyr package
library(dplyr) filter(cars,mpg>25)

#loading slice_sample(temp,n
data
data("mtca =2)
rs") cars <-
mtcars cars %>%
filter(mpg>25) %>%
#dimension of the data
slice_sample(n=2)
dim(cars)
#unique values in a column
#structure of the data unique(cars$cyl)
str(cars)
#no. of values under each unique category
#is.na(cars) #NA or NaN table(cars$cyl)
#checking for missing
#grouping
values any(is.na(cars))
cars %>%
sum(is.na(cars))
group_by(cyl)%
#################### Viewing data ########################
>%
#fetching top 6 rows slice_sample(n
head(cars) =2)

#fetching last 6 rows


############ Subsetting Columns (variables) #################
tail(cars)
#selecting single column
#viewing data
dplyr::select(cars,mpg)
View(cars)
cars %>%
#summary
dplyr::select(mpg)%>%
summary(cars) head(3)
#slice_sample(n=3)
cars
#selecting multiple columns dplyr::select(mpg,disp,hp,newv
dplyr::select(cars,mpg,cyl,gear) ar2)%>% sample_n(2)
dplyr::select(cars,c("mpg","cyl","ge
ar")) ############ summarizing data
names(cars) #################
#select all columns between a range of columns #group_by is typically used together with summarise; it is applied on a categorical variable
(inclusive) dplyr::select(cars,hp:am) cars %>% group_by(cyl) %>%
summarize(cnt=n()) #count of unique
cyl values
#combining filter and select- using pipe operator table(cars$cyl)
cars %>%
filter(mpg>18)%>% #computing max, min and standard dev cars %>% group_by(cyl) %>%
dplyr::select(mpg,cyl)%>% summarize(mx_mpg=max(mpg),mi_mpg=min(mpg),std_mpg=sd(mpg),mn=mean(
head(3) mpg),md=median(mpg))

names(cars) #clear workspace


#selecting columns starting with 'd' rm(list=ls())
dplyr::select(cars,starts_with('d'))
loan <- read.csv("loans data.csv") loan <-
#selecting columns ending with 't'
read.table("loans data.csv",header = TRUE,sep
dplyr::select(cars,ends_with('t'))
= ",")
#selecting columns
dim(loan)
containing 'g'
str(loan)
dplyr::select(cars,contains('g
head(loan,3)
')) #selecting columns
matching regular expression tail(loan,2)
dplyr::select(cars,matches('.. summary(loan)
a.')) any(is.na(loan))
sum(is.na(loan))
#Excluding certain columns
select(cars,c(-mpg,-cyl)) loan_cln <-
na.omit(loan)
############ Arranging data nrow(loan)
################# nrow(loan_cln)

#arrange the data in ascending order of mpg loan_cln2 <-


arrange(cars,mpg) loan[complete.cases(loan),]
nrow(loan_cln2)
#arrange the data in descending order of mpg
arrange(cars,desc(mpg)) #Loading the dplyr package
library(dplyr)
#arrange the data in order based on more than
one column arrange(cars, mpg,disp) #loading
data
arrange(cars,mpg,desc(disp)) data("mtca
rs") cars <-
############ Making new variables ################# mtcars

#creating a new column #dimension of the data


mutate(cars,newvar=disp-hp ) dim(cars)

#combining functions
#create a new variable that sums up disp and hp #structure of the data
and filter only str(cars)
#the rows where mpg>25 & #is.na(cars) #NA or NaN
disp>90 #and select only mpg, #checking for missing values
disp, hp, newvar cars %>%
mutate(newvar2=disp+hp)%>% any(is.na(cars))
filter(mpg>25,disp>90)%>% sum(is.na(cars))
#################### Viewing data ########################
#fetching top 6 rows slice_sample(n
head(cars) =2)

#fetching last 6 rows


tail(cars) ############ Subsetting Columns (variables) #################

#viewing data #selecting single column


View(cars) dplyr::select(cars,mpg)

#summary cars %>%


summary(cars) dplyr::select(mpg)%>%
head(3)
cars #slice_sample(n=3)

#tbl_df(cars) #selecting multiple columns


as_tibble(cars) dplyr::select(cars,mpg,cyl,gear)
dplyr::select(cars,c("mpg","cyl","ge
glimpse(cars) ar"))
############ Subsetting Rows (Observations) ################# names(cars)
#filtering based on single #select all columns between a range of columns
condition filter(cars, mpg>25) (inclusive) dplyr::select(cars,hp:am)

#filtering based on multiple


condition filter(cars, mpg>25& #combining filter and select- using pipe operator
hp >90)
cars %>%
#Remove duplicate rows filter(mpg>18)%>%
distinct(cars) dplyr::select(mpg,cyl)%>%
head(3)
#Randomly select fraction of rows names(cars)
sample_frac(cars,0.2)
#selecting columns starting with 'd'
#Randomly select no. of rows dplyr::select(cars,starts_with('d'))
sample_n(cars,5)
#selecting columns ending with 't'
#selecting rows by position dplyr::select(cars,ends_with('t'))
slice(cars,11:15)
#selecting columns containing 'g'
slice_sample(cars,n=5) dplyr::select(cars,contains('g'))

#selecting columns matching regular expression


temp <-
dplyr::select(cars,matches('..a.'))
filter(cars,mpg>25) #Excluding certain columns
select(cars,c(-mpg,-cyl))
slice_sample(temp,n
############ Arranging data
=2) #################

cars %>% #arrange the data in ascending order of mpg


filter(mpg>25) %>% arrange(cars,mpg)
slice_sample(n=2) #arrange the data in descending order of mpg
#unique values in a column arrange(cars,desc(mpg))
unique(cars$cyl) #arrange the data in order based on more than
#no. of values under each unique category one column arrange(cars, mpg,disp)
table(cars$cyl) arrange(cars,mpg,desc(disp))
#grouping
cars %>% ############ Making new variables #################
group_by(cyl)%
>%
#creating a new column M1 <- A1[,,1]
mutate(cars,newvar=disp-hp ) M2 <- A1[,,2]
M3 <- M1+M2
#combining functions
M3
#create a new variable that sums up disp and hp
and filter only M1
#the rows where mpg>25 & #Aggregation on array
disp>90 #and select only mpg, elements apply(M1,1,sum)
disp, hp, newvar cars %>% #1- along row
mutate(newvar2=disp+hp)%>% apply(M2,2,sum) #2 -along
filter(mpg>25,disp>90)%>% column A1
dplyr::select(mpg,disp,hp,newv apply(A1,1,sum)
ar2)%>% sample_n(2) apply(A1,2,mean)

rm(list=ls())
############ summarizing data
################# #To create date / To
#group_by is typically used together with summarise; it is applied on a categorical variable represent date d <- date()
cars %>% group_by(cyl) %>% d class(d)
summarize(cnt=n()) #count of unique
#as.Date(d)
cyl values
#to convert date string to date class d <-
table(cars$cyl) as.Date("2022-8-25") #default format -year-
month-day class(d)
#computing max, min and standard dev cars %>% group_by(cyl) %>% d as.Date("2022-8-25
summarize(mx_mpg=max(mpg),mi_mpg=min(mpg),std_mpg=sd(mpg),mn=mean( 10:44:22")
mpg),md=median(mpg)) as.Date("2022-8-25
21:15")
rm(list=ls())

#creating array from #to see the internal representation


vectors v1 <- c(1,2,3) unclass(d)
v2 <- c(4,5,6,7,8,9)
#to represent both date and
A1 <- array(c(v1,v2),dim = c(3,3,2)) time as.POSIXct("2022-8-25")
A1 pd <- as.POSIXct("2022-8-25
21:15") pd
#naming columns and rows class(pd)
rname <- c("r1","r2","r3") cname unclass(pd)
<- c("c1","c2","c3") mname <-
pd <- as.POSIXlt("2022-8-
c("mat1","mat2") dimnames(A1)
25") pd
<- list(rname,cname,mname)
class(pd)
A1 <- array(c(v1,v2),dim = c(3,3,2),dimnames = list(rname,cname,mname))
A1 #getting meta using
unclass() unclass(pd)
#printing the second row of second matrix names(unclass(pd))
A1[2,,2]
pd <- as.POSIXlt("2022-8-17
A1["r2",,"mat2"]
21:15:30") pd$sec pd$hour
#printing the second column of first matrix
pd$min pd$mday pd$year
A1[,2,1] unlist(pd)
A1[,"c2","mat1"] #if format is different
#printing the element in the 2nd row and 3rd column of second matrix as.Date("25/8/2022",format="%d/%
A1[2,3,2] m/%Y") date() as.Date("August
25,2022",format="%B %d,%Y")
as.Date("25Aug22",format="%d%b%
#printing the second matrix y")
A1[,,2]
A1[,,"mat2"] #Checking the class
class(as.Date("2022-8-25 21:15"))
#Manipulating array elements class(as.POSIXct("2022-8-25
21:15")) class(as.POSIXlt("2022-8- # dir.create("data")
25 21:15"))
#fileurl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
#download.file(fileurl,destfile = "E:/sweetlin-personal/coursera/data/camera.csv")
#Getting date, time and #list.files("E:/sweetlin-personal/coursera/data")
zone p <- Sys.Date() #only
current date class(p) #dateofdownload <- date()
Sys.time() #current date, time and timezone #dateofdownload
Sys.timezone() rm(list=ls())
#Reading flat file using read.table() loan <-
#difference in dates Sys.Date()-
read.table("loans data.csv",header = TRUE,sep
as.Date("1979-03-21")
= ",") str(loan) head(loan,2)
difftime(Sys.Date(),as.Date("1979-03-
21"),units = "weeks") #getting weekdays and #Reading flat file using read.csv() loan1 <-
basic arithmetic d <- as.Date("2022-8-17") d
#to find weekday of the date read.csv("loans data.csv") str(loan1) df <-
weekdays(d) read.table("tabsepfile.txt",header =

#add or subtract to create new date(s) FALSE,sep = "\t") str(df)


d+1 d+1:5
weekdays(d+ df <- read.table("slashsepfile.txt",header = FALSE,sep="/",strip.white = TRUE,na.strings = "EMPTY")
1:5) str(df)

#check for seq #Reading Excel file


and rep #using #you need to import xlsx package
sequence d #install.packages("xlsx")
dt <- seq(d,by="2 #library(xlsx)
months",length.out = 6) dt #loan <- read.xlsx("loan.xls",sheetIndex=1,
header=TRUE)
#getting month and
quarter months(d) #install.packages("XLConnect")
months(dt) #library(XLConnect)
quarters(dt)
library(readxl)
#lubridate::today() #lubridate package #excel_sheets('E:/sweetlin-official/FALL 2020 -2021/CSE3505/R
#ISOdate(2021,8,25) programs/loans data.xlsx') excel_sheets("loans data.xlsx") df <-
read_excel("loans data.xlsx",sheet="sample")
?strptime
help("strptime") str(df)

datestring<-"August 17, 2022 04:20"


convertedForm<- #XLConnect, XLSX, readxl
strptime(datestring,"%B %d, %Y %H:%M")
class(convertedForm) convertedForm #Reading XML file
#You need to install XML
x <- as.Date("2020-01-01") y <- strptime("25 Aug package
2020 09:00:00", "%d %b %Y %H:%M:%S") x-y install.packages("XML")
class(x) library(XML)
#library(methods)
class(y)
#install.packages("RCurl") #library(RCurl)
x <- as.POSIXlt(x) library(httr) fileurl <-
x-y "https://www.w3schools.com/xml/simpl
e.xml" xmldata <- GET(fileurl) doc <-
#different time zones x <- xmlTreeParse(xmldata,useInternal=TRUE)
as.POSIXct("2021-08-25
08:00:00") x root <-
xmlRoot(doc)
xgmt<-as.POSIXct("2021-08-25 08:00:00", root
tz="GMT") xgmt xmlName(root)
names(root)
xgmt-x
#Accessing parts of xml file in the same
#if(!file.exists("data")) way as list root[[1]] #accessing 1st
food root[[1]][[1]] #accessing name of #view of the data in a table
the 1st food View(loan)

#Extracting parts of XML file- value of all nodes


xmlSApply(root,xmlValue) #fetching top 6 rows
head(loan)
root <- xmlSApply(root,function(x) xmlSApply(x,xmlValue))
#fetching last 6 rows
root tail(loan)
#Extracting individual nodes of #summary of the data
XML file summary(loan)
xpathSApply(root,"//name",xmlVal
ue) ############ Cleaning data
xpathSApply(root,"//price",xmlVal #################
ue) #checking for missing values in the data
any(is.na(loan)) #NA NaN
xml_df <- data.frame(t(root),row.names =
NULL) str(xml_df)
#checking for the total no. of missing values in
the data sum(is.na(loan))
#Reading JSON file #cleaning NA values
#Loading jsonlite loan_clean <-
package na.omit(loan)
library(jsonlite) jdata <-
fromJSON("https://api.github.com/users/jtleek sum(is.na(loan_clean)) str(loan_clean)
/repos") names(jdata) loan_clean1 <- loan[complete.cases(loan),]
class(jdata) str(jdata) #boolean indexing sum(is.na(loan_clean1))
head(jdata,2)
#Extracting nested #imputation - filling the missing values
objects #cleaning Amount.Requested Column
names(jdata$owner) #checking for the total no. of missing values in a particular column
jdata$owner$login
sum(is.na(loan$Amount.Requested)
)

unique(loan$Amount.Requested)
#writing to json file
data(iris) str(iris) #changing to numeric types
head(iris,2) jfile <- loan$Amount.Requested <-
toJSON(iris,pretty = TRUE) as.integer(loan$Amount.Requested)
cat(jfile) str(loan)

#reading json file #unique values in a column


irisdata <- unique(loan$Amount.Requested)
fromJSON(jfile)
mean(loan$Amount.Requested,na.rm = TRUE)
head(irisdata)
median(loan$Amount.Requested,na.rm = TRUE)
#clear workspace
#library(dplyr)
rm(list=ls())
library(tidyverse)
############ Reading data #Decide whether to impute with mean or median loan %>%
################# summarize(avg=mean(Amount.Requested,na.rm =
#using read.table() loan_data <- read.table("loans TRUE),med=median(Amount.Requested,na.rm = TRUE))
data.csv",header = TRUE,sep = ",") loan <- loan <- loan %>%
read.csv("loans data.csv")
mutate(Amount.Requested=replace(Amount.Requested,is.na(Amount.Requested),median(Amount.Req
#dimension of the data uested,na.rm = TRUE)))
dim(loan)
sum(is.na(loan$Amount.Requested)
#structure of the data )
str(loan)
#Rename a column loan <- loan%>%
loan %>% filter(is.na(Loan.Length))
rename(Amt_Req=Amount.Re
quested) names(loan) #drop the rows with
str(loan) NA values loan <-
#cleaning Amount.Funded.By.Investors column loan%>%
sum(is.na(loan$Amount.Funded.By.Investors)) drop_na(Loan.Length)

unique(loan$Amount.Funded.By.In #checking
vestors) str(loan) sum(is.na(loan$Loan.Length))
loan <- loan%>%
unique(loan$Loan.Length)
rename(Amt_fund=Amount.Funded.By.Investors)
#convert the type to numeric
loan$Amt_fund <- #cleaning Employment.Length column
as.numeric(loan$Amt_fund) sum(is.na(loan$Employment.Length))

#checking for NA values unique(loan$Employment.Length)


sum(is.na(loan$Amt_fund))
loan <- loan %>% mutate(Employment.Length=gsub(" year|
#check impute with mean or years|< |\\+","",Employment.Length))
median loan%>%
loan$Employment.Length <- as.integer(loan$Employment.Length)
summarize(avg=mean(loan$Amt_f
und,na.rm = #checking
TRUE),md=median(loan$Amt_fund, unique(loan$Employment.Length)
na.rm = TRUE)) sum(is.na(loan$Employment.Len
gth))
loan <- loan%>%
mutate(Amt_fund=replace(Amt_fund,is.na(Amt_fund),median(Amt_fund, table(loan$Employment.Length)
na.rm = TRUE))) mean(table(loan$Employment.Len
gth))
sum(is.na(loan$Amt_fund))
loan <- loan%>%
str(loan)
mutate(Employment.Length=replace(Employment.Length,is.na(Employment.Len
#cleaning Interest.Rate column
gth),2))
sum(is.na(loan$Interest.Rate))
#checking
#cleaning unwanted substring in a chr
sum(is.na(loan$Employment.Lengt
column loan <- loan %>%
h))
mutate(Interest.Rate=gsub("%","",Int
unique(loan$Employment.Length)
erest.Rate))
head(loan$Interest.Rate,2) #cleaning FICO.Range column
head(loan$FICO.Range,2) loan <-
loan$Interest.Rate <- as.numeric(loan$Interest.Rate)
loan %>%
head(loan$Interest.Rate,2)
separate(FICO.Range,c("fico-
loan$Interest.Rate <- low","fico-high")) str(loan)
as.numeric(loan$Interest.Rate) str(loan) names(loan)

loan$`fico-high` <-
#cleaning Loan.Length column as.integer(loan$`fico-high`)
sum(is.na(loan$Loan.Length)) loan$`fico-low` <-
as.integer(loan$`fico-low`) str(loan)
unique(loan$Loan.Length)
sum(is.na(loan$`fico-high`))
loan <- loan %>% sum(is.na(loan$`fico-low`))
mutate(Loan.Length=gsub(" unique(loan$`fico-high`)
months","",Loan.Length)) unique(loan$`fico-low`)
#statistical analysis - Numerical measure
loan$Loan.Length <- as.integer(loan$Loan.Length) str(faithful) #faithful - built-in data
head(faithful)
sum(is.na(loan$Loan.Length))
#Central tendency measure
unique(loan$Loan.Length)
mean(faithful$eruptions)
#filtering the rows with NA values #median
median(faithful$eruptions) #relative frequency relfreq
<-
#Measure of dispersion Interval_freq/nrow(faithful)
range(faithful$eruptions) old=options(digits = 2)
max(faithful$eruptions)- cbind(Interval_freq,relfreq)
min(faithful$eruptions)
#quartile #cumulative frequency
quantile(faithful$eruption cumfreq <-
s) cumsum(table(interval))
cumfreq cbind(cumfreq)
#Inter-quartile range
IQR(faithful$eruptions) rm(list=ls())
library(help=graphi
#percentile cs)
quantile(faithful$eruptions,c(.27,.3 data("airquality")
5,.65)) str(airquality)
#variance #to set the margin
var(faithful$eruptions) par(mar=c(2,2,2,2))
#standard deviation #1D scatter plot
sd(faithful$eruptions) plot(airquality$Ozone)
#covariance #2D scatter plot
cov(faithful$eruptions,faithful$wai
ting) plot(airquality$Ozone,airquality$W
ind)
#correlation
cor(faithful$eruptions,faithful$wai ?plot
ting)
#type argument in plot
#moment -third central moment plot(airquality$Ozone,type="l")
# the second central moment of a population
#title and axis labels arguments
is its variance library(e1071)
plot(airquality$Ozone,main = "ozone levels",xlab =
moment(faithful$eruptions,3, center = TRUE)
"index",ylab = "ozone")
#skewness
skewness(faithful$eruptions)
#histogram
#kurtosis hist(airquality$Solar.R)
kurtosis(faithful$eruptions)
#boxplot
#frequency summary(airquality$Ozone)
distributio #step1 - boxplot(airquality$Ozone)
find range
range(faithful$erupti #multiple boxplot
ons) boxplot(airquality[,1:4],main="multiple
box plots")
#step2 - Break the range into non-overlapping sub-intervals by defining a sequence of
equal distance break points. breaks <- seq(1.5,5.5,by=0.5) breaks
#pie chart
#step3- Classify the eruption durations according to the half-unit-length sub- unique(airquality$Wind)
intervals with cut. interval <- cut(faithful$eruptions,breaks,right=FALSE) table(airquality$Wind)

#step 4 - Compute the frequency of eruptions in each sub-interval with the table function. wind_freq <- table(airquality$Wind)
Interval_freq = wind_above8 <- wind_freq>8
table(interval) wind_freq wind_above8
Interval_freq wind_above8data <-
cbind(Interval_freq) wind_freq[wind_above8]
wind_above8data
table(wind_above8)
pie(wind_above8data,radius=1) #scatter plot - multiple variables through both color and shape
par(mar=c(1,1,1,1)) ggplot(mtcars,aes(x=wt,y=mpg,shape=gear_factor))+geom_point(aes(color=cyl_factor,size=4))+geom_point(color="grey",
size=1.5)
#grid of charts
par(mfrow=c(2,3),mar=c(2,2,2,1),las=0, bty="n") ggplot(mtcars,aes(x=wt,y=mpg,shape=gear_factor))+geom_point(aes(color=cyl_factor,size=4))+geom_point(color="grey",
plot(airquality$Ozone) size=1.5)
plot(airquality$Ozone,airquality$Wind)
plot(airquality$Ozone,type ='l') #scatter plot- adding best fit line ggplot(mtcars,
barplot(airquality$Ozone, main = 'Ozone levels', ylab aes(x=wt,y=mpg))+geom_point()+geom_smooth(method="
= 'ozone value') hist(airquality$Solar.R) lm")
boxplot(airquality$Ozone)
###########bar plot ########### ggplot(mtcars,
aes(x=gear_factor))+geom_bar() ggplot(mtcars,
#lattice graph aes(x=gear_factor,fill=gear_factor,color="red"))+geom_bar() +ggtitle("frquency
library(lattice) plot of gear")
#density plot #flipping the bar direction ggplot(mtcars,
densityplot(airquality$Ozone) aes(x=gear_factor))+geom_bar()+coord_flip()
#scatter plot matrix #bar plot for 2 variables ggplot(mtcars,
splom(airquality[c(1,3,4)]) aes(x=cyl_factor,fill=gear_factor))+geom_bar(position='sta
ck')
#scatter plot depicting the combination
#################### pie chart ############ ggplot(mtcars,
of 2 variables data("mtcars") df <-
aes(x="",y=mpg,fill=cyl_factor))+geom_bar(width =
mtcars
1,stat='identity')+coord_polar("y",start = 0)
str(df)
par(mar=c(4,4.5,1, #################### histogram ###########
1)) ggplot(mtcars,aes(x=hp))+geom_histogram()+labs(title = "Distribution of
plot(df$wt,df$mpg) hp",y='frequency')

unique(df$cyl) cyl_factor <- factor(df$cyl,levels = #setting bin size ggplot(mtcars,aes(x=hp))+geom_histogram(bins =


c(4,6,8),labels = c("4cyl","6cyl","8cyl")) 3)+labs(title = "Distribution of hp",y='frequency')

unique(df$gear) gear_factor <- factor(df$gear,levels = #setting bin width ggplot(mtcars,aes(x=hp))+geom_histogram(binwidth =


c(3,4,5),labels = c("3 gears","4 gears", "5 gears")) 30)+labs(title = "Distribution of hp",y='frequency')

xyplot(df$mpg~df$wt|cyl_factor*gear_factor,main="scatter plots: Cylinders and Gears",xlab = "weight of #with border and fill color ggplot(mtcars,aes(x=hp))+geom_histogram(binwidth =
car",ylab = "miles per gallon") gear_factor 30,color='green',fill='yellow')+labs(title = "Distribution of hp",y='frequency')

freq_gear <- table(gear_factor) freq_gear #facets ggplot(mtcars,aes(x=hp))+geom_histogram(color="white",fill="blue")+labs(title = "Distribution of


barplot(freq_gear,col=c("red","green","blue")) hp",y='frequency')+facet_wrap(cyl_factor,ncol=1)
pie(freq_gear,labels=c("3 gears","4 gears", "5
gears"),col=c("red","green","blue"),radius=1)
################ Kernel density curve ############ ggplot(mtcars,
rm(list=ls()) aes(x=hp))+geom_density()+labs(title="Distribution of hp",x="horse
data("mtcars") power",y='density')

#install.packages("ggplot2") library(ggplot2) #with fill color ggplot(mtcars,


head(mtcars,2) #scatter plot ggplot(data=mtcars, aes(x=hp))+geom_density(fill='blue',color='red')+labs(title="Distribution of
mapping=aes(x=wt,y=mpg))+geom_point() hp",x="horse power",y='density')
unique(mtcars$cyl) cyl_factor <-
############## Line plot ###############
factor(mtcars$cyl,levels = c(4,6,8),labels =
library(dplyr) d <-
c("4cyl","6cyl","8cyl"))
sample_n(mtcars,10)
unique(mtcars$gear) gear_factor <- factor(mtcars$gear,levels = ggplot(d,aes(x=wt,y=drat))+ge
c(3,4,5),labels = c("3 gears","4 gears", "5 gears")) om_line() d

#scatter plot - multiple variables through color #with varied thickness and color points
ggplot(mtcars,aes(x=wt,y=mpg,color=gear_factor))+geom_point() ggplot(d,aes(x=wt,y=drat))+geom_line(aes(size=2,color='red'))+geom_point(aes(size=2,color='blue'))

#scatter plot - multiple variables through size ################# box plot


ggplot(mtcars,aes(x=wt,y=mpg,size=qsec))+geo ################ ggplot(mtcars,
m_point() aes(x=mpg))+geom_boxplot()
#multiple box plots ggplot(mtcars,
aes(x=cyl_factor,y=mpg))+geom_boxplot()

You might also like