## R Note
## Let's find out how many people aged under 18 are filing claims
work_data[ which(work_data$AGE < 18), ]
work_data
## Lets do some more analysis
## Group the age into buckets
## Add a new variable agegroup with these buckets
## Bucket AGE into three labelled groups. cut() uses right-closed intervals by
## default, so the buckets are (0, 35], (35, 50] and (50, 100]; an AGE outside
## that range becomes NA.
## The third label is written on one line: the wrapped original embedded a
## newline inside the label text.
work_data$agegroup <- cut(
  work_data$AGE,
  breaks = c(0, 35, 50, 100),
  labels = c("less than 35", "35 to 50", "more than 50")
)
work_data$agegroup
## Now see the summary data with new field added
summary(work_data)
## Example of ggplot
ggplot(data = work_data, mapping = aes(x = SEX, y = AGE)) +
geom_boxplot( aes(colour = EDUCATION), outlier.colour = "red")
## Bi-Variate analysis
corrplot(cor(uber[,4:12]))
plot(uber$spd, uber$pickups, xlab= "speed", ylab="pickup", main
="pickup vs speed")
abline(lm(uber$pickups~uber$spd))
plot(aggregate(pickups~start_month,data=uber, sum), type="b")
uber %>%
filter(.,start_month !=2) %>%
ggplot(aes(x=start_day, y=pickups))+geom_bar(stat='identity')
plot(aggregate(pickups~start_hour,data=uber, sum), type="b")
ggplot(aes(x = reorder(wday, pickups), y = pickups), data = uber) +
geom_bar(aes(fill=pickups), width=0.5, stat = "identity") +
coord_flip()
ggplot(uber, aes(start_hour, pickups)) +
geom_jitter(alpha = 0.3, aes(colour = borough)) +
geom_smooth(aes(color = borough))
ggplot(uber, aes(start_hour, borough)) +
geom_jitter( alpha = 0.4, aes(color = pcp24 > 0)) +
geom_smooth(aes(color = pcp24 > 0))
## Coffee
install.packages("reshape", type="source")
install.packages("reshape2", type="source")
library(reshape)
library(reshape2)
library(reshape)
library(reshape2)
library(ggplot2)
library(plyr)
library(grid)
install.packages("gridExtra")
library(gridExtra)
coffee=read.csv("D:/sahubackup/GL/Coffee-1.csv")
coffee
dim(coffee)
attach(coffee)
histogram(Days_between_Purchase)
count_Brand<-count(coffee$Brand)
count_Brand
data_num <- as.data.frame(apply(coffee, 2, as.numeric))
data_num
## Bar chart of purchases per brand. count() applied to a vector returns a
## data frame whose columns are "x" (the value) and "freq" (its count), so
## those are the columns to map; count_Brand has no Brand or Count column.
ggplot(count_Brand, aes(x, freq)) + geom_bar(stat = "identity")
corcoffee=cor(coffee)
corcoffee
corrplot(corcoffee)
cor(Price_per_Packet,Income)
library(ggplot2)
ggplot(coffee, aes(x = Days_between_Purchase)) + geom_density()
par(mfrow=c(3,2))
barplot(brand$freq,names.arg=brand$x,main="BRAND")
barplot(edu$freq,names.arg=edu$x,main="Education")
cbind(count(coffee$sec),(count(coffee$sec))/sum(count(coffee$sec)$freq
))
cast(count(coffee[,c("sec","price_per_packet")], c("sec",
"price_per_packet"))
,sec~ price_per_packet)
coffee_new<-subset(coffee,Days_between_Purchase>1)
summary(coffee_new$Days_between_Purchase)
ggplot(coffee_new,aes(Brand,Days_between_Purchase))+geom_boxplot()
cost<-coffee[,c("Brand","Price_per_Packet")]
cost<-count(cost, c("Brand", "Price_per_Packet"))
cost<-cast(cost,Brand ~ Price_per_Packet)
par(mfrow=c(3,2))
barplot(Brand$freq,names.arg=Brand$x,main="BRAND")
barplot(Education$freq,names.arg=Education$x,main="Education")
qplot(coffee$Days_between_Purchase,data=coffee)
ggplot(coffee,aes(Brand,coffee$Days_between_Purchase))+geom_boxplot()
## Upper outlier fence for Days_between_Purchase: Q3 + 1.5 * IQR.
## The quartiles are computed from the data instead of being typed in by hand
## from the printed quantile() output, so the fence follows the data set.
quantile(coffee$Days_between_Purchase)
purchase_quartiles <- quantile(coffee$Days_between_Purchase,
                               probs = c(0.25, 0.75), names = FALSE)
q1 <- purchase_quartiles[1]
q3 <- purchase_quartiles[2]
iqr <- q3 - q1
upper <- q3 + (iqr * 1.5)
upper
## Keep only the purchases below the fence.
coffee_new <- subset(coffee, Days_between_Purchase < upper)
summary(coffee_new$Days_between_Purchase)
ggplot(coffee_new,aes(x=Brand,y=Days_between_Purchase))+geom_boxplot()
barplot(age$freq,names.arg=age$x,main="Age")
barplot(sec$freq,names.arg=sec$x,main="Social Economic Status")
barplot(income$freq,names.arg=income$x,main="Income")
barplot(price_cons$freq,names.arg=price_cons$x,main="Price Conscious")
## ---- HeartDisease ----
install.packages("car")
install.packages("tidyr")
install.packages("caret")
install.packages("broom")
install.packages("ROCR")
library(ggplot2)
library(car)
library(dplyr)
library(lattice)
library(tidyr)
library(caret)
library(MASS)
library(broom)
library(ROCR)
table(heart_data$chest_pain)
ggplot(heart_data,aes(x = cp)) +
geom_bar(width =0.2,fill ="red") +
geom_text(stat = 'count',aes(label =..count..),vjust = -0.5)
#rest_bp
class(trestbps)
ggplot(heart_data, aes(x=heart_data$trestbps,y=heart_data$chol,col =
"dodgerblue2",
main ="boxplot of
rest_bp",col.main="dodgerblue4"))+
geom_boxplot()
ggplot(heart_data, aes(x=heart_data$cp,y=heart_data$trestbps,col =
"dodgerblue2",
main ="boxplot of
rest_bp",col.main="dodgerblue4"))+
geom_boxplot()
ggplot(heart_data,aes(trestbps)) + geom_histogram(bins =20,fill
="green") +theme_bw() + theme_classic() +ggtitle("resp_bp")
ggplot(heart_data,aes(trestbps)) + geom_density(fill ="dodgerblue4") +
theme_bw() + theme_classic()+ggtitle("density plot of resp_bp")
#chol
ggplot(heart_data, aes(x=heart_data$chol,col = "dodgerblue2"
,main ="boxplot of
chol",col.main="dodgerblue4"))+
geom_boxplot()
ggplot(heart_data,aes(chol)) +
geom_histogram(bins =20,fill ="green") +
ggtitle("chol")
ggplot(heart_data,aes(chol)) +
geom_density(fill ="dodgerblue4") +
theme_bw() + theme_classic()+ggtitle("density plot of chol")
table(heart_data$fasting_bloodsugar)
ggplot(heart_data,aes(x =factor(fasting_bloodsugar))) + geom_bar(width
= 0.1,fill ="green") + geom_text(stat = 'count',aes(label
=..count..),vjust =-0.5) + theme_bw() + theme_classic() +ylab("number
of count") + ggtitle("blood sugar") + title.center
ggplot(heart_data,aes(factor(fasting_bloodsugar))) + geom_bar(width =
0.2,fill ="dodgerblue4") + theme_bw() + theme_classic()+geom_text(stat
='count',aes(label =..count..),vjust =-0.2)+ggtitle("barplot of
fasting_bloodsugar") +title.center
#max heart-rate
ggplot(heart_data,aes(max_heartrate)) + geom_histogram(fill =
"dodgerblue4",alpha =0.5) + theme_bw()+theme_classic()
ggplot(heart_data,aes(max_heartrate)) + geom_density(fill =
"red",alpha =0.5) + theme_bw()+theme_classic()
boxplot(heart_data$max_heartrate,col ="lightblue",notch = T,main
="boxplot of the maximum heart rate")
ggplot(heart_data,aes(factor(excercise_angina))) + geom_bar(width =
0.2,fill ="dodgerblue4") + theme_bw() + theme_classic()+geom_text(stat
='count',aes(label =..count..),vjust =-0.2)+ggtitle("barplot of
exercise angina")
##Correlation
heart_data$sex
class(heart_data$sex)
str(heart_data)
heart_data_wcp=subset(heart_data,select= -2)
heart_data_wcp
str(heart_data_wcp)
cor(heart_data_wcp)
install.packages("corrplot")
library(corrplot)
corl=cor(heart_data_wcp)
corl
corrplot(corl, method = "pie", type = "lower")
## Share of each target class, rounded to two decimals.
## So in our dataset 54% has a heart disease while the rest does not.
round(table(heart_data$target) / nrow(heart_data), digits = 2)
##MR
library(caTools)
set.seed(123)
split = sample.split(heart_data$target, SplitRatio = 0.8)
training_set = subset(heart_data, split == TRUE)
training_set
test_set = subset(heart_data, split == FALSE)
test_set
library(ggplot2)
ggplot() +
geom_point(aes(x = training_set$cp, y = training_set$target),
colour = 'red') +
geom_line(aes(x = training_set$cp, y = predict(regressor3, newdata =
training_set)),
colour = 'blue') +
ggtitle('cp vs target(Training set)') +
xlab('cp') +
ylab('target')
library(ggplot2)
ggplot() +
geom_point(aes(x = test_set$cp, y = test_set$target),
colour = 'red') +
geom_line(aes(x = training_set$cp, y = predict(regressor3, newdata =
training_set)),
colour = 'blue') +
ggtitle('cp vs target(Test set)') +
xlab('cp') +
ylab('target')
library(ggplot2)
x_grid = seq(min(dataset$Level), max(dataset$Level), 0.01)
ggplot() +
geom_point(aes(x = dataset$Level, y = dataset$Salary),
colour = 'red') +
geom_line(aes(x = x_grid, y = predict(regressor, newdata =
data.frame(Level = x_grid))),
colour = 'blue') +
ggtitle('Truth or Bluff (Random Forest Regression)') +
xlab('Level') +
ylab('Salary')
## ---- CarSeats ----
rm(list = ls())
install.packages("MASS")
install.packages("psych")
install.packages("Boruta")
library(Boruta)
############## Cleaning the R Environment and Loading Libraries ##############
rm(list = ls())
setwd("C:/OldStuff/")
## Packages this section depends on.
pkgs_needed <- c("readxl", "plyr", "data.table", "dplyr", "tidyr",
                 "stringi", "stringr", "lubridate",
                 "BLPestimatoR", "dummies", "zoo", "car", "caret", "MASS",
                 "caTools", "Boruta",
                 "lmtest")
## Install only the ones that are not installed yet.
pkgs_missing <- setdiff(pkgs_needed, rownames(installed.packages()))
if (length(pkgs_missing) > 0) {
  install.packages(pkgs_missing)
}
## Attach everything. library() stops with an error when a package cannot be
## loaded; require() only returns FALSE, which let the script carry on without
## the package and fail later at the first call into it.
invisible(lapply(pkgs_needed, library, character.only = TRUE))
## Drop the bookkeeping variables again.
rm(pkgs_needed, pkgs_missing)
# Inference:- We can see that there is a steep jump from the first bin to the
# second bin, third to fourth bin and fourth to fifth bin.
# Therefore we will bin the Age variable into 4 bins: 0-29, 30-49, 50-59, 60+
## Binning the Age variable as per the inference from the histogram
carSeats$ageGroup <- cut(carSeats$Age, breaks=c(0, 29, 49, 59, 1000),
labels=c("lessthan30","30to49","50to59",
"60+"))
carSeats$ageGroup <- as.character(carSeats$ageGroup)
# If we compare the results from Boruta and Stepwise we can see that there are
# 7 common variables chosen by both methods. The Stepwise method chooses one
# additional variable.
## ---- CardioFitness ----
r<-"hello"
r
w="C://Users//00002998//CardioGoodFitness.csv"
ds=read.csv(w,1)
ds
## Split ds into training (about 2/3) and test (about 1/3) rows, using the
## logical mask returned by sample.split() on the Income column.
library(caTools)
set.seed(123)
split <- sample.split(ds$Income, SplitRatio = 2/3)
training_set <- subset(ds, split == TRUE)
## Must be a comparison (==). Written as split = FALSE it is passed to
## subset() as an extra named argument, no row filter is applied and the
## "test set" is the whole data frame.
test_set <- subset(ds, split == FALSE)
training_set
dim(ds)
dim(training_set)
names(ds)
str(ds)
ds[1:10,]
ds[1:10,"Sepal.Length"]
ds[1:10,"Product"]
summary(ds)
table(ds$Product)
table(Product, Gender)
table(ds$Product, ds$Gender)
boxplot(Age~Product, horizontal=TRUE, col=c("Green","Red"))
boxplot(ds$Age~ds$Product, horizontal=TRUE, col=c("Green","Red"))
attach(ds)
table(Product, Gender)
by (m, INDICES=Product, FUN = summary)
summary(ds)
by(ds, indices=Product, FUN=summary)
by(ds, INDICES=Product, FUN=summary)
rpivotTable(ds)
install.rpivottable
library(lattice)
histogram(~Miles|factor(Product),data=ds)
cor(Miles,Usage)
Model=lm(Miles~Usage, data=ds)
summary(Model)
pie(table(Product, Gender))
pie(table(Product))
plot(density(Product))
plot(density(Salary))
plot(density(Income))
## Read the CardioGoodFitness CSV, treating the first row as column names.
## (The two assignments had been fused onto a single line, which does not
## parse; the duplicated copy of the same two statements is folded in.)
file <- "C://Users//00002998//CardioGoodFitness.csv"
MyDataset <- read.csv(file, header = TRUE)
MyDataset
plot(Income, Gender)
plot(Income, Product)
plot(ds)
## ctree() is attached by library("party"), so no further library() call is
## needed for it.
library("party")
## NOTE(review): the original call was left unfinished ("Product~, Product")
## and does not parse. Modelling Product on all other columns of ds is an
## assumption -- confirm the intended predictors. ctree() may also need
## Product (and any text predictors) converted to factors first.
ds_ctree <- ctree(Product ~ ., data = ds)
file="C://Users//00002998//iris_flowers_new.csv"
iris=read.csv(file,1)
iris
iris_ctree <- ctree(Species ~ Sepal.Length + Sepal.Width +
Petal.Length + Petal.Width, data=iris)
install.packages("party")
library("party")
iris_ctree <- ctree(Species ~ Sepal.Length + Sepal.Width +
Petal.Length + Petal.Width, data=iris)
iris_ctree <- ctree(Flower ~ Sepal.Length + Sepal.Width + Petal.Length
+ Petal.Width, data=iris)
iris_ctree <- ctree(flower ~ sepal.Length + sepal.Width +
petal.Length,data=iri)
iris_ctree <- ctree(flower ~ sepal.Length + sepal.Width +
petal.Length,data=iri)
iris_ctree <- ctree(flower ~ sepal.Length + sepal.Width +
petal.Length,data=iris)
iris_ctree <- ctree(flower ~ sepal.length + sepal.width +
petal.length,data=iris)
iris_ctree
plot(iris_ctree)
boxplot()
plot(jitter(sepal.length), jitter(sepal.width))
attach(iris)
plot(jitter(sepal.length), jitter(sepal.width))
smoothScatter(sepal.length, sepal.width)
distMatrix <- as.matrix(dist(iris[,1:4]))
heatmap(distMatrix)
iris_pdf=pdf("iris.pdf")
heat=heatmap(distMatrix)
iris_pdf=pdf("heat.pdf")
iris_pdf
## Split the iris rows into training (about 2/3) and test (about 1/3) sets.
set.seed(1234)
## sample.split() is given the outcome column so that the mask has one entry
## per row; given the whole data frame, the mask would not line up with the
## rows that subset() filters.
split <- sample.split(iris$flower, SplitRatio = 2/3)
training_set <- subset(iris, split == TRUE)
## Comparison (==), not a named argument: subset(iris, split = FALSE) applies
## no filter and returns every row.
test_set <- subset(iris, split == FALSE)
training_set
## Model formula only. The data set is passed separately to ctree(), so it
## does not belong in this assignment (", data = iris" here is a syntax error).
myFormula <- flower ~ sepal.length + sepal.width + petal.length
myf_ctree=ctree(myFormula, data=training_set)
table(predict(myf_ctree), training_set$flower)
print(myf_ctree)
plot(myf_ctree)
plot(myf_ctree,type="simple")
test_pred=predict(myf_ctree, newdata = test_set)
table(test_pred, test_set$flower)
## Working-directory changes; each setwd() replaces the previous setting and
## getwd() prints the current one. (A stray "/" line that does not parse has
## been removed.)
setwd("C:/Users/00002998")
setwd("C:/Users/00002998/R Programming")
getwd()
setwd("D:/sahubackup/GL/R Programming")
getwd()
## ---- ANOVA ----
library(plot3D)
data=read.csv("D:/sahubackup/GL/iris.csv")
head(data)
x <- sep.l <- iris$sepal.length
x
y <- pet.l <- iris$petal.length
z <- sep.w <- iris$sepal.width
scatter3D(x, y, z, clab = c("Sepal", "Width (cm)"))
scatter3D(x, y, z, bty = "f", colkey = FALSE, main ="bty= 'f'")
scatter3D(x, y, z, bty = "g", colkey = FALSE, main ="bty= 'g'")
# User defined
scatter3D(x, y, z, pch = 18, bty = "u", colkey = FALSE,
main ="bty= 'u'", col.panel ="steelblue", expand =0.4,
col.grid = "darkblue")
## NOTE(review): the lines below look like usage signatures kept for
## reference, not calls meant to run: "..." is only valid inside a function
## body, and no labels vector is defined in this script. They are commented
## out so the script can be sourced; supply real arguments before enabling.
# text3D(x, y, z, labels, colvar = NULL, add = FALSE)
# points3D(x, y, z, ...)
# lines3D(x, y, z, ...)
# text2D(x, y, labels, colvar = NULL, col = NULL, add = FALSE)
## This one only uses x and y, which are defined above, so it stays live.
scatter2D(x, y, colvar = NULL, col = NULL, add = FALSE)
library(ggplot2)
qplot(iris$sepal.length,iris$sepal.width, col="red",
fill="iris$sepal.width")
ggplot(iris, aes(x=sepal.length))+
geom_histogram(aes(y=iris$sepal.width), binwidth=5,
col="black",fill="red")
library(dplyr)
tbl_df(iris)
glimpse(iris)
View(iris)
iris %>%
group_by(variety) %>%
summarise(avg = mean(sepal.width)) %>%
arrange
library(tidyr)
gather(iris, "new", "n", 2:4)
slice(iris, 10:15)
summarise_each(iris, funs(mean))
count(iris, variety, wt = sepal.length)
summarise(iris, avg = mean(sepal.length))
group_by(iris, variety)
library(stringr)
str_detect(iris$variety, "z")
library(MASS)
data=Cars93
data
attach(data)
names(data)
ggplot(data, aes(x=Price))+
geom_bar(binwidth=5, col="red", fill="blue")
ggplot(data, aes(x=RPM))+
geom_histogram(aes(y=..density..), col="red", fill="Black")+
geom_density(alpha=.2, fill = "pink")
ggplot(data, aes(x=Weight))+
geom_histogram(aes(y=..density..),binwidth = 2, colour = "black",
fill = "white")+
geom_density(alpha=.2, fill = "pink")
ggplot(df) +
geom_segment(aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y, linetype = !xy_sign),
arrow = arrow(length = unit(0.25,"cm"))) +
coord_equal()
ggplot() +
geom_curve(data = df %>% filter(x_gt_y_equal_xy_sign),
aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y_equal_xy_sign),
curvature = 0.75, angle = -45,
arrow = arrow(length = unit(0.25,"cm"))) +
geom_curve(data = df %>% filter(!x_gt_y_equal_xy_sign),
aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y_equal_xy_sign),
curvature =-0.75, angle = 45,
arrow = arrow(length = unit(0.25,"cm"))) +
coord_equal() +
theme(legend.position = "bottom") +
xlim(-4, 4) + ylim(-4,4)
ggplot(df) +
geom_curve(aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y_equal_xy_sign),
curvature = 0.75, angle = -45,
arrow = arrow(length = unit(0.25,"cm"))) +
coord_equal() +
theme(legend.position = "bottom") +
xlim(-4, 4) + ylim(-4,4)
head(df2)
#Kaggle DS LR
train=read.csv("D:/sahubackup/GL/LR/Kaggle/train.csv")
train
test=read.csv("D:/sahubackup/GL/LR/Kaggle/test.csv")
test
library(ggplot2)
numberofNAs=length(which(is.na(train)==T))
if(numberofNAs>0)
{
cat('Number of missing values found: ', numberofNAs)
cat('\nRemoving missing values...')
train = train[complete.cases(train), ]
}
#par(mfrow=c(2,1), size=)
boxplot(train$x)
boxplot(test$x)
boxplot(train$x, main='X', sub=paste('Outliers: ',
boxplot.stats(train$x)$out))
boxplot(test$x, main='X', sub=paste('Outliers: ',
boxplot.stats(test$x)$out))
av=read.csv("D:/sahubackup/GL/LR/av.csv")
av
levels(av$group)
acgrp=ordered(av$group, levels=c("ctrl","trt1","trt2"))
acgrp
library(dplyr)
## Per-group row count, mean and standard deviation of weight.
## The closing parenthesis of mean() was misplaced, so sd = ... was passed to
## mean() as an argument and no sd column was produced.
grp <- group_by(av, group) %>%
  summarise(
    count = n(),
    mean = mean(weight, na.rm = TRUE),
    sd = sd(weight, na.rm = TRUE)
  )
grp=av$group
grp
factor.grp=factor(grp)
factor.grp
attach(av)
grp = c("Mon","Fri","Mon","Wed","Wed","Sat")
factor.wday = factor(wday)
factor.wday
library(ggplot2)
ggplot(av, aes(x=group, y=weight, fill=group))+
geom_boxplot(order = c("ctrl", "trt1", "trt2"))+
theme_classic() +
theme(legend.position = "none")
boxplot(av)
boxplot(weight~group)
av
avgrp
names(av)
## Simple Box Plot - Midsize has high variance
## (The note sits above the call: as a wrapped trailing comment its second
## half was parsed as code and broke the "+" chain.)
ggplot(av, aes(x = group, y = weight)) +
  geom_boxplot()
TukeyHSD(res.aov)
library(multcomp)
summary(glht(res.aov, linfct = mcp(group = "Tukey")))
pairwise.t.test(av$weight, av$group,
p.adjust.method = "BH")
library(car)
leveneTest(weight ~ group, data = av)
confint(SLM, "PerOcc")
## ---- ANOVA - DentalHardness ----
my_data=read.csv("D:/sahubackup/GL/Dental Hardness.csv")
attach(my_data)
View(my_data)
my_data$dentist<-factor(my_data$dentist)
my_data$method<-factor(my_data$method)
my_data$alloy<-factor(my_data$alloy)
my_data$temperature<-factor(my_data$temperature)
hist(my_data[my_data$temperature==1500,]$hardness)
hist(my_data[my_data$temperature==1600,]$hardness)
hist(my_data[my_data$temperature==1700,]$hardness)
shapiro.test(my_data[my_data$temperature==1500,]$hardness)$p.value
shapiro.test(my_data[my_data$temperature==1600,]$hardness)$p.value
shapiro.test(my_data[my_data$temperature==1700,]$hardness)$p.value
str(my_data)
library(car)
leveneTest(my_data$hardness~my_data$temperature)
## Compare hardness between alloy 1 and alloy 2 with an unpaired t-test and
## its non-parametric counterpart. The two samples are extracted once; in the
## wrapped original the "==" of the second filter was split across two lines
## ("=" / "=2"), which does not parse.
hardness_alloy1 <- my_data[my_data$alloy == 1, ]$hardness
hardness_alloy2 <- my_data[my_data$alloy == 2, ]$hardness
t.test(hardness_alloy1, hardness_alloy2, paired = FALSE)
wilcox.test(hardness_alloy1, hardness_alloy2, paired = FALSE)
pooledSD <- (((45-1)*(14688.12)+(45-1)*(25886.43))/(45+45-2))^0.5
pooledSD
power.t.test(n=45,delta=-68.58,pooledSD,
alternative="two.sided",sig.level=0.05)
power.t.test(power=0.8,delta=-68.58,sd=142.4334,
alternative="two.sided",sig.level=0.05)
#Test
aov1 <- aov(my_data$hardness~my_data$method)
summary(aov1)
hist(my_data[my_data$method==1,]$hardness)
hist(my_data[my_data$method==2,]$hardness)
hist(my_data[my_data$method==3,]$hardness)
shapiro.test(my_data[my_data$method==1,]$hardness)$p.value
shapiro.test(my_data[my_data$method==2,]$hardness)$p.value
shapiro.test(my_data[my_data$method==3,]$hardness)$p.value
# 2 are normal distribution and 1 is Not normal
leveneTest(my_data$hardness~my_data$method)
#variances are not equal
#both fail, so going for NonParametric
kruskal.test(my_data$hardness~my_data$method)
hist(my_data[my_data$dentist==1,]$hardness)
hist(my_data[my_data$dentist==2,]$hardness)
hist(my_data[my_data$dentist==3,]$hardness)
hist(my_data[my_data$dentist==4,]$hardness)
hist(my_data[my_data$dentist==5,]$hardness)
shapiro.test(my_data[my_data$dentist==1,]$hardness)$p.value
shapiro.test(my_data[my_data$dentist==2,]$hardness)$p.value
shapiro.test(my_data[my_data$dentist==3,]$hardness)$p.value
shapiro.test(my_data[my_data$dentist==4,]$hardness)$p.value
shapiro.test(my_data[my_data$dentist==5,]$hardness)$p.value
# 4 are normal distribution and 1 is Not normal
leveneTest(my_data$hardness~my_data$dentist)
#variances are not equal
#both fail, so going for NonParametric
kd=kruskal.test(my_data$hardness~my_data$dentist)
summary(kd)
## ------
dataset$Age <- ifelse(is.na(dataset$Age),
ave(dataset$Age, FUN = function(x)
mean(x, na.rm = TRUE)),
dataset$Age)
## ---- Missing Value ----
## ===========================================================================
## EXPLORATORY DATA ANALYTICS
## ===========================================================================
## REFERENCES:
## An Introduction to Data Cleaning with R - Edwin de Jonge and Mark van der Loo
## https://cran.r-project.org/doc/contrib/de_Jonge+van_der_Loo-Introduction_to_data_cleaning_with_R.pdf
data = airquality
## ===========================================================================
## Univariate Analysis
## ===========================================================================
par(mfrow = c(2,1))
## ===========================================================================
## Bivariate Analysis
## ===========================================================================
plot(data)
## ===========================================================================
## EXPLORATORY DATA ANALYTICS - MISSING VALUES TREATMENT
## ===========================================================================
## Options Available:
## 1. Remove records having missing values
## 2. Impute values
## For now, we will omit Day and Month which are categorical variables
data1 = data[-c(5,6)]
summary(data1)
attach(data1)
## GUIDELINES:
## A safe maximum threshold for missing values in a particular column is 5%.
## If missing data for a column > 5%, we need to consider leaving out that
## variable
## OBSERVATIONS:
## Ozone has nearly 25% missing values
## OBSERVATIONS:
## Row 5 has 50% missing variables - will not be of much value
data1[5,]
## Keep only the rows with less than 30% missing values
low_miss_rows = data1[row_miss < 30,]
low_miss_rows
## NOTE: FOR THIS EXERCISE, WE ARE USING data1 DATASET WITH 153 ROWS -
## NOT low_miss_rows!!!!
data_imputes = mice(data1, m = 5, maxit = 7, seed = 500)
## m: number of imputed datasets to generate, maxit: max number of iterations
summary(data_imputes)
## Since only numeric variables had missing values, mice used the pmm method
## Now let us first examine the values mice determined for Ozone
data_imputes$imp$Ozone
## OBSERVATIONS:
## For Temp, Iterations 3 and 4 are the ones with most imputed values in the
## middle, which does not fit well with observed values - we can therefore
## ignore the 3rd imputed dataset
## Ref: https://blog.datascienceheroes.com/exploratory-data-analysis-data-preparation-with-funmodeling/
library(funModeling)
summary(data)
summary(imputed_data)
densityplot(data_imputes)
## OBSERVATIONS:
## Red lines - Density of imputed data for each imputed dataset
## Blue line - Density of observed data
## We expect the Red and Blue distributions to be similar
## - Ozone and Wind has similar patterns for Red and Blue lines
## - Temp has similar patterns for Red and Blue lines - However Observed data
##   (Blue) has more variation than some of the Imputed datasets
## - For Solar.R, imputed values for 4 datasets are close to Observed - Can
##   ignore the other imputed dataset
## ===========================================================================
## MISSING VALUE TREATMENT USING KNN METHOD FROM VIM PACKAGE
## ===========================================================================
library(VIM)
summary(data2)
plot_num(data[,1:4])
plot_num(data2)
## =====================================
## Working with Messy Data
## =====================================
## OBSERVATIONS:
## yearsmarried cannot be negative
## A 2 year old child cannot be married
## An 18 year old adult cannot be married for 20 years
## 221 year old married for 2 years???!!!
## 34 year old Child who is -7 ft tall??
library(editrules)
rule_violations
plot(rule_violations)
## OBSERVATIONS:
## - Two cases of Categorical violations involving Group and Status
##   - If status == 'married', group should be 'adult' or 'elderly'
##   - Rule violated in records 2 and 5
## - Two cases of Mixed Rules violations involving Status, YearsMarried and Age
## - If status == 'married', age - yrsMarried >= 17
## - Rule violated in records 2 and 3
## ===========================================================================
## WORKING WITH DIFFERENT UNITS
## ===========================================================================
name = c("A","B","C","D","E")
height = c(170.00,1.74, 70.00, 168.00, 5.91)
unit = c("cm","m","inch","cm","ft")
physical = data.frame(name,height,unit)
physical
library(deducorrect)
cor$corrected
## *****************************************
## Working with Dates - Also covered in Intro to R
## *****************************************
Sys.time()
class(Sys.time())
time.list = as.POSIXlt(Sys.time())
unlist(time.list)
y <- strptime("01/02/2018",format="%d/%m/%Y")
y
library(lubridate)
## Other limitations
dmy("15 Feb 2018")
## Error since POSIX standard expects Feb and not Febr
dmy("15 Febr 2018")
## ===========================================================================
## CHARACTER MANIPULATION USING stringr PACKAGE
## ===========================================================================
library(stringr)
## ===========================================================================
## Approximate String Matching
## ===========================================================================
## ---- PCA ----
library(nFactors)
attach(cs)
## Columns of cs used for the eigenvalue and principal-component steps below.
## "Price" is written on one line: the wrapped original split the name across
## a line break inside the string, so it no longer matched any column.
csi <- cs[, c("Sales", "CompPrice", "Income", "Advertising", "Population",
              "Price", "Age", "Education")]
csi
ev = eigen(cor(csi)) # get eigenvalues
ev
EigenValue=ev$values
EigenValue
Factor=c(1,2,3,4,5,6,7,8)
Scree=data.frame(Factor,EigenValue)
plot(Scree,main="Scree Plot", col="Blue",ylim=c(0,4))
lines(Scree,col="Red")
library(psych)
Unrotate=principal(csi, nfactors=3, rotate="none")
print(Unrotate,digits=3)
UnrotatedProfile=plot(Unrotate,row.names(Unrotate$loadings))
Rotate=principal(csi,nfactors=3,rotate="varimax")
print(Rotate,digits=3)
RotatedProfile=plot(Rotate,row.names(Rotate$loadings),cex=1.0)
## ---- Practice ----
library(plot3D)
data=read.csv("D:/sahubackup/GL/iris.csv")
head(data)
x <- sep.l <- iris$sepal.length
x
y <- pet.l <- iris$petal.length
z <- sep.w <- iris$sepal.width
scatter3D(x, y, z, clab = c("Sepal", "Width (cm)"))
scatter3D(x, y, z, bty = "f", colkey = FALSE, main ="bty= 'f'")
scatter3D(x, y, z, bty = "g", colkey = FALSE, main ="bty= 'g'")
# User defined
scatter3D(x, y, z, pch = 18, bty = "u", colkey = FALSE,
main ="bty= 'u'", col.panel ="steelblue", expand =0.4,
col.grid = "darkblue")
## NOTE(review): the lines below look like usage signatures kept for
## reference, not calls meant to run: "..." is only valid inside a function
## body, and no labels vector is defined in this script. They are commented
## out so the script can be sourced; supply real arguments before enabling.
# text3D(x, y, z, labels, colvar = NULL, add = FALSE)
# points3D(x, y, z, ...)
# lines3D(x, y, z, ...)
# text2D(x, y, labels, colvar = NULL, col = NULL, add = FALSE)
## This one only uses x and y, which are defined above, so it stays live.
scatter2D(x, y, colvar = NULL, col = NULL, add = FALSE)
library(ggplot2)
qplot(iris$sepal.length,iris$sepal.width, col="red",
fill="iris$sepal.width")
ggplot(iris, aes(x=sepal.length))+
geom_histogram(aes(y=iris$sepal.width), binwidth=5,
col="black",fill="red")
library(dplyr)
tbl_df(iris)
glimpse(iris)
View(iris)
iris %>%
group_by(variety) %>%
summarise(avg = mean(sepal.width)) %>%
arrange
library(tidyr)
gather(iris, "new", "n", 2:4)
slice(iris, 10:15)
summarise_each(iris, funs(mean))
count(iris, variety, wt = sepal.length)
summarise(iris, avg = mean(sepal.length))
group_by(iris, variety)
library(stringr)
str_detect(iris$variety, "z")
library(MASS)
data=Cars93
data
attach(data)
names(data)
ggplot(data, aes(x=Price))+
geom_bar(binwidth=5, col="red", fill="blue")
ggplot(data, aes(x=RPM))+
geom_histogram(aes(y=..density..), col="red", fill="Black")+
geom_density(alpha=.2, fill = "pink")
ggplot(data, aes(x=Weight))+
geom_histogram(aes(y=..density..),binwidth = 2, colour = "black",
fill = "white")+
geom_density(alpha=.2, fill = "pink")
ggplot(df) +
geom_segment(aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y, linetype = !xy_sign),
arrow = arrow(length = unit(0.25,"cm"))) +
coord_equal()
ggplot() +
geom_curve(data = df %>% filter(x_gt_y_equal_xy_sign),
aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y_equal_xy_sign),
curvature = 0.75, angle = -45,
arrow = arrow(length = unit(0.25,"cm"))) +
geom_curve(data = df %>% filter(!x_gt_y_equal_xy_sign),
aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y_equal_xy_sign),
curvature =-0.75, angle = 45,
arrow = arrow(length = unit(0.25,"cm"))) +
coord_equal() +
theme(legend.position = "bottom") +
xlim(-4, 4) + ylim(-4,4)
ggplot(df) +
geom_curve(aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y_equal_xy_sign),
curvature = 0.75, angle = -45,
arrow = arrow(length = unit(0.25,"cm"))) +
coord_equal() +
theme(legend.position = "bottom") +
xlim(-4, 4) + ylim(-4,4)
head(df2)
## Return the square of x (element-wise when x is a vector).
## The original returned `Square`, a name that is never defined (R is
## case-sensitive), so every call failed with "object 'Square' not found".
square.it <- function(x) {
  x * x
}
hist(c(3, 5, 10, 10, 11, 12, 12, 14, 14, 14, 19))
pnorm(100,0.03)
#Kaggle DS LR
train=read.csv("D:/sahubackup/GL/LR/Kaggle/train.csv")
train
test=read.csv("D:/sahubackup/GL/LR/Kaggle/test.csv")
test
library(ggplot2)
numberofNAs=length(which(is.na(train)==T))
if(numberofNAs>0)
{
cat('Number of missing values found: ', numberofNAs)
cat('\nRemoving missing values...')
train = train[complete.cases(train), ]
}
#par(mfrow=c(2,1), size=)
boxplot(train$x)
boxplot(test$x)
boxplot(train$x, main='X', sub=paste('Outliers: ',
boxplot.stats(train$x)$out))
boxplot(test$x, main='X', sub=paste('Outliers: ',
boxplot.stats(test$x)$out))
av=read.csv("D:/sahubackup/GL/LR/av.csv")
av
levels(av$group)
acgrp=ordered(av$group, levels=c("ctrl","trt1","trt2"))
acgrp
library(dplyr)
## Per-group row count, mean and standard deviation of weight.
## The closing parenthesis of mean() was misplaced, so sd = ... was passed to
## mean() as an argument and no sd column was produced.
grp <- group_by(av, group) %>%
  summarise(
    count = n(),
    mean = mean(weight, na.rm = TRUE),
    sd = sd(weight, na.rm = TRUE)
  )
grp=av$group
grp
factor.grp=factor(grp)
factor.grp
attach(av)
grp = c("Mon","Fri","Mon","Wed","Wed","Sat")
factor.wday = factor(wday)
factor.wday
library(ggplot2)
ggplot(av, aes(x=group, y=weight, fill=group))+
geom_boxplot(order = c("ctrl", "trt1", "trt2"))+
theme_classic() +
theme(legend.position = "none")
boxplot(av)
boxplot(weight~group)
av
avgrp
names(av)
## Simple Box Plot - Midsize has high variance
## (The note sits above the call: as a wrapped trailing comment its second
## half was parsed as code and broke the "+" chain.)
ggplot(av, aes(x = group, y = weight)) +
  geom_boxplot()
TukeyHSD(res.aov)
library(multcomp)
summary(glht(res.aov, linfct = mcp(group = "Tukey")))
pairwise.t.test(av$weight, av$group,
p.adjust.method = "BH")
library(car)
leveneTest(weight ~ group, data = av)
confint(SLM, "PerOcc")
## ---- Practice1 ----
install.packages("ggplot2")
library(lattice)
t.test(30, mu=0.29)
install.packages("olsrr")
pnorm(0.8,100,10,1)
## Print the Fibonacci numbers with indices 0 .. nterms - 1.
## nterms and recurse_fibonacci() are not defined in this part of the script
## and must exist before this runs.
if (is.na(nterms)) {
  print("Enter a positive integer")
} else {
  print("Fibonacci sequence:")
  ## seq_len() gives an empty sequence for nterms == 0, whereas
  ## 0:(nterms - 1) would count down through 0 and -1.
  for (i in seq_len(nterms) - 1) {
    print(recurse_fibonacci(i))
  }
}
ap=read.csv("D:/sahubackup/GL/AirPassengers.csv")
start=head(ap,1)
start
end=tail(ap,1)
end
library(tseries)
library(forecast)
library(XML)
## Wikipedia page whose HTML tables are read below. The address had a
## proxy host and path fused in front of it, which made it unusable.
u <- "http://en.wikipedia.org/wiki/World_population"
tables = readHTMLTable(u)
names(tables)
tables[[2]]
tmp = tables[[2]]
for (i in 1:nrow(genres2)) {
for (c in 1:ncol(genres2)) {
genmat_col = which(genre_matrix[1,] == genres2[i,c])
genre_matrix[i+1,genmat_col] <- 1
}
}
genre_matrix2 <- as.data.frame(genre_matrix[-1,],
stringsAsFactors=FALSE) #remove first row, which was the genre list
for (c in 1:ncol(genre_matrix2)) {
genre_matrix2[,c] <- as.integer(genre_matrix2[,c])
}
for(i in 1:5)
{
for(j in 1:2)
{
print(i*j);
}
}
movies=read.csv("D:/sahubackup/GL/ml-latest-small/movies.csv")
movies
df_genres=data.frame(movies[,3])
df_genres
## Number of titles. movies$title is a single column (a vector), for which
## nrow() returns NULL; length() gives the count.
i <- length(movies$title)
i
str(movies)
nr=nrow(movies)
nr
mat_mov_gen=data.matrix(movies,rownames.force = NA)
mat_mov_gen
genres <- as.data.frame(movies$genres, stringsAsFactors=FALSE)
genres
library(data.table)
genres2 <- as.data.frame(tstrsplit(genres[,1], '[|]',
type.convert=TRUE),
stringsAsFactors=FALSE)
genres2
colnames(genres2) <- c(1:10)
colnames(genres2)
## Build a 0/1 indicator table of genres per movie from genres2, which holds
## one row per movie and one genre name (or NA) per column.
genre_list <- c("Action", "Adventure", "Animation", "Children",
                "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
                "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
                "Sci-Fi", "Thriller", "War", "Western")
## Row 1 holds the genre names and rows 2..(n + 1) one indicator row per
## movie, so the size is taken from the data instead of the hard-coded
## 9743 x 18.
genre_matrix <- matrix(0, nrow(genres2) + 1, length(genre_list))
genre_matrix
## Assigning the names turns the matrix into a character matrix.
genre_matrix[1, ] <- genre_list
genre_matrix[1, ]
colnames(genre_matrix) <- genre_list
colnames(genre_matrix)
## seq_len() keeps the loops empty when genres2 has no rows or columns.
for (i in seq_len(nrow(genres2))) {
  for (c in seq_len(ncol(genres2))) {
    ## Column whose header equals this movie's c-th genre; empty when the
    ## cell is NA or not in genre_list, in which case nothing is assigned.
    genmat_col <- which(genre_matrix[1, ] == genres2[i, c])
    genre_matrix[i + 1, genmat_col] <- 1
  }
}
## Drop the header row (the genre names) and convert to a data frame.
genre_matrix2 <- as.data.frame(genre_matrix[-1, ],
                               stringsAsFactors = FALSE)
genre_matrix2
## The indicators are still text at this point; make them integers.
for (c in seq_len(ncol(genre_matrix2))) {
  genre_matrix2[, c] <- as.integer(genre_matrix2[, c])
}
years <- as.data.frame(movies$title, stringsAsFactors=FALSE)
library(data.table)
## Return the last n characters of each string in x.
substrRight <- function(x, n) {
  len <- nchar(x)
  substr(x, len - n + 1, len)
}
yt=movies$title
yt
class(yt)
ytc=as.character(yt)
ytc
class(ytc)
years <- as.data.frame(substr(substrRight(substrRight(ytc, 6),5),1,4))
years
search_matrix <- cbind(movies[,1], substr(movies[,2],1,nchar(ytc)-6),
years, genre_matrix2)
search_matrix
colnames(search_matrix) <- c("movieId", "title", "year", genre_list)
colnames(search_matrix)
write.csv(search_matrix, "search.csv")
search_matrix <- read.csv("search.csv", stringsAsFactors=FALSE)
search_matrix
subset(search_matrix, Action == 1 & year == 1995)$title
ratings=read.csv("D:/sahubackup/GL/ml-latest-small/ratings.csv")
links=read.csv("D:/sahubackup/GL/ml-latest-small/links.csv")
tags=read.csv("D:/sahubackup/GL/ml-latest-small/tags.csv")
## Recode the third column of a copy of ratings to +1 where the value is
## above 3 and -1 otherwise. This is done in one vectorised step rather than
## by assigning into the data frame one row at a time.
binaryratings <- ratings
binaryratings
binaryratings[, 3] <- ifelse(binaryratings[, 3] > 3, 1, -1)
for (i in 1:ncol(binaryratings2)){
binaryratings2[which(is.na(binaryratings2[,i]) == TRUE),i] <- 0
}
binaryratings2 = binaryratings2[,-1]
binaryratings2
for (c in 1:ncol(result)){
for (i in 1:nrow(result)){
if (result[i,c] < 0){
result[i,c] <- 0
}
else {
result[i,c] <- 1
}
}
}
library(reshape2)
# Create ratings matrix. Rows = userId, Columns = movieId
ratingmat <- dcast(ratings, userId ~ movieId,
  value.var = "rating",
  na.rm = FALSE
)
ratingmat <- as.matrix(ratingmat[, -1]) # remove userIds
# Install `registry` only when it is missing, and do so before the libraries
# are attached, instead of reinstalling it on every run after loading.
if (!requireNamespace("registry", quietly = TRUE)) {
  install.packages("registry")
}
library(recommenderlab)
library(registry)
# Convert the plain matrix into recommenderlab's rating-matrix class.
ratingmat <- as(ratingmat, "realRatingMatrix")
# Cosine similarity between the first four users (rows 1-4 of ratingmat),
# printed and drawn as a heat map.
similarity_users <- similarity(
  ratingmat[seq_len(4), ],
  which = "users",
  method = "cosine"
)
user_sim_matrix <- as.matrix(similarity_users)
user_sim_matrix
image(user_sim_matrix, main = "User similarity")
# Same comparison for the first four movies (columns 1-4 of ratingmat).
similarity_items <- similarity(
  ratingmat[, seq_len(4)],
  which = "items",
  method = "cosine"
)
item_sim_matrix <- as.matrix(similarity_items)
item_sim_matrix
image(item_sim_matrix, main = "Item similarity")
library(ggplot2)
# Number of ratings recorded for each movie (one count per column).
views_per_movie <- colCounts(ratingmat)
# Look up the titles of the first ten entries of recom_list[[1]] and store
# them in a 10 x 1 matrix.
recom_result <- matrix(0, 10)
for (rank in seq_len(10)) {
  wanted_id <- as.integer(recom_list[[1]][rank])
  # which() discards NA comparisons, as subset() does.
  hit_rows <- which(movies$movieId == wanted_id)
  recom_result[rank] <- as.character(movies$title[hit_rows])
}
# Evaluation:
# k = 5 means 5-fold cross-validation; given = 3 means a Given-3 protocol.
# (The explanation used to trail the call and wrapped onto a line of its
# own, where it was parsed as code and stopped the script.)
evaluation_scheme <- evaluationScheme(ratingmat,
  method = "cross-validation",
  k = 5, given = 3,
  goodRating = 5
)
# Evaluate user-based collaborative filtering for several list lengths.
evaluation_results <- evaluate(evaluation_scheme,
  method = "UBCF",
  n = c(1, 3, 5, 10, 15, 20)
)
# Confusion-matrix figures for the first fold.
eval_results <- getConfusionMatrix(evaluation_results)[[1]]
eval_results
## star output: prints "*" on its own line ten times
## (five outer passes of two prints each)
for (pass in seq_len(5 * 2)) {
  print("*")
}
# Load the air-passenger data, inspect it and turn it into a time series.
# findfrequency() comes from the forecast package, so attach it before use.
library(forecast)
ap <- read.csv("D:/sahubackup/GL/AirPassengers.csv")
start <- head(ap, 1)
start
end <- tail(ap, 1)
end
frequency(ap)
findfrequency(ap)
class(ap)
# The series runs from month 1 of 1949 to month 12 of 1960, i.e. twelve
# observations per year; frequency = 365 would stretch and recycle the data.
# NOTE(review): ts() converts every column of `ap` to numbers - confirm that
# the file holds only the passenger counts (a date column would need to be
# dropped first).
tsap <- ts(ap, start = c(1949, 1), end = c(1960, 12), frequency = 12)
tsap
class(tsap)
findfrequency(tsap)
frequency(tsap)
plot(ap)
# First 132 observations (11 years x 12 months). Observations are rows, so
# index rows; the column index `ap[, 1:132]` fails on a narrow table.
train <- ap[1:132, , drop = FALSE]
dim(ap)
library(caTools)
set.seed(123)
# Hold out the final 12 observations as the test set. A random
# sample.split() does not fit here: the rows must stay contiguous and in
# time order for ts(..., start = ...) below to date them correctly, and
# sample.split() expects a label vector rather than a whole data frame.
# Twelve matches the forecast horizon (h = 12) and the 1960 start used for
# the test series.
n_test <- 12
split <- seq_len(nrow(ap)) <= nrow(ap) - n_test
train <- subset(ap, split)
test <- subset(ap, !split)
train
test
# NOTE(review): assumes `ap` holds only the monthly passenger counts,
# starting January 1949 - confirm against the CSV.
train_ts <- ts(train, start = 1949, frequency = 12)
test_ts <- ts(test, start = 1960, frequency = 12)
train_ts
test_ts
library(ggplot2) # Data Visualisation
library(ggfortify) # Data Visualisation
library(forecast)
# decompose() and lines() need a ts object with a seasonal frequency, not
# the raw data frame.
ap_ts <- ts(ap, start = 1949, frequency = 12)
decomposedres <- decompose(ap_ts)
plot(decomposedres)
# Baseline: forecast the training mean for the next 12 months.
mean_baseline <- meanf(train_ts, h = 12)
plot(mean_baseline, type = "l")
lines(ap_ts)
# Score the forecast against the held-out series on the same time index.
accuracy(mean_baseline, test_ts)
# 12-month moving average of the training series, with the series on top.
sma <- ma(train_ts, order = 12)
plot(sma, xlim = c(1949, 1960), ylim = c(0, 600), col = "red")
lines(train_ts)
# Normal, Poisson and binomial probability look-ups.
pnorm(20, 0.1, 1)
pnorm(0.997, 0.996, 0.0033)
pnorm(15, 0.6, 15)
# P(40 < X < 50): subtract the lower bound from the upper one so the
# probability is not negative.
pnorm(50, 65.16, 10) - pnorm(40, 65.16, 10)
qnorm(0.99, mean = 65.16, sd = 10)
pnorm(0.998, 0.9563, 0.0189) - pnorm(0.997, 0.9563, 0.0189)
# P(at least one event) for Poisson(3).
1 - dpois(0, lambda = 3)
dpois(2, lambda = 3) + dpois(3, lambda = 3) + dpois(4, lambda = 3)
dpois(6, lambda = 4)
dpois(1, 0.15) + dpois(0, 0.15)
dpois(10, lambda = 10)
# The fourth positional argument of pnorm() is `lower.tail`; the stray 100
# that used to sit there was simply read as TRUE, so dropping it leaves the
# results unchanged.
# NOTE(review): if 100 was meant as a sample size, the sd should presumably
# be 0.02 / sqrt(100) - confirm the intended question.
pnorm(20.08, 20.05, 0.02) - pnorm(20.03, 20.05, 0.02)
pnorm(20.01, 20.05, 0.02)
pnorm(3.69, 3.25, 0.6) - pnorm(2.75, 3.25, 0.6)
dbinom(0, 3, 1 / 6)
dbinom(6, 9, 0.6)
dbinom(6, 10, 0.45)
pnorm(178000, 168000, 6324.55) - pnorm(158000, 168000, 6324.55)
# Load the food-nutrition table (this re-uses the name `ap`) and explore it
# with base graphics and dplyr.
ap <- read.csv("D:/sahubackup/GL/Food Nutrition.csv")
library(ggplot2)
# par(margin(5,5,1,5),cex.lab=1.2, cex.axis=0.9)
par(mfrow = c(1, 3))
# `fill` is not a base-graphics parameter (it only produced warnings), so
# it is no longer passed to plot().
plot(ap$Protein_.g., col = "blue")
barplot(ap$Carbohydrt_.g., col = "red")
pie(ap$Water_.g., main = "Piechart", radius = 1)
# which() drops rows whose water value is NA.
sub_data <- ap[which(ap$Water_.g. >= 30.0), ]
sub_data
library(dplyr)
# Group by the bare column name; `ap$Shrt_Desc` would add a duplicate
# grouping column literally named "ap$Shrt_Desc".
by_shr <- group_by(ap, Shrt_Desc)
by_shr
# NOTE: attach() is kept only because a matching detach(ap) appears later in
# the script; the calls in this section all name their data frame.
attach(ap)
sub_data1 <- subset(ap, Water_.g. > 30.0)
sub_data1
sub_data1_ord <- arrange(sub_data1, desc(Water_.g.))
sub_data1_ord
sub_data1_fil <- filter(sub_data1, Protein_.g. > 30, Lipid_Tot_.g. > 26)
sub_data1_fil
# Return the element-wise sum of `a` and `b`.
add <- function(a, b) {
  total <- a + b
  total
}
add(3, 4)
# Print i * j for i = 1..5 and j = 1..2, one value per line.
for (multiplier in seq_len(5)) {
  for (product in multiplier * seq_len(2)) {
    print(product)
  }
}
# Convert a temperature from degrees Fahrenheit to degrees Celsius.
# f: numeric vector of Fahrenheit temperatures.
# Returns a numeric vector of the same length in Celsius.
# The earlier expression (9/5)*(f+32) matched neither conversion direction;
# the offset has to be removed before scaling: C = (F - 32) * 5 / 9.
f_c <- function(f) {
  (f - 32) * 5 / 9
}
f_c(32)
# Print i + j for i = 1..2 and j = 101..110, one value per line.
for (offset in seq_len(2)) {
  for (total in offset + 101:110) {
    print(total)
  }
}
# Return the sum of the squares of `a` and `b` (element-wise).
sq <- function(a, b) {
  a_squared <- a * a
  b_squared <- b * b
  a_squared + b_squared
}
sq(3, 4)
# Print the first `nterms` Fibonacci numbers via recurse_fibonacci().
# NOTE(review): `nterms` and `recurse_fibonacci` are not defined in the
# visible part of this script - confirm they are set earlier.
# Zero and negative counts are rejected as well: 0:(nterms - 1) would
# otherwise count downwards (0, -1, ...) instead of doing nothing.
if (is.na(nterms) || nterms <= 0) {
  print("Enter a positive integer")
} else {
  print("Fibonacci sequence:")
  for (i in 0:(nterms - 1)) {
    print(recurse_fibonacci(i))
  }
}
# NOTE(review): the bare 3 below just prints 3; presumably it was the value
# typed at an interactive prompt - confirm and remove if so.
3
# Means of columns 3 to 5 of sub_data1: once as a list, once as a vector.
s1 <- lapply(sub_data1[, 3:5], mean)
s1
# vapply() pins the result to one number per column, whatever the input.
s2 <- vapply(sub_data1[, 3:5], mean, numeric(1))
s2
# Mean water content per Shrt_Desc value.
tapply(sub_data1$Water_.g., sub_data1$Shrt_Desc, mean)
# Install plot3D only when it is missing rather than on every run.
if (!requireNamespace("plot3D", quietly = TRUE)) {
  install.packages("plot3D")
}
library(plot3D)
detach(ap)
# Load the iris measurements and draw them with plot3D.
data <- read.csv("D:/sahubackup/GL/iris.csv")
head(data)
# Read the columns from the CSV just loaded. The built-in `iris` data set
# names its columns Sepal.Length etc., so `iris$sepal.length` is NULL.
# NOTE(review): assumes the CSV columns are sepal.length, petal.length and
# sepal.width - confirm against head(data).
x <- sep.l <- data$sepal.length
y <- pet.l <- data$petal.length
z <- sep.w <- data$sepal.width
# The plotting calls used to be usage signatures copied from the help pages
# (a literal `...` and an undefined `labels`), which cannot run as written.
# Row numbers are used as point labels.
point_labels <- as.character(seq_along(x))
scatter3D(x, y, z, colvar = z, col = NULL, add = FALSE)
text3D(x, y, z, labels = point_labels, colvar = NULL, add = FALSE)
points3D(x, y, z)
lines3D(x, y, z)
scatter2D(x, y, colvar = NULL, col = NULL, add = FALSE)
text2D(x, y, labels = point_labels, colvar = NULL, col = NULL, add = FALSE)
# Normal-distribution look-ups: tail probabilities and critical values.
# Upper-tail probability beyond 44 for N(mean 40, sd 3).
pnorm(44, mean = 40, sd = 3, lower.tail = FALSE)
# Quantile of the same distribution.
qnorm(0.9087, mean = 40, sd = 3)
# Standard-normal critical values.
qnorm(0.025, mean = 0, sd = 1)
qnorm(0.975)
qnorm(0.025)
qnorm(0.95, mean = 0, sd = 1)
qnorm(0.95, mean = 0, sd = 1, lower.tail = FALSE)
qnorm(0.005, mean = 0, sd = 1)
qnorm(0.01, mean = 0, sd = 1, lower.tail = FALSE)
qnorm(0.01, mean = 0, sd = 1)
qnorm(0.95, mean = 0, sd = 1, lower.tail = FALSE)
# Standard-normal tail probabilities.
pnorm(-1.25, mean = 0, sd = 1)
pnorm(2.5, mean = 0, sd = 1, lower.tail = FALSE)
pnorm(3.16, lower.tail = FALSE)
# First `len` Fibonacci numbers, seeded with 1, 1.
len <- 10
fibvals <- c(1, 1, numeric(len - 2))
for (pos in seq(3, len)) {
  # Each term is the sum of the two terms before it.
  fibvals[pos] <- sum(fibvals[c(pos - 2, pos - 1)])
}
# ------------------------------ SmartEDA ------------------------------
# (The divider used to be bare "/" characters, which R cannot parse.)
# Install each package only when it is missing; the first call was also
# misspelled as `nstall.packages`.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
library("ISLR")
if (!requireNamespace("SmartEDA", quietly = TRUE)) {
  install.packages("SmartEDA")
}
library("SmartEDA")
## Load sample dataset from ISLR package
Carseats <- ISLR::Carseats
# Overview of the data - Type = 1
ExpData(data = Carseats, type = 1)