R Note

The document reads in CSV data on insurance claims, selects specific features for analysis, cleans the data, generates summary statistics, and performs additional analysis, including grouping ages into buckets and generating frequency tables. It then saves the output to a CSV file and creates various plots of the data, including boxplots and scatter plots of features.

# Read data from CSV file

claim_data <- read.csv("D:/sahubackup/GL/Car-2.csv", stringsAsFactors = TRUE) ## keep strings as factors (the pre-R 4.0 default), since factor levels are edited below

## Look at the first few rows.


head(claim_data)

## Choose 5 features only to work on.


work_data <- claim_data[,c("AGE", "MSTATUS", "SEX", "EDUCATION", "RED_CAR")]

## Look at the structure


str(work_data)

## Look at the attributes of work_data$MSTATUS


attributes(work_data$MSTATUS)

## Correct the levels of MSTATUS


levels(work_data$MSTATUS) <- c("Yes", "No")

## Correct the levels using fct_collapse() from the forcats package


library(forcats)
work_data$SEX <- fct_collapse(work_data$SEX, F = c("z_F"))
work_data$SEX
## Now the structure of the data is correct.
## Let's look at the summary statistics
summary(work_data)

## Let's find out how many people aged under 18 are filing claims
work_data[ which(work_data$AGE < 18), ]
work_data
## Let's do some more analysis
## Group the age into buckets
## Add a new variable agegroup with these buckets
work_data$agegroup <- cut(work_data$AGE,
                          breaks = c(0, 35, 50, 100),
                          labels = c("less than 35", "35 to 50", "more than 50"))
work_data$agegroup
## Now see the summary data with the new field added
summary(work_data)

## Generate the frequency tables of RED_CAR and MSTATUS for agegroup


red_car_stats <- table(work_data$agegroup, work_data$RED_CAR)
red_car_stats
mstatus_stats <- table(work_data$agegroup, work_data$MSTATUS)
mstatus_stats
total_cars <- table(work_data$agegroup)
total_cars
## Combine the RED_CAR and MSTATUS into a dataframe
output <- cbind(total_cars, red_car_stats[,2], mstatus_stats[,1])
output <- data.frame(output)

## Check the attribute of the output variable


attributes(output)

## Update the column names of the features


colnames(output) <- c("Total_Cars", "Red_Cars", "Marital_Status")

## Print the output


output

output$red_car_percent <- (output$Red_Cars/output$Total_Cars * 100)


output$red_car_percent <- round(output$red_car_percent, 2)

## Print the output


output

## We can save the output as CSV


write.csv(output, "output.csv")
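## write.csv adds row names as an unnamed first column by default;
## row.names = FALSE keeps the file to just the named columns:
write.csv(output, "output.csv", row.names = FALSE)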

## boxplot for one variable (work_data$AGE)


## The boxplot marks outliers as individual points
boxplot(work_data$AGE)
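## boxplot.stats() lists the same outliers numerically:
boxplot.stats(work_data$AGE)$out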

## Plot more than one variable


plot(work_data$SEX, work_data$AGE)
plot(work_data$EDUCATION, work_data$AGE)

## Example of ggplot
library(ggplot2)
ggplot(data = work_data, mapping = aes(x = SEX, y = AGE)) +
  geom_boxplot(aes(colour = EDUCATION), outlier.colour = "red")

## Uber data analysis


uber=read.csv("D:/sahubackup/GL/Uber Dataset.csv")
uber
dim(uber)
anyNA(uber)
sum(is.na(uber))
sapply(uber, function(x) sum(is.na(x)))
uber$borough = as.factor(replace(as.character(uber$borough), is.na(uber$borough), "Unknown"))
table(uber$borough)
install.packages("lubridate")
library(lubridate)
uber$start_date = strptime(uber$pickup_dt,'%Y-%m-%d %H:%M')
uber$start_date
uber$start_month = month(uber$start_date)
uber$start_month
uber$start_day = day(uber$start_date)
uber$start_hour = hour(uber$start_date)
uber$wday = weekdays(uber$start_date)
uber = uber[,-14] ## drop the POSIXlt start_date column
uber
unique(uber[which(uber$hday=="Y"),c("start_day","start_month")])
table(uber$hday) ## count of holiday vs non-holiday records
table(uber$hday,uber$start_month)
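## Share of records falling on holidays per month (column proportions, a quick sketch):
round(prop.table(table(uber$hday, uber$start_month), margin = 2), 2)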
names(uber)
## Uni-Variate Analysis
boxplot(uber$spd)
hist(uber$spd)
unique(uber[, c('start_month', 'start_day')]) ## unique() on a data frame takes no by argument
plot(aggregate(pickups~hday,data=uber, mean), type="b")

## Bi-Variate analysis
library(corrplot)
corrplot(cor(uber[,4:12]))
plot(uber$spd, uber$pickups, xlab = "speed", ylab = "pickup", main = "pickup vs speed")
abline(lm(uber$pickups~uber$spd))
plot(aggregate(pickups~start_month,data=uber, sum), type="b")
library(dplyr)   ## for filter() and %>%
library(ggplot2)
uber %>%
  filter(., start_month != 2) %>%
  ggplot(aes(x=start_day, y=pickups)) + geom_bar(stat='identity')
plot(aggregate(pickups~start_hour,data=uber, sum), type="b")
ggplot(aes(x = reorder(wday, pickups), y = pickups), data = uber) +
geom_bar(aes(fill=pickups), width=0.5, stat = "identity") +
coord_flip()
ggplot(uber, aes(start_hour, pickups)) +
geom_jitter(alpha = 0.3, aes(colour = borough)) +
geom_smooth(aes(color = borough))
ggplot(uber, aes(start_hour, borough)) +
geom_jitter( alpha = 0.4, aes(color = pcp24 > 0)) +
geom_smooth(aes(color = pcp24 > 0))

## Coffee
install.packages("reshape", type="source")
install.packages("reshape2", type="source")
library(reshape)
library(reshape2)
library(ggplot2)
library(plyr)
library(grid)
install.packages("gridExtra")
library(gridExtra)

coffee=read.csv("D:/sahubackup/GL/Coffee-1.csv")
coffee
dim(coffee)
attach(coffee)
library(lattice)   ## histogram() comes from lattice
histogram(Days_between_Purchase)

count_Brand<-count(coffee$Brand)
count_Brand
data_num <- as.data.frame(apply(coffee, 2, as.numeric))
data_num
ggplot(count_Brand, aes(x, freq)) + geom_bar(stat = "identity")  ## plyr::count() names its columns x and freq

library(corrplot)
corcoffee=cor(data_num)  ## cor() needs numeric columns, so use the converted copy
corcoffee
corrplot(corcoffee)
cor(Price_per_Packet,Income)
library(ggplot2)
ggplot(coffee, aes(x = Days_between_Purchase)) + geom_density()

par(mfrow=c(3,2))
brand <- count(coffee$Brand)      ## frequency tables for the barplots below
edu <- count(coffee$Education)    ## assumes an Education column in the data
barplot(brand$freq,names.arg=brand$x,main="BRAND")
barplot(edu$freq,names.arg=edu$x,main="Education")

cbind(count(coffee$sec), (count(coffee$sec))/sum(count(coffee$sec)$freq))

cast(count(coffee[,c("sec","price_per_packet")], c("sec", "price_per_packet")), sec ~ price_per_packet)
coffee_new<-subset(coffee,Days_between_Purchase>1)
summary(coffee_new$Days_between_Purchase)
ggplot(coffee_new,aes(Brand,Days_between_Purchase))+geom_boxplot()

cost<-coffee[,c("Brand","Price_per_Packet")]
cost<-count(cost, c("Brand", "Price_per_Packet"))
cost<-cast(cost,Brand ~ Price_per_Packet)


qplot(coffee$Days_between_Purchase,data=coffee)

ggplot(coffee,aes(Brand,Days_between_Purchase))+geom_boxplot()

quantile(coffee$Days_between_Purchase)
q1<-6    ## 25th percentile from the quantile() output above
q3<-17   ## 75th percentile
iqr<-q3-q1
upper<-q3+(iqr*1.5)
upper
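## The same upper fence computed directly from quantile() and IQR():
q <- quantile(coffee$Days_between_Purchase)
unname(q["75%"] + 1.5 * IQR(coffee$Days_between_Purchase))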

coffee_new<-subset(coffee,Days_between_Purchase<upper)
summary(coffee_new$Days_between_Purchase)

ggplot(coffee_new,aes(x=Brand,y=Days_between_Purchase))+geom_boxplot()

## Frequency tables for the remaining demographics (Age and Price_Conscious column names assumed)
age <- count(coffee$Age)
sec <- count(coffee$sec)
income <- count(coffee$Income)
price_cons <- count(coffee$Price_Conscious)
barplot(age$freq,names.arg=age$x,main="Age")
barplot(sec$freq,names.arg=sec$x,main="Social Economic Status")
barplot(income$freq,names.arg=income$x,main="Income")
barplot(price_cons$freq,names.arg=price_cons$x,main="Price Conscious")

/////////////HeartDisease/////////////
install.packages("car")
install.packages("tidyr")
install.packages("caret")
install.packages("broom")
install.packages("ROCR")
library(ggplot2)
library(car)
library(dplyr)
library(lattice)
library(tidyr)
library(caret)
library(MASS)
library(broom)
library(ROCR)

heart_data <- read.csv("D:/sahubackup/GL/heart.csv")


heart_data
str(heart_data)
head(heart_data)
names(heart_data)

heart_data$sex <- as.character(heart_data$sex)


heart_data$sex <- ifelse(heart_data$sex=="0",'female','male')
heart_data$sex

heart_data$cp <- factor(heart_data$cp)


heart_data$cp
heart_data$age
heart_data$chol
summary(heart_data)
attach(heart_data)
class(heart_data$cp)
class(heart_data$age)
ggplot(heart_data, aes(x=cp, y=age)) + geom_boxplot()
ggplot(heart_data, aes(x=cp, y=chol)) + geom_boxplot()
ggplot(heart_data, aes(x=cp, y=chol, fill=sex)) + geom_bar(stat = "identity")
histogram(~ age, data = heart_data) ## lattice's histogram() has no binwidth argument
ggplot(heart_data, aes(x = age)) + geom_density(col="red")

ggplot(heart_data, aes(x = age)) + geom_histogram(bins = 30, fill = "dodgerblue4") +
  theme_bw() + theme_classic() + ggtitle("age distribution") + ylab("number of people")
ggplot(heart_data, aes(x = age)) + geom_density(fill = "dodgerblue4") +
  ggtitle("age distribution") + ylab("number of people")
boxplot(heart_data$age, main = "boxplot of age for normality check", col = "dodgerblue4")
qqPlot(heart_data$age,main ="normality check for age",grid = F)
#sex

ggplot(heart_data, aes(x = sex)) + geom_bar(width = 0.2, fill = "green") +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.5) +
  theme_bw() + theme_classic() + ylab("number of count")

table(heart_data$cp) ## chest pain is the cp column in heart.csv
ggplot(heart_data,aes(x = cp)) +
geom_bar(width =0.2,fill ="red") +
geom_text(stat = 'count',aes(label =..count..),vjust = -0.5)
#rest_bp
class(trestbps)
ggplot(heart_data, aes(x = "", y = trestbps)) +
  geom_boxplot(colour = "dodgerblue2") +
  ggtitle("boxplot of rest_bp")   ## colour and the title belong outside aes()
ggplot(heart_data, aes(x = cp, y = trestbps)) +
  geom_boxplot(colour = "dodgerblue2") +
  ggtitle("boxplot of rest_bp by cp")
ggplot(heart_data, aes(trestbps)) + geom_histogram(bins = 20, fill = "green") +
  theme_bw() + theme_classic() + ggtitle("resp_bp")
ggplot(heart_data, aes(trestbps)) + geom_density(fill = "dodgerblue4") +
  theme_bw() + theme_classic() + ggtitle("density plot of resp_bp")

#chol
ggplot(heart_data, aes(x = "", y = chol)) +
  geom_boxplot(colour = "dodgerblue2") +
  ggtitle("boxplot of chol")
ggplot(heart_data,aes(chol)) +
geom_histogram(bins =20,fill ="green") +
ggtitle("chol")
ggplot(heart_data,aes(chol)) +
geom_density(fill ="dodgerblue4") +
theme_bw() + theme_classic()+ggtitle("density plot of chol")
## fbs is the fasting blood sugar column in heart.csv; title.center is replaced with the equivalent theme() call
table(heart_data$fbs)
ggplot(heart_data, aes(x = factor(fbs))) + geom_bar(width = 0.1, fill = "green") +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.5) +
  theme_bw() + theme_classic() + ylab("number of count") + ggtitle("blood sugar") +
  theme(plot.title = element_text(hjust = 0.5))
ggplot(heart_data, aes(factor(fbs))) + geom_bar(width = 0.2, fill = "dodgerblue4") +
  theme_bw() + theme_classic() +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.2) +
  ggtitle("barplot of fasting blood sugar") +
  theme(plot.title = element_text(hjust = 0.5))

## rest_ecg is named restecg in heart.csv
ggplot(heart_data, aes(factor(restecg))) + geom_bar(width = 0.2, fill = "dodgerblue4") +
  theme_bw() + theme_classic() +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.2) +
  ggtitle("barplot of rest_ecg") +
  theme(plot.title = element_text(hjust = 0.5))

#max heart-rate (named thalach in heart.csv)
ggplot(heart_data, aes(thalach)) + geom_histogram(fill = "dodgerblue4", alpha = 0.5) +
  theme_bw() + theme_classic()
ggplot(heart_data, aes(thalach)) + geom_density(fill = "red", alpha = 0.5) +
  theme_bw() + theme_classic()
boxplot(heart_data$thalach, col = "lightblue", notch = T,
        main = "boxplot of the maximum heart rate")

## exercise angina is named exang in heart.csv
ggplot(heart_data, aes(factor(exang))) + geom_bar(width = 0.2, fill = "dodgerblue4") +
  theme_bw() + theme_classic() +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.2) +
  ggtitle("barplot of exercise angina")

ggplot(heart_data, aes(factor(slope))) + geom_bar(width = 0.2, fill = "dodgerblue4") +
  theme_bw() + theme_classic() +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.2) +
  ggtitle("barplot of slope")

ggplot(heart_data, aes(factor(thal))) + geom_bar(width = 0.2, fill = "dodgerblue4") +
  theme_bw() + theme_classic() +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.2) +
  ggtitle("barplot of thal")

##Correlation
heart_data$sex
class(heart_data$sex)
str(heart_data)
heart_data_wcp = heart_data[, sapply(heart_data, is.numeric)] ## keep numeric columns only; cor() cannot handle sex (character) or cp (factor)
heart_data_wcp
str(heart_data_wcp)
cor(heart_data_wcp)
install.packages("corrplot")
library(corrplot)
corl=cor(heart_data_wcp)
corl
corrplot(corl, method = "pie", type = "lower")
round(table(heart_data$target)/nrow(heart_data), digits = 2) ## So in our dataset 54% have heart disease while the rest do not.
##MR
library(caTools)
set.seed(123)
split = sample.split(heart_data$target, SplitRatio = 0.8)
training_set = subset(heart_data, split == TRUE)
training_set
test_set = subset(heart_data, split == FALSE)
test_set

regressor = lm(formula = target ~ ., data = training_set)
summary(regressor)

regressor2 = lm(formula = target ~ cp, data = training_set)
summary(regressor2)

regressor2 = lm(formula = target ~ ca + cp, data = training_set)
summary(regressor2)

regressor3 = lm(formula = target ~ cp, data = training_set)
summary(regressor3)

# Predicting the Test set results


y_pred = predict(regressor3, newdata = test_set)
y_pred
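## A quick check of fit on the test data (RMSE of the linear model, a sketch):
sqrt(mean((test_set$target - y_pred)^2))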

library(ggplot2)
ggplot() +
  geom_point(aes(x = training_set$cp, y = training_set$target), colour = 'red') +
  geom_line(aes(x = training_set$cp, y = predict(regressor3, newdata = training_set), group = 1),
            colour = 'blue') +   ## group = 1 so a line is drawn across the factor levels
  ggtitle('cp vs target (Training set)') +
  xlab('cp') +
  ylab('target')

library(ggplot2)
ggplot() +
  geom_point(aes(x = test_set$cp, y = test_set$target), colour = 'red') +
  geom_line(aes(x = training_set$cp, y = predict(regressor3, newdata = training_set), group = 1),
            colour = 'blue') +
  ggtitle('cp vs target (Test set)') +
  xlab('cp') +
  ylab('target')

# Fitting Random Forest Regression to the dataset


install.packages('randomForest')
library(randomForest)
set.seed(1234)
class(heart_data$target)
heart_data$target <- as.factor(heart_data$target)
class(heart_data$target)

sample.index <- sample(2, nrow(heart_data), replace = T, prob = c(0.6, 0.4))

heart.train_RF <- heart_data[sample.index == 1,]
heart.test_RF <- heart_data[sample.index == 2,]
heart.train_RF
heart.test_RF
regressor = randomForest(target ~ sex, data = heart_data,
                         ntree = 10) ## formula interface: heart_data[2] is the character sex column, which the x/y interface cannot handle
attributes <- names(heart_data)
attributes <- attributes[!attributes %in% c("target")]
attributes1 <- paste(attributes, collapse = "+") # saves the column names separated by a plus sign
formula.rf <- as.formula(paste("target", attributes1, sep = " ~ "))
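## Sanity check (the assembled formula should read target ~ age + sex + ...):
formula.rf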

model.rf <- randomForest(formula.rf, heart.train_RF, ntree = 1000, importance = TRUE)
plot(model.rf)

# Variable Importance Table


var.imp <- data.frame(importance(model.rf, type=2))
var.imp
# make row names as columns
var.imp$Variables <- row.names(var.imp)
var.imp[order(var.imp$IncNodePurity, decreasing = T),]

# Predicting a new result with Random Forest Regression
# NOTE: the remainder of this block is template code for a Position_Salaries-style
# dataset (columns Level and Salary); it will not run against heart_data as-is.


y_pred = predict(regressor, data.frame(Level = 6.5))

# Visualising the Random Forest Regression results (higher resolution)

library(ggplot2)
x_grid = seq(min(dataset$Level), max(dataset$Level), 0.01)
ggplot() +
geom_point(aes(x = dataset$Level, y = dataset$Salary),
colour = 'red') +
  geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),
            colour = 'blue') +
ggtitle('Truth or Bluff (Random Forest Regression)') +
xlab('Level') +
ylab('Salary')

full.mod <- glm(target ~ ., data = training_set, family = binomial)
summary(full.mod)
#checking the model accuracy
prob <- full.mod %>% predict(test_set,type ="response")
predicted.class1 <- ifelse(prob>0.5,1,0)
mean(predicted.class1==test_set$target)
#accuracy =0.85
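## The same comparison laid out as a confusion matrix (a quick sketch):
table(predicted = predicted.class1, actual = test_set$target)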
#stepwise logistic regression in R
step.model <- full.mod %>% stepAIC(trace = F)
summary(step.model)
prob.step <- step.model %>% predict(test_set,type ="response")
predicted.class2 <- ifelse(prob.step>0.5,1,0)
mean(predicted.class2==test_set$target)

model_check <- glm(target ~ ., data = heart_data, family = binomial)
prob.check <- predict(model_check, type = "response")
my_data <- heart_data %>% select_if(is.numeric)
predictors <- colnames(my_data)
my_data <- my_data %>% mutate(logit = log(prob.check/(1-prob.check))) %>%
  gather(key = "predictors", value = "predicted.value", -logit)

#plotting the graph for checking linearity


ggplot(my_data, aes(x = logit, y = predicted.value)) + geom_point() +
  geom_smooth(method = "loess") +
  theme_classic() + theme_bw() + facet_wrap(~predictors, scale = "free_y")

/////////////////// CarSeats//////////////////////
rm(list = ls())
install.packages("MASS")
install.packages("psych")
install.packages("Boruta")
library(Boruta)
################################## Cleaning the R Environment and Loading Libraries ##################################
rm(list = ls())
setwd("C:/OldStuff/")
wants <- c("readxl", "plyr", "data.table", "dplyr", "tidyr",
"stringi", "stringr", "lubridate",
"BLPestimatoR", "dummies", "zoo", "car", "caret", "MASS",
"caTools", "Boruta",
"lmtest")
has <- wants %in% rownames(installed.packages())
if(any(!has)) install.packages(wants[!has])
lapply(wants, require, character.only = TRUE)
rm("wants","has")

carSeats <- read.csv("Carseats.csv")

## User defined functions


calculateRMSE <- function(df, actual, predicted)
{
MSE <- sum((df[, actual] - df[, predicted])^2)/nrow(df)
RMSE <- round(sqrt(MSE), 3)
RMSE
}

## Variable Selection using Stepwise Regression


variableSelectionUsingStepWiseRegression <- function(modelData, targetVar) {
  baseFormula <- as.formula(paste0(targetVar, " ~ 1"))
  base.mod <- lm(baseFormula, data = modelData)  # base intercept-only model
  allFormula <- as.formula(paste0(targetVar, " ~ ."))
  all.mod <- lm(allFormula, data = modelData)    # full model with all predictors
  stepMod <- step(base.mod, scope = list(lower = base.mod, upper = all.mod),
                  direction = "both", trace = 1, steps = 1000)  # perform the stepwise algorithm
  shortlistedVars <- setdiff(names(stepMod$model), targetVar)
  shortlistedVars
}

calculateRSquared <- function(model, y) {


#y <- as.numeric(model$x)
y <- y[!is.na(y)]
id <- which(!is.na(y))
moy <- mean(y)
N <- length(y)
fittedVal <- y[id] - as.numeric(model$residuals)[id]
p <- length(model$coefficients)-1
SSres <- sum((y-fittedVal)^2, na.rm=TRUE)
SStot <-sum((y-moy)^2, na.rm=TRUE)
RSquared <- round((1-(SSres/SStot)), 4)
AdjustedRsquared <- round((1-(((1-RSquared)*(N-1))/(N-p-1))), 4)
modelAIC <- model$aic
return(data.frame(RSquared, AdjustedRsquared))
}

## Plotting a Histogram on the Age variable to decide on the number of Bins


Age <- carSeats$Age
hist_Age <- hist(Age, breaks = 5, xlim = c(min(Age), max(Age)), col = "Steelblue3", right = F)

# Inference:- We can see that there is a steep jump from the first bin to the second bin,
# third to fourth bin and fourth to fifth bin.
# Therefore we will bin the Age variable into 4 bins: 0-29, 30-49, 50-59, 60+

## Binning the Age variable as per the inference from the histogram
carSeats$ageGroup <- cut(carSeats$Age, breaks = c(0, 29, 49, 59, 1000),
                         labels = c("lessthan30", "30to49", "50to59", "60+"))
carSeats$ageGroup <- as.character(carSeats$ageGroup)

## Log Transformation of Population


carSeats$log_Population <- log(carSeats$Population)

## Making education as a factor variable


carSeats$Education <- as.factor(carSeats$Education)

## Keeping only relevant variables in the data


delVars <- c("Population", "Age") # deleting Population and Age since we are using transformations of them
carSeats <- carSeats[, !colnames(carSeats) %in% delVars]

## Variable Selection using Boruta, which is based on the Random Forest Algorithm

set.seed(456)
boruta <- Boruta(Sales~., data = carSeats, doTrace = 2)
print(boruta)
vars_Boruta <- getSelectedAttributes(boruta, withTentative = F)
vars_Boruta

## Variable Selection using Stepwise Regression


vars_Stepwise <- variableSelectionUsingStepWiseRegression(carSeats, "Sales")
vars_Stepwise

# If we compare the results from Boruta and Stepwise, we can see that there are 7
# common variables chosen by both methods. The Stepwise method chooses one
# additional variable.

## Splitting the data into Train and Test


sample_size <- floor(0.8 * nrow(carSeats))
set.seed(123)
trainIndicator <- sample(seq_len(nrow(carSeats)), size = sample_size)
trainData <- carSeats[trainIndicator, ]
testData <- carSeats[-trainIndicator, ]

## Approach 1: Model Fitting using the variables selected by the Stepwise method

targetVar <- "Sales"
Formula <- as.formula(paste0(targetVar, " ~ ", paste(vars_Stepwise, collapse = " + ")))
Fit_Stepwise <- lm(Formula, data = trainData)
summary(Fit_Stepwise)

# Checking for Multicollinearity using VIF


vif(Fit_Stepwise) # GVIF<2 suggests that the model doesn't suffer from Multicollinearity

# Checking for Heteroscedasticity using BP Test


bptest(Fit_Stepwise) # p-value>0.05 suggests that the residuals are Homoscedastic

# Checking for Auto Correlation


dwtest(Fit_Stepwise) # DW ~ 2 suggests that there is not much auto-correlation among error terms

# Checking variable importance


varImportance_Stepwise <- varImp(Fit_Stepwise)
varImportance_Stepwise$Vars <- rownames(varImportance_Stepwise)
rownames(varImportance_Stepwise) <- NULL
varImportance_Stepwise$Overall <- round(varImportance_Stepwise$Overall, 2)
varImportance_Stepwise <- varImportance_Stepwise[order(-varImportance_Stepwise$Overall), c("Vars", "Overall")]

## Approach 2: Model Fitting using the variables selected by the Boruta method


targetVar <- "Sales"
Formula <- as.formula(paste0(targetVar, " ~ ", paste(vars_Boruta, collapse = " + ")))
Fit_Boruta <- lm(Formula, data = trainData)
summary(Fit_Boruta)

# Checking for Multicollinearity using VIF


vif(Fit_Boruta) # GVIF<2 suggests that the model doesn't suffer from Multicollinearity

# Checking for Heteroscedasticity using BP Test


bptest(Fit_Boruta) # p-value>0.05 suggests that the residuals are Homoscedastic

# Checking for Auto Correlation


dwtest(Fit_Boruta) # DW ~ 2 suggests that there is not much auto-correlation among error terms

# Checking variable importance


varImportance_Boruta <- varImp(Fit_Boruta)
varImportance_Boruta$Vars <- rownames(varImportance_Boruta)
rownames(varImportance_Boruta) <- NULL
varImportance_Boruta$Overall <- round(varImportance_Boruta$Overall, 2)
varImportance_Boruta <- varImportance_Boruta[order(-varImportance_Boruta$Overall), c("Vars", "Overall")]

## Final Model: Fitting using only significant variables


targetVar <- "Sales"
vars <- c("CompPrice", "Income", "Advertising", "Price", "ShelveLoc", "ageGroup")
Formula <- as.formula(paste0(targetVar, " ~ ", paste(vars, collapse = " + ")))
Fit <- lm(Formula, data = trainData)
summary(Fit)

# Checking for Multicollinearity using VIF


vif(Fit) # GVIF<2 suggests that the model doesn't suffer from Multicollinearity

# Checking for Heteroscedasticity using BP Test


bptest(Fit) # p-value>0.05 suggests that the residuals are Homoscedastic

# Checking for Auto Correlation


dwtest(Fit) # DW ~ 2 suggests that there is not much auto-correlation among error terms

# Checking variable importance


varImportance <- varImp(Fit)
varImportance$Vars <- rownames(varImportance)
rownames(varImportance) <- NULL
varImportance$Overall <- round(varImportance$Overall, 2)
varImportance <- varImportance[order(-varImportance$Overall), c("Vars", "Overall")]

## Checking for RMSE on the Train Data


trainData$Sales_Pred_Stepwise <- fitted(Fit_Stepwise)
trainData$Sales_Pred_Boruta <- fitted(Fit_Boruta)
trainData$Sales_Pred_FinalModel <- fitted(Fit)

trainRMSE_Stepwise <- calculateRMSE(trainData, "Sales", "Sales_Pred_Stepwise")
trainRMSE_Boruta <- calculateRMSE(trainData, "Sales", "Sales_Pred_Boruta")
trainRMSE_FinalModel <- calculateRMSE(trainData, "Sales", "Sales_Pred_FinalModel")

## Prediction on the test data using all 3 models


pred_Stepwise <- predict(Fit_Stepwise, testData)
testData$Sales_Pred_Stepwise <- pred_Stepwise

pred_Boruta <- predict(Fit_Boruta, testData)
testData$Sales_Pred_Boruta <- pred_Boruta
pred_FinalModel <- predict(Fit, testData)
testData$Sales_Pred_FinalModel <- pred_FinalModel

testRMSE_Stepwise <- calculateRMSE(testData, "Sales", "Sales_Pred_Stepwise")
testRMSE_Boruta <- calculateRMSE(testData, "Sales", "Sales_Pred_Boruta")
testRMSE_FinalModel <- calculateRMSE(testData, "Sales", "Sales_Pred_FinalModel")

## Preparing Model Diagnostics Output for Comparison


outputData_Stepwise <- calculateRSquared(Fit_Stepwise, trainData[, targetVar])
outputData_Stepwise$Train_RMSE <- trainRMSE_Stepwise
outputData_Stepwise$Test_RMSE <- testRMSE_Stepwise
outputData_Stepwise$VarSelectionMethod <- "Stepwise"

outputData_Boruta <- calculateRSquared(Fit_Boruta, trainData[, targetVar])
outputData_Boruta$Train_RMSE <- trainRMSE_Boruta
outputData_Boruta$Test_RMSE <- testRMSE_Boruta
outputData_Boruta$VarSelectionMethod <- "Boruta"

outputData_FinalModel <- calculateRSquared(Fit, trainData[, targetVar])
outputData_FinalModel$Train_RMSE <- trainRMSE_FinalModel
outputData_FinalModel$Test_RMSE <- testRMSE_FinalModel
outputData_FinalModel$VarSelectionMethod <- "FinalModel"

outputData <- rbind(outputData_FinalModel, outputData_Stepwise, outputData_Boruta)
rm(outputData_Stepwise, outputData_Boruta, outputData_FinalModel)

///////////////// CardioFitness////////////
r<-"hello"
r
w="C://Users//00002998//CardioGoodFitness.csv"
ds=read.csv(w, header=TRUE)
ds
library(caTools)
set.seed(123)
split=sample.split(ds$Income, SplitRatio = 2/3)
training_set=subset(ds,split==TRUE)
test_set=subset(ds,split==FALSE)
training_set
dim(ds)
dim(training_set)
names(ds)
str(ds)
ds[1:10,]
ds[1:10,"Product"]
summary(ds)
table(ds$Product)
table(ds$Product, ds$Gender)
boxplot(ds$Age~ds$Product, horizontal=TRUE, col=c("Green","Red"))
attach(ds)
table(Product, Gender)
summary(ds)
by(ds, INDICES=Product, FUN=summary)
install.packages("rpivotTable")
library(rpivotTable)
rpivotTable(ds)
library(lattice)
histogram(~Miles|factor(Product),data=ds)
cor(Miles,Usage)
Model=lm(Miles~Usage, data=ds)
summary(Model)
pie(table(Product, Gender))
pie(table(Product))
plot(density(Income))
file="C://Users//00002998//CardioGoodFitness.csv"
MyDataset=read.csv(file,header=TRUE)
MyDataset
plot(Income, Gender)
plot(Income, Product)
plot(ds)
install.packages("party")
library("party")
ds_ctree=ctree(Product ~ ., data = ds)   ## the original call was incomplete; assuming a fit on all predictors
file="C://Users//00002998//iris_flowers_new.csv"
iris=read.csv(file,1)
iris
## Column names follow iris_flowers_new.csv (lowercase, with a 'flower' label column)
iris_ctree <- ctree(flower ~ sepal.length + sepal.width + petal.length, data=iris)
iris_ctree
plot(iris_ctree)
attach(iris)
plot(jitter(sepal.length), jitter(sepal.width))
smoothScatter(sepal.length, sepal.width)
distMatrix <- as.matrix(dist(iris[,1:4]))
heatmap(distMatrix)
pdf("iris.pdf")           ## open a PDF graphics device
heatmap(distMatrix)
dev.off()                 ## close the device so the file is written
set.seed(1234)
split=sample.split(iris$flower, SplitRatio = 2/3)   ## sample.split expects the label vector
training_set=subset(iris,split==TRUE)
test_set=subset(iris,split==FALSE)
training_set
myFormula=flower ~ sepal.length + sepal.width + petal.length
myf_ctree=ctree(myFormula, data=training_set)
table(predict(myf_ctree), training_set$flower)
print(myf_ctree)
plot(myf_ctree)
plot(myf_ctree,type="simple")
test_pred=predict(myf_ctree, newdata = test_set)
table(test_pred, test_set$flower)
setwd ("C:/Users/00002998")
setwd ("C:/Users/00002998/R Programming")
getwd()
setwd("D:/sahubackup/GL/R Programming")
getwd()

//////////////ANOVA///////////////
library(plot3D)
iris=read.csv("D:/sahubackup/GL/iris.csv")   ## this CSV uses lowercase column names and a 'variety' label
head(iris)
x <- sep.l <- iris$sepal.length
x
y <- pet.l <- iris$petal.length
z <- sep.w <- iris$sepal.width
scatter3D(x, y, z, clab = c("Sepal", "Width (cm)"))
scatter3D(x, y, z, bty = "f", colkey = FALSE, main ="bty= 'f'")
scatter3D(x, y, z, bty = "g", colkey = FALSE, main ="bty= 'g'")
# User defined
scatter3D(x, y, z, pch = 18, bty = "u", colkey = FALSE,
main ="bty= 'u'", col.panel ="steelblue", expand =0.4,
col.grid = "darkblue")
## Other plot3D helpers (function signatures, for reference only):
## text3D(x, y, z, labels, colvar = NULL, add = FALSE)
## points3D(x, y, z, ...)
## lines3D(x, y, z, ...)
## scatter2D(x, y, colvar = NULL, col = NULL, add = FALSE)
## text2D(x, y, labels, colvar = NULL, col = NULL, add = FALSE)
library(ggplot2)
qplot(iris$sepal.length, iris$sepal.width, col="red", fill="iris$sepal.width")
ggplot(iris, aes(x=sepal.length))+
  geom_histogram(aes(y=iris$sepal.width), binwidth=5, col="black", fill="red")
library(dplyr)
tbl_df(iris)
glimpse(iris)
View(iris)
iris %>%
group_by(variety) %>%
summarise(avg = mean(sepal.width)) %>%
arrange
library(tidyr)
gather(iris, "new", "n", 2:4)
slice(iris, 10:15)
summarise_each(iris, funs(mean))
count(iris, variety, wt = sepal.length)
summarise(iris, avg = mean(sepal.length))
group_by(iris, variety)
library(stringr)
str_detect(iris$variety, "z")
library(MASS)
data=Cars93
data
attach(data)
names(data)
ggplot(data, aes(x=Price))+
  geom_histogram(binwidth=5, col="red", fill="blue")  ## geom_bar() no longer accepts binwidth

ggplot(data, aes(x=RPM))+
geom_histogram(aes(y=..density..), col="red", fill="Black")+
geom_density(alpha=.2, fill = "pink")

ggplot(data, aes(x=Weight))+
  geom_histogram(aes(y=..density..), binwidth = 2, colour = "black", fill = "white")+
  geom_density(alpha=.2, fill = "pink")

ggplot(data, aes(x = Type, fill = Type)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5)

ggplot(data, aes(x = Type, y=Price, fill = Type)) +
  geom_boxplot() +
  guides(fill = FALSE)

ggplot(data, aes(x = Horsepower, y = MPG.city, colour = Cylinders)) +
  geom_point()

ggplot(data, aes(x = Horsepower, y = MPG.city)) +
  geom_point() +
  facet_wrap( ~ Cylinders, ncol = 3)
View(Cars93)

df <- data_frame(x.to = c( 2, 3, 3, 2,-2,-3,-3,-2),
                 y.to = c( 3, 2,-2,-3,-3,-2, 2, 3),
                 x = 0,
                 y = 0,
                 x_gt_y = abs(x.to) > abs(y.to),
                 xy_sign = sign(x.to*y.to) == 1,
                 x_gt_y_equal_xy_sign = x_gt_y == xy_sign)
df

ggplot(df) +
  geom_segment(aes(x = x, y = y, xend = x.to, yend = y.to, color = x_gt_y, linetype = !xy_sign),
               arrow = arrow(length = unit(0.25,"cm"))) +
  coord_equal()

ggplot() +
  geom_curve(data = df %>% filter(x_gt_y_equal_xy_sign),
             aes(x = x, y = y, xend = x.to, yend = y.to, color = x_gt_y_equal_xy_sign),
             curvature = 0.75, angle = -45,
             arrow = arrow(length = unit(0.25,"cm"))) +
  geom_curve(data = df %>% filter(!x_gt_y_equal_xy_sign),
             aes(x = x, y = y, xend = x.to, yend = y.to, color = x_gt_y_equal_xy_sign),
             curvature = -0.75, angle = 45,
             arrow = arrow(length = unit(0.25,"cm"))) +
  coord_equal() +
  theme(legend.position = "bottom") +
  xlim(-4, 4) + ylim(-4, 4)
ggplot(df) +
  geom_curve(aes(x = x, y = y, xend = x.to, yend = y.to, color = x_gt_y_equal_xy_sign),
             curvature = 0.75, angle = -45,
             arrow = arrow(length = unit(0.25,"cm"))) +
  coord_equal() +
  theme(legend.position = "bottom") +
  xlim(-4, 4) + ylim(-4, 4)

df2 <- data.frame(supp=rep(c("VC", "OJ"), each=3),
                  dose=rep(c("D0.5", "D1", "D2"),2),
                  len=c(6.8, 15, 33, 4.2, 10, 29.5))

head(df2)

ggplot(data=df2, aes(x=dose, y=len, group=supp)) +
  geom_line(linetype="dashed", color="blue", size=1.2)+
  geom_point()

data_summary <- function(data, varname, groupnames){
  require(plyr)
  summary_func <- function(x, col){
    c(mean = mean(x[[col]], na.rm=TRUE),
      sd = sd(x[[col]], na.rm=TRUE))
  }
  data_sum <- ddply(data, groupnames, .fun=summary_func, varname)
  data_sum <- rename(data_sum, c("mean" = varname))
  return(data_sum)
}

df3 <- data_summary(ToothGrowth, varname="len", groupnames=c("supp", "dose"))
head(df3)

ggplot(df3, aes(x=dose, y=len, group=supp, color=supp)) +
  geom_errorbar(aes(ymin=len-sd, ymax=len+sd), width=.1) +
  geom_line() + geom_point()+
  scale_color_brewer(palette="Paired")+theme_minimal()

# Use position_dodge to move overlapped errorbars horizontally


ggplot(df3, aes(x=dose, y=len, group=supp, color=supp)) +
geom_errorbar(aes(ymin=len-sd, ymax=len+sd), width=.1,
position=position_dodge(0.05)) +
geom_line() + geom_point()+
scale_color_brewer(palette="Paired")+theme_minimal()
View(ToothGrowth)

## The gather() examples below assume a wide data frame DF with columns
## Group, Year, Qtr.1, Qtr.2, Qtr.3, Qtr.4
long_DF <- DF %>% gather(Quarter, Revenue, Qtr.1:Qtr.4)
head(long_DF, 24)

## Equivalent ways of specifying the columns to gather:
DF %>% gather(Quarter, Revenue, Qtr.1:Qtr.4)
DF %>% gather(Quarter, Revenue, -Group, -Year)
DF %>% gather(Quarter, Revenue, 3:6)
DF %>% gather(Quarter, Revenue, Qtr.1, Qtr.2, Qtr.3, Qtr.4)
square.it=function(x)
{
  square=x*x
  return(square)   ## was return(Square): R is case-sensitive
}
hist(c(3, 5, 10, 10, 11, 12, 12, 14, 14, 14, 19))
pnorm(100,0.03)

#Kaggle DS LR
train=read.csv("D:/sahubackup/GL/LR/Kaggle/train.csv")
train
test=read.csv("D:/sahubackup/GL/LR/Kaggle/test.csv")
test
library(ggplot2)
numberofNAs=length(which(is.na(train)==T))
if(numberofNAs>0)
{
cat('Number of missing values found: ', numberofNAs)
cat('\nRemoving missing values...')
train = train[complete.cases(train), ]
}
#par(mfrow=c(2,1), size=)
boxplot(train$x)
boxplot(test$x)
boxplot(train$x, main='X', sub=paste('Outliers: ', boxplot.stats(train$x)$out))
boxplot(test$x, main='X', sub=paste('Outliers: ', boxplot.stats(test$x)$out))

av=read.csv("D:/sahubackup/GL/LR/av.csv")
av

levels(av$group)
acgrp=ordered(av$group, levels=c("ctrl","trt1","trt2"))
acgrp
library(dplyr)
grp=group_by(av,group) %>%
  summarise(count=n(),
            mean=mean(weight, na.rm = TRUE),
            sd=sd(weight, na.rm = TRUE))
grp=av$group
grp
factor.grp=factor(grp)
factor.grp
attach(av)
wday = c("Mon","Fri","Mon","Wed","Wed","Sat")
factor.wday = factor(wday)
factor.wday

library(ggplot2)
ggplot(av, aes(x=group, y=weight, fill=group))+
  geom_boxplot()+
  scale_x_discrete(limits = c("ctrl", "trt1", "trt2"))+  ## geom_boxplot() has no order argument
  theme_classic() +
  theme(legend.position = "none")
boxplot(av)
boxplot(weight~group)
av
acgrp
names(av)
ggplot(av, aes(x = group, y = weight)) +   ## Simple box plot of weight by group
  geom_boxplot()

res.aov <- aov(weight ~ group, data = av)


res.aov
summary(res.aov)

TukeyHSD(res.aov)
library(multcomp)
summary(glht(res.aov, linfct = mcp(group = "Tukey")))
pairwise.t.test(av$weight, av$group,
p.adjust.method = "BH")
library(car)
leveneTest(weight ~ group, data = av)

#two way anova


tanv=read.csv("D:/sahubackup/GL/ml-latest-small/tav.csv")
tanv
attach(tanv)
unique(dose)
table(supp,dose)

ggplot(tanv, aes(x = dose, y = len, color=supp)) +   ## Simple box plot of len by dose and supp
  geom_boxplot()

ggplot(tanv, aes(x = interaction(supp, dose), y = len, color=supp))+
  geom_boxplot()   ## one box per supp-dose combination (the len ~ supp * dose idea)

res.aov2 <- aov(len ~ supp + dose, data = tanv)


summary(res.aov2)
res.aov3 <- aov(len ~ supp * dose, data = tanv)
res.aov3 <- aov(len ~ supp + dose + supp:dose, data = tanv)
summary(res.aov3)

library(leaps)       ## regsubsets() comes from the leaps package
swissdata <- swiss   ## assuming the built-in swiss dataset
models <- regsubsets(Fertility~., really.big=TRUE, method="exhaustive",
                     data = swissdata, nvmax = 5)
res.sum <- summary(models)
data.frame(
Adj.R2 = which.max(res.sum$adjr2),
CP = which.min(res.sum$cp),
BIC = which.min(res.sum$bic)
)

get_model_formula <- function(id, object, outcome){
  models <- summary(object)$which[id,-1]
  predictors <- names(which(models == TRUE))
  predictors <- paste(predictors, collapse = "+")
  as.formula(paste0(outcome, "~", predictors))
}
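
## get_cv_error is not defined in these notes; a minimal sketch using caret's
## 5-fold cross-validation (returns the CV RMSE for a given model formula):
library(caret)
get_cv_error <- function(model.formula, data){
  set.seed(1)
  train.control <- trainControl(method = "cv", number = 5)
  cv <- train(model.formula, data = data, method = "lm", trControl = train.control)
  cv$results$RMSE
}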
model.ids <- 1:5
library(purrr)   ## for map()
cv.errors <- map(model.ids, get_model_formula, models, "Fertility") %>%
  map(get_cv_error, data = swissdata) %>%
  unlist()
cv.errors
## Sketch from an exercise: PL, Advertisement and Sales are assumed to exist
Price=factor(PL, levels=c(1,2,3), labels=c("high","med","low"))
interaction.plot(Price, Advertisement, Sales, col=c("red","yellow","blue"))
library(MASS)
library(reshape2)
data("Boston", package="MASS")
data <- Boston
data
View(Boston)
pairs(Boston)
bosmelt <- melt(Boston, id="crim")
ggplot(bosmelt, aes(x=value, y=crim))+
facet_wrap(~variable, scales="free")+
geom_point()

## confint on a fitted simple linear model from another exercise (SLM, coefficient PerOcc):
confint(SLM, "PerOcc")

///////////////ANOVA-DentalHardness/////////
my_data=read.csv("D:/sahubackup/GL/Dental Hardness.csv")
attach(my_data)
View(my_data)

my_data$dentist<-factor(my_data$dentist)
my_data$method<-factor(my_data$method)
my_data$alloy<-factor(my_data$alloy)
my_data$temperature<-factor(my_data$temperature)

hist(my_data[my_data$temperature==1500,]$hardness)
hist(my_data[my_data$temperature==1600,]$hardness)
hist(my_data[my_data$temperature==1700,]$hardness)

shapiro.test(my_data[my_data$temperature==1500,]$hardness)$p.value
shapiro.test(my_data[my_data$temperature==1600,]$hardness)$p.value
shapiro.test(my_data[my_data$temperature==1700,]$hardness)$p.value
str(my_data)

library(car)
leveneTest(my_data$hardness~my_data$temperature)

aov1 <- aov(my_data$hardness~my_data$temperature)
summary(aov1)
power.anova.test(groups=3, n=30, between.var = 41089, within.var = 20792, sig.level = 0.05)
kruskal.test(my_data$hardness~my_data$temperature)
t.test(my_data$hardness, mu=720, alternative="two.sided", conf.level=0.95)
wilcox.test(my_data$hardness, mu=720)
power.t.test(n=90, delta=-21.778, sd=145.7678, alternative="two.sided", sig.level=0.05)
power.t.test(delta=-21.778, sd=145.7678, power=.8, alternative="two.sided", sig.level=0.05)

t.test(my_data[my_data$alloy==1,]$hardness, my_data[my_data$alloy==2,]$hardness, paired = FALSE)
wilcox.test(my_data[my_data$alloy==1,]$hardness, my_data[my_data$alloy==2,]$hardness, paired = FALSE)
pooledSD <- (((45-1)*(14688.12)+(45-1)*(25886.43))/(45+45-2))^0.5
pooledSD
power.t.test(n=45, delta=-68.58, sd=pooledSD, alternative="two.sided", sig.level=0.05)
power.t.test(power=0.8, delta=-68.58, sd=142.4334, alternative="two.sided", sig.level=0.05)

#Test
aov1 <- aov(my_data$hardness~my_data$method)
summary(aov1)

hist(my_data[my_data$method==1,]$hardness)
hist(my_data[my_data$method==2,]$hardness)
hist(my_data[my_data$method==3,]$hardness)

shapiro.test(my_data[my_data$method==1,]$hardness)$p.value
shapiro.test(my_data[my_data$method==2,]$hardness)$p.value
shapiro.test(my_data[my_data$method==3,]$hardness)$p.value
# 2 are normal distribution and 1 is Not normal
leveneTest(my_data$hardness~my_data$method)
#variances are not equal
#both fail, so going for NonParametric
kruskal.test(my_data$hardness~my_data$method)

hist(my_data[my_data$dentist==1,]$hardness)
hist(my_data[my_data$dentist==2,]$hardness)
hist(my_data[my_data$dentist==3,]$hardness)
hist(my_data[my_data$dentist==4,]$hardness)
hist(my_data[my_data$dentist==5,]$hardness)

shapiro.test(my_data[my_data$dentist==1,]$hardness)$p.value
shapiro.test(my_data[my_data$dentist==2,]$hardness)$p.value
shapiro.test(my_data[my_data$dentist==3,]$hardness)$p.value
shapiro.test(my_data[my_data$dentist==4,]$hardness)$p.value
shapiro.test(my_data[my_data$dentist==5,]$hardness)$p.value
# 4 are normal distribution and 1 is Not normal
leveneTest(my_data$hardness~my_data$dentist)
#variances are not equal
#both fail, so going for NonParametric
kd=kruskal.test(my_data$hardness~my_data$dentist)
kd   ## print the htest result directly; summary() is not informative for htest objects

//////////////Mean Imputation (dataset assumed from a preprocessing exercise)//////////////
dataset$Age <- ifelse(is.na(dataset$Age),
                      ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)),
                      dataset$Age)

//////////////Missing Value/////////////////////
## ===============================================================================================================
## EXPLORATORY DATA ANALYTICS
## ===============================================================================================================

## REFERENCES:
## An Introduction to Data Cleaning with R - Edwin de Jonge and Mark Van Der Loo
## https://cran.r-project.org/doc/contrib/de_Jonge+van_der_Loo-Introduction_to_data_cleaning_with_R.pdf

data = airquality

## Explore Structure and Summary of Input data


str(data)
summary(data)

## ===============================================================================================================
## Univariate Analysis
## ===============================================================================================================
par(mfrow = c(2,1))

hist(data$Ozone, main = "Ozone Distribution", xlab = "Ozone")


boxplot(data$Ozone, horizontal = TRUE)
boxplot.stats(data$Ozone)

hist(data$Solar.R, main = "Solar.R Distribution", xlab = "Solar.R")


boxplot(data$Solar.R, horizontal = TRUE)
boxplot.stats(data$Solar.R)

hist(data$Wind, main = "Wind Distribution", xlab = "Wind")


boxplot(data$Wind, horizontal = TRUE)
boxplot.stats(data$Wind)

hist(data$Temp, main = "Temp Distribution", xlab = "Temp")


boxplot(data$Temp, horizontal = TRUE)
boxplot.stats(data$Temp)

## ===============================================================================================================
## Bivariate Analysis
## ===============================================================================================================

plot(data)

## ===============================================================================================================
## EXPLORATORY DATA ANALYTICS - MISSING VALUES TREATMENT
## ===============================================================================================================

## Options Available:
## 1. Remove records having missing values
## 2. Impute values

## For now, we will omit Day and Month which are categorical variables
data1 = data[-c(5,6)]
summary(data1)

## Randomly insert 10 missing values in Wind and Temp columns


n = nrow(data1)
n
set.seed(100)
for(i in 3:ncol(data1)) {
data1[sample(1:n, 10, replace = FALSE), i] = NA
}

summary(data1) ## Note 10 Missing values introduced in Wind and Temp

## Let us examine the rows with missing values - Incomplete Rows


data1[!complete.cases(data1),]

attach(data1)

## GUIDELINES:
## A safe maximum threshold for missing values in a particular column is 5%.
## If missing data for a column > 5%, we need to consider leaving out that variable.

## Build a function to calculate the percentage of missing values in Columns and Rows

pMiss = function(x){
sum(is.na(x))/length(x)*100
}

## Find Percentage of missing values in each column


col_miss = apply(data1,2,pMiss) ## 2 is for Columns
col_miss

## OBSERVATIONS:
## Ozone has nearly 25% missing values

## Find Percentage of missing values in each Row


row_miss = apply(data1,1,pMiss) ## 1 is for Rows
row_miss

## OBSERVATIONS:
## Row 5 has 50% missing variables - will not be of much value
data1[5,]

## Identify rows with high missing values


high_miss_rows = data1[row_miss > 25,]
high_miss_rows ## 10 Rows have more than 25% missing values

## Keep only the rows with less than 30% missing values
low_miss_rows = data1[row_miss < 30,]
low_miss_rows

## Using mice package


library(mice)
md.pattern(data1)

## IMPUTING MISSING VALUES USING mice PACKAGE


## If any variable contains missing values, the mice package regresses
## it over the other variables and predicts the missing values.
## Some of the available models in the mice package are:
##   * PMM (Predictive Mean Matching) - suitable for numeric variables
##   * logreg (Logistic Regression) - suitable for categorical variables with 2 levels
##   * polyreg (Bayesian polytomous regression) - suitable for categorical variables with two or more levels
##   * Proportional odds model - suitable for ordered categorical variables with two or more levels

## NOTE: FOR THIS EXERCISE, WE ARE USING THE data1 DATASET WITH 153 ROWS - NOT low_miss_rows!!!!
data_imputes = mice(data1, m = 5, maxit = 7, seed = 500)
## m: Number of times the model should run, maxit: Max number of iterations

summary(data_imputes)

## Methods mice used for imputing


data_imputes$method

## Since only numeric variables had missing values, mice used the pmm method

## What are the values determined for each variable?


data_imputes$imp

## Now let us first examine the values mice determined for Ozone
data_imputes$imp$Ozone

## Before inserting the values, let us look at rows 5, 10 and 25 - they all have missing values
data1[c(5,10,25),]

## Which of the 5 datasets created should we use?


stripplot(data_imputes, pch = 20, cex = 1.2)

## OBSERVATIONS:
## For Temp, Iterations 3 and 4 place most imputed values in the middle,
## which does not fit well with observed values - we can therefore ignore the 3rd imputed dataset

library(funModeling) ## Ref: https://blog.datascienceheroes.com/exploratory-data-analysis-data-preparation-with-funmodeling/

plot_num(data) ## Overall, Iteration 5 fits the original distribution the closest
## Impute Data using 'complete' function from mice package
imputed_data = complete(data_imputes, 5)

## Let us look at the same rows 5, 10 and 25


imputed_data[c(5,10,25),]

data[c(9,25,31,40,48,55,60), -(5:6)] ## Original Data without Month and Day columns

data1[c(9,25,31,40,48,55,60),] ## Data with Randomly inserted missing values

imputed_data[c(9,25,31,40,48,55,60),] ## Data with Imputation for missing values

summary(data)

summary(imputed_data)

## Inspecting distribution of original and imputed data


#xyplot(data_imputes, Ozone ~ Wind + Temp + Solar.R,
# pch = 18, cex =1)

densityplot(data_imputes)

## OBSERVATIONS:
## Red lines - Density of imputed data for each imputed dataset
## Blue line - Density of observed data
## We expect the Red and Blue distributions to be similar
## - Ozone and Wind has similar patterns for Red and Blue lines
## - Temp has similar patterns for Red and Blue lines - however the Observed data (Blue)
##   has more variation than some of the Imputed datasets
## - For Solar.R, imputed values for 4 datasets are close to Observed - can ignore the other imputed dataset

## ===============================================================================================================
## MISSING VALUE TREATMENT USING KNN METHOD FROM VIM PACKAGE
## ===============================================================================================================

summary(data[,3:4]) ## Dataframe with no missing values for Wind and Temp

summary(data1[,3:4]) ## Dataframe with missing values introduced for Wind and Temp
data1[!complete.cases(data1[,3:4]),]

library(VIM)

## Impute missing values using KNN method


data2 = kNN(data1)

summary(data2)

data[c(9,25,31,40,48,55,60), -(5:6)] ## Original Data without Month and Day columns

data1[c(9,25,31,40,48,55,60),] ## Data with Randomly inserted missing values

data2[c(9,25,31,40,48,55,60),1:4] ## Data with Imputation for missing values

plot_num(data[,1:4])

plot_num(data2)

## =====================================
## Working with Messy Data
## =====================================

## Let us create a dataset with outliers


age = c(21,2,18,221,34)
group = c("adult","child","adult","elderly","child")
height = c(6.0, 3,5.7,5, -7)
status = c("single", "married", "married","widowed", "married")
yearsmarried = c(-1,0,20,2,3)

## Build a dataframe using the vectors created above


people = data.frame(age,group,height,status,yearsmarried)
people

## OBSERVATIONS:
## yearsmarried cannot be negative
## A 2 year old child cannot be married
## An 18 year old adult cannot be married for 20 years
## 221 year old married for 2 years???!!!
## 34 year old Child who is -7 ft tall??

library(editrules)

## Set Age Rule


E_age = editset(c("age >= 0", "age <= 120"))

## Which records violate the Age Rule?


violatedEdits(E_age, people) ## Record 4 violates the second age rule <= 120

## All Rules can be maintained on an external Text File


E_file = editfile("EDA_Rules_Edit.txt")

rule_violations = violatedEdits(E_file, people)

rule_violations

plot(rule_violations)

## Graph showing interconnection between Variables and Restrictions


plot(E_file)

## OBSERVATIONS:
## - Two cases of Categorical violations involving Group and Status
##   - If status == 'married', group should be 'adult' or 'elderly'
##   - Rule violated in records 2 and 5
## - Two cases of Mixed Rules violations involving Status, YearsMarried and Age
##   - If status == 'married', age - yrsMarried >= 17
##   - Rule violated in records 2 and 3

## ===============================================================================================================
## WORKING WITH DIFFERENT UNITS
## ===============================================================================================================

name = c("A","B","C","D","E")
height = c(170.00,1.74, 70.00, 168.00, 5.91)
unit = c("cm","m","inch","cm","ft")

physical = data.frame(name,height,unit)
physical

library(deducorrect)

## Convert all values into Meters ("m")


R = correctionRules("Length_Convertions.txt")
R

## Apply correction rules to data


cor = correctWithRules(R, physical)
cor

cor$corrected
## *****************************************
## Working with Dates - Also covered in Intro to R
## *****************************************

## In R, Dates and Times are captured using POSIXct (Continuous Time - number of seconds) and POSIXlt (List Time)
## Base Date: 1 January 1970

Sys.time()

class(Sys.time())

time.list = as.POSIXlt(Sys.time())
unlist(time.list)

y <- strptime("01/02/2018",format="%d/%m/%Y")
y

weekdays(y) ## Find day of the week

y$wday ## Thursday is fourth day of the week

## R is clever with dates!!


start_end_dates = c("2016 2 Mon", "2017 6 Fri", "2018 10 Tue") ## Mon of Week 2, Fri of Week 6 and Tue of Week 10
strptime(start_end_dates, format = "%Y %W %a")

## Difference between two dates


difftime("2014-02-06", "2016-08-15")
as.numeric(difftime("2014-02-06", "2016-08-15"))
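## difftime can also report the gap in other units, e.g. weeks:
difftime("2014-02-06", "2016-08-15", units = "weeks")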

## Generating a sequence of dates from 2015-11-04 to 2015-11-15, incrementing by 1 day


dates.seq = seq(as.POSIXlt("2015-11-04"), as.POSIXlt("2015-11-15"), "1 day")
dates.seq
class(dates.seq)

dates.seq1 = seq(as.POSIXlt("2015-11-04"), by = "day", length = 11)


dates.seq1
class(dates.seq1)

## Working with Dates using Lubridate package

library(lubridate)

dates = c("15/12/2013", "15 December 13", "It happened on 15 02 '13")

dmy(dates) ## All dates above converted to common format!!

## How does R know whether it is 1913 or 2013??


## Years 00 to 68 will be 20xx
## Years 69 to 99 will be 19xx
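
## e.g. a two-digit year of 68 pivots to 20xx, while 69 pivots to 19xx:
dmy("01/01/68") ## 2068-01-01
dmy("01/01/69") ## 1969-01-01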

## Other limitations
dmy("15 Feb 2018")

dmy("15 Febr 2018") ## Error, since the POSIX standard expects Feb and not Febr

## ===============================================================================================================
## CHARACTER MANIPULATION USING stringr PACKAGE
## ===============================================================================================================

## CHARACTER MANIPULATION INCLUDES:


## - Remove pre-pending or trailing white spaces
## - Pad strings to certain width
## - Transform to upper/lower case
## - Search for strings containing certain patterns (substrings)
## - Approximate matching procedures based on string distances

library(stringr)

## Remove white spaces before and after text


str_trim(" hello world ")

## Remove white spaces left of text


str_trim(" hello world ", side = "left")

## Remove white spaces right of text


str_trim(" hello world ", side = "right")

## Add spaces before text


str_pad("hello world", width = 20, side = "left", pad = " ") # width is the total length including padding

## Add zeros before numbers


str_pad(112, width = 6, side = "left", pad = 0) # Padding numbers for fields like IDs

## Convert string to ALL CAPS


toupper("hello world")

## Convert string to all lower


tolower('HELLO WORLD')

## ===============================================================================================================
## Approximate String Matching
## ===============================================================================================================

gender = c("M", "male","F", "Female", "fem.")

## Find all values with "m" in gender


grepl("m", gender) # Gives logical output

grep("m", gender) # Gives row or position number

## Ignore case "M" and "m" should be treated the same


grepl("m", gender, ignore.case = TRUE) # Gives logical output

grep("m", gender, ignore.case = TRUE) # Gives row or position number

## Look for any value that starts with "M" or "m"


grepl("^m", gender, ignore.case = TRUE) # Notice ^ before the search parameter

## Working with Special Characters


gender = c("M", "male","F", "Female", "fem.","Male**","F+","male/")

grepl("+", gender, fixed = TRUE) # Search for '+'
grepl(".", gender, fixed = TRUE) # Search for '.'
grepl("*", gender, fixed = TRUE) # Search for '*'
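
## Putting the patterns to work - a sketch that standardises the messy labels to M/F
## (it assumes every male label starts with "m", as in the vector above):
ifelse(grepl("^m", gender, ignore.case = TRUE), "M", "F")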

////////////////PCA///////////////////////////
library(nFactors)
cs = read.csv("Carseats.csv")   ## assuming the Carseats data used in the CarSeats section above
attach(cs)
csi=cs[,c("Sales","CompPrice","Income","Advertising","Population","Price","Age","Education")]
csi
ev = eigen(cor(csi)) # get eigenvalues
ev
EigenValue=ev$values
EigenValue
Factor=c(1,2,3,4,5,6,7,8)
Scree=data.frame(Factor,EigenValue)
plot(Scree,main="Scree Plot", col="Blue",ylim=c(0,4))
lines(Scree,col="Red")
library(psych)
Unrotate=principal(csi, nfactors=3, rotate="none")
print(Unrotate,digits=3)
UnrotatedProfile=plot(Unrotate,row.names(Unrotate$loadings))
Rotate=principal(csi,nfactors=3,rotate="varimax")
print(Rotate,digits=3)
RotatedProfile=plot(Rotate,row.names(Rotate$loadings),cex=1.0)
///////////////Practice////////////
library(plot3D)
data=read.csv("D:/sahubackup/GL/iris.csv")
head(data)
x <- sep.l <- iris$sepal.length
x
y <- pet.l <- iris$petal.length
z <- sep.w <- iris$sepal.width
scatter3D(x, y, z, clab = c("Sepal", "Width (cm)"))
scatter3D(x, y, z, bty = "f", colkey = FALSE, main ="bty= 'f'")
scatter3D(x, y, z, bty = "g", colkey = FALSE, main ="bty= 'g'")
# User defined
scatter3D(x, y, z, pch = 18, bty = "u", colkey = FALSE,
main ="bty= 'u'", col.panel ="steelblue", expand =0.4,
col.grid = "darkblue")
text3D(x, y, z, labels, colvar = NULL, add = FALSE)
points3D(x, y, z, ...)
lines3D(x, y, z, ...)
scatter2D(x, y, colvar = NULL, col = NULL, add = FALSE)
text2D(x, y, labels, colvar = NULL, col = NULL, add = FALSE)
library(ggplot2)
qplot(iris$sepal.length,iris$sepal.width, col="red",
fill="iris$sepal.width")
ggplot(iris, aes(x=sepal.length))+
geom_histogram(aes(y=iris$sepal.width), binwidth=5,
col="black",fill="red")
library(dplyr)
tbl_df(iris)
glimpse(iris)
View(iris)
iris %>%
group_by(variety) %>%
summarise(avg = mean(sepal.width)) %>%
arrange
library(tidyr)
gather(iris, "new", "n", 2:4)
slice(iris, 10:15)
summarise_each(iris, funs(mean))
count(iris, variety, wt = sepal.length)
summarise(iris, avg = mean(sepal.length))
group_by(iris, variety)
library(stringr)
str_detect(iris$variety, "z")
library(MASS)
data=Cars93
data
attach(data)
names(data)
ggplot(data, aes(x=Price))+
geom_bar(binwidth=5, col="red", fill="blue")
ggplot(data, aes(x=RPM))+
geom_histogram(aes(y=..density..), col="red", fill="Black")+
geom_density(alpha=.2, fill = "pink")

ggplot(data, aes(x=Weight))+
geom_histogram(aes(y=..density..),binwidth = 2, colour = "black",
fill = "white")+
geom_density(alpha=.2, fill = "pink")

ggplot(data, aes(x = Type, y=Price,fill = Type)) +


geom_text(stat = "count")

ggplot(data, aes(x = Type, y=Price,fill = Type)) +


geom_boxplot() +
guides(fill = FALSE)

ggplot(data, aes(x = Horsepower, y = MPG.city, colour = Cylinders)) +


geom_point()

ggplot(data, aes(x = Horsepower, y = MPG.city)) +


geom_point() +
facet_wrap( ~ Cylinders, ncol = 3)
View(Cars93)

df <- data_frame(x.to = c( 2, 3, 3, 2,-2,-3,-3,-2),


y.to = c( 3, 2,-2,-3,-3,-2, 2, 3),
x = 0,
y = 0,
x_gt_y = abs(x.to) > abs(y.to),
xy_sign = sign(x.to*y.to) == 1,
x_gt_y_equal_xy_sign = x_gt_y == xy_sign)
df

ggplot(df) +
geom_segment(aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y, linetype = !xy_sign),
arrow = arrow(length = unit(0.25,"cm"))) +
coord_equal()

ggplot() +
geom_curve(data = df %>% filter(x_gt_y_equal_xy_sign),
aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y_equal_xy_sign),
curvature = 0.75, angle = -45,
arrow = arrow(length = unit(0.25,"cm"))) +
geom_curve(data = df %>% filter(!x_gt_y_equal_xy_sign),
aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y_equal_xy_sign),
curvature =-0.75, angle = 45,
arrow = arrow(length = unit(0.25,"cm"))) +
coord_equal() +
theme(legend.position = "bottom") +
xlim(-4, 4) + ylim(-4,4)
ggplot(df) +
geom_curve(aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y_equal_xy_sign),
curvature = 0.75, angle = -45,
arrow = arrow(length = unit(0.25,"cm"))) +
coord_equal() +
theme(legend.position = "bottom") +
xlim(-4, 4) + ylim(-4,4)

df2 <- data.frame(supp=rep(c("VC", "OJ"), each=3),


dose=rep(c("D0.5", "D1", "D2"),2),
len=c(6.8, 15, 33, 4.2, 10, 29.5))

head(df2)

ggplot(data=df2, aes(x=dose, y=len, group=supp)) +


geom_line(linetype="dashed", color="blue", size=1.2)+
geom_point()

data_summary <- function(data, varname, groupnames){


require(plyr)
summary_func <- function(x, col){
c(mean = mean(x[[col]], na.rm=TRUE),
sd = sd(x[[col]], na.rm=TRUE))
}
data_sum<-ddply(data, groupnames, .fun=summary_func,
varname)
data_sum <- rename(data_sum, c("mean" = varname))
return(data_sum)
}

df3 <- data_summary(ToothGrowth, varname="len",


groupnames=c("supp", "dose"))
head(df3)

ggplot(df3, aes(x=dose, y=len, group=supp, color=supp)) +


geom_errorbar(aes(ymin=len-sd, ymax=len+sd), width=.1) +
geom_line() + geom_point()+
scale_color_brewer(palette="Paired")+theme_minimal()

# Use position_dodge to move overlapped errorbars horizontally
ggplot(df3, aes(x=dose, y=len, group=supp, color=supp)) +
geom_errorbar(aes(ymin=len-sd, ymax=len+sd), width=.1,
position=position_dodge(0.05)) +
geom_line() + geom_point()+
scale_color_brewer(palette="Paired")+theme_minimal()
View(ToothGrowth)
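
## DF is never defined in these notes; a minimal sketch of the assumed wide
## table (Group, Year, then the four quarterly revenue columns 3:6) so the
## gather() calls below can run. The values here are made up.
library(dplyr)
library(tidyr)
DF <- data.frame(Group = rep(1:2, each = 2),
                 Year  = rep(c(2006, 2007), 2),
                 Qtr.1 = c(15, 12, 22, 10),
                 Qtr.2 = c(16, 13, 22, 14),
                 Qtr.3 = c(19, 27, 24, 20),
                 Qtr.4 = c(17, 23, 20, 16))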

## gather() reshapes the wide quarterly table to long form
long_DF <- DF %>% gather(Quarter, Revenue, Qtr.1:Qtr.4)
head(long_DF, 24)

## the four calls below are equivalent ways of selecting the Qtr columns
DF %>% gather(Quarter, Revenue, Qtr.1:Qtr.4)
DF %>% gather(Quarter, Revenue, -Group, -Year)
DF %>% gather(Quarter, Revenue, 3:6)
DF %>% gather(Quarter, Revenue, Qtr.1, Qtr.2, Qtr.3, Qtr.4)

square.it=function(x)
{
  square=x*x
  return(square)   ## was return(Square); R names are case-sensitive
}
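## quick check of the fixed function:
square.it(4)   ## 16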
hist(c(3, 5, 10, 10, 11, 12, 12, 14, 14, 14, 19))
pnorm(100,0.03)
#Kaggle DS LR
train=read.csv("D:/sahubackup/GL/LR/Kaggle/train.csv")
train
test=read.csv("D:/sahubackup/GL/LR/Kaggle/test.csv")
test
library(ggplot2)
numberofNAs=sum(is.na(train))   ## simpler equivalent of length(which(is.na(train)==T))
if(numberofNAs>0)
{
cat('Number of missing values found: ', numberofNAs)
cat('\nRemoving missing values...')
train = train[complete.cases(train), ]
}
#par(mfrow=c(2,1), size=)
boxplot(train$x)
boxplot(test$x)
boxplot(train$x, main='X', sub=paste('Outliers: ',
boxplot.stats(train$x)$out))
boxplot(test$x, main='X', sub=paste('Outliers: ',
boxplot.stats(test$x)$out))
av=read.csv("D:/sahubackup/GL/LR/av.csv")
av
levels(av$group)
acgrp=ordered(av$group, levels=c("ctrl","trt1","trt2"))
acgrp
library(dplyr)
grp=group_by(av,group) %>%
  summarise(count=n(),
            mean=mean(weight, na.rm = TRUE),   ## the sd= argument was accidentally nested inside mean()
            sd=sd(weight, na.rm = TRUE))
grp=av$group
grp
factor.grp=factor(grp)
factor.grp
attach(av)
grp = c("Mon","Fri","Mon","Wed","Wed","Sat")
factor.wday = factor(wday)
factor.wday

library(ggplot2)
ggplot(av, aes(x=group, y=weight, fill=group))+
  geom_boxplot()+
  scale_x_discrete(limits = c("ctrl", "trt1", "trt2"))+   ## geom_boxplot() has no order argument; order the x axis instead
  theme_classic() +
  theme(legend.position = "none")
boxplot(av)
boxplot(weight~group)
av
acgrp   ## was "avgrp", which is never defined
names(av)
ggplot(av, aes(x = group, y = weight)) + ## simple box plot of weight by treatment group
  geom_boxplot()

res.aov <- aov(weight ~ group, data = av)
res.aov
summary(res.aov)

TukeyHSD(res.aov)
library(multcomp)
summary(glht(res.aov, linfct = mcp(group = "Tukey")))
pairwise.t.test(av$weight, av$group,
p.adjust.method = "BH")
library(car)
leveneTest(weight ~ group, data = av)

#two way anova
tanv=read.csv("D:/sahubackup/GL/ml-latest-small/tav.csv")
tanv
attach(tanv)
unique(dose)
table(supp,dose)

ggplot(tanv, aes(x = factor(dose), y = len, color = supp)) + ## box plot of len by dose and supplement (dose coerced to a factor)
  geom_boxplot()

## one box per supp:dose combination; geom_boxplot() takes no formula
ggplot(tanv, aes(x = interaction(supp, dose), y = len, color = supp)) +
  geom_boxplot()


res.aov2 <- aov(len ~ supp + dose, data = tanv)
summary(res.aov2)
res.aov3 <- aov(len ~ supp * dose, data = tanv)   ## supp*dose expands to the line below
res.aov3 <- aov(len ~ supp + dose + supp:dose, data = tanv)
summary(res.aov3)
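
## A quick visual check of the supp:dose interaction (base stats::interaction.plot);
## dose is coerced to a factor in case it was read as numeric:
with(tanv, interaction.plot(factor(dose), supp, len, col = c("red", "blue"), lty = 1))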
library(leaps)   ## regsubsets() comes from the leaps package
models <- regsubsets(Fertility~., really.big=TRUE, method="exhaustive",
                     data = swissdata, nvmax = 5)   ## swissdata: presumably a copy of the built-in swiss data
res.sum <- summary(models)
data.frame(
Adj.R2 = which.max(res.sum$adjr2),
CP = which.min(res.sum$cp),
BIC = which.min(res.sum$bic)
)

get_model_formula <- function(id, object, outcome){
  models <- summary(object)$which[id, -1]
  predictors <- names(which(models == TRUE))
  predictors <- paste(predictors, collapse = "+")
  as.formula(paste0(outcome, "~", predictors))
}
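
## get_cv_error() is not defined anywhere in these notes; a minimal sketch
## (assuming the caret package) that returns the 5-fold cross-validated RMSE
## of a linear model for a given formula:
library(caret)
get_cv_error <- function(model.formula, data){
  set.seed(1)
  cv <- train(model.formula, data = data, method = "lm",
              trControl = trainControl(method = "cv", number = 5))
  cv$results$RMSE
}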
model.ids <- 1:5
library(purrr)   ## map() comes from purrr
cv.errors <- map(model.ids, get_model_formula, models, "Fertility") %>%   ## %>% must end a line, not start one
  map(get_cv_error, data = swissdata) %>%
  unlist()
cv.errors
Price=factor(PL, levels=c(1,2,3), labels=c("high","med","low"))   ## PL, Advertisement, Sales assumed attached earlier; labels must be quoted
interaction.plot(Price, Advertisement, Sales, col=c("red","yellow","blue"))   ## was misspelt "interraction.plot"
library(MASS)
library(reshape2)
data=data("Boston", package="MASS")
data
View(Boston)
pairs(Boston)
bosmelt <- melt(Boston, id="crim")
ggplot(bosmelt, aes(x=value, y=crim))+
facet_wrap(~variable, scales="free")+
geom_point()

confint(SLM, "PerOcc")   ## SLM: assumed to be an lm() fit with a PerOcc term from an earlier session

############ Practice1 ############
install.packages("ggplot2")
library(lattice)
## t.test() needs a vector of observations, not a single number; e.g. with a made-up sample:
t.test(rnorm(30, mean = 0.3, sd = 0.05), mu = 0.29)

seandsd <- function(x){
  seresult <- sd(x)/sqrt(length(x))   ## standard error of the mean; semean() was an undefined function
  sdresult <- sd(x)
  # Store results in a vector with names
  vec <- c(seresult, sdresult)
  names(vec) <- c("SE","SD")
  return(vec)
}
x <- rnorm(100, mean=20, sd=4)
x
seandsd(x)

mylist <- list(a=1:10, txt=c("hello","world"),
               dfr=data.frame(x=c(2,3,4), y=c(5,6,7)))
mylist

install.packages("olsrr")
pnorm(0.8,100,10,1)

df = data.frame(group = c("Group 1","Group 1","Group 2","Group 2","Group 2"),
                subgroup = c("A","A","A","A","B"),
                value = c(2, 2.5, 1, 2, 1.5))
df
sum1=aggregate(value~group,FUN=sum,data=df)
sum1
library(dplyr)
df %>% group_by(group) %>% summarise(value=sum(value)) %>%
as.data.frame()
df %>% group_by(group) %>% summarize(value = mean(value)) %>%
as.data.frame()
df %>% group_by(group) %>% summarize(value = sum(value[value>2])) %>%
as.data.frame()
install.packages("devtools")
xmat <- cbind(rnorm(100, -3), rnorm(100, -1), rnorm(100, 1),
rnorm(100, 3))
xmat
plot(xmat[,1], type='l')
lines(xmat[,2], col="red")
lines(xmat[,3], col="green")
lines(xmat[,4], col="blue")
matplot(xmat, type='l')

nterms=as.integer(readline(prompt="how many numbers="))
n1=0
n2=1
count=2
if(is.na(nterms))
{
print("enter a positive num=")
}else{
if(nterms==1){print(n1)
}else{
print(n1)
print(n2)
while(count<nterms)
{
nth=n1+n2
print(nth)
n1=n2
n2=nth
count=count+1
}
}
}

recurse_fibonacci <- function(n) {
if (n <= 1) {
return(n)
} else {
return(recurse_fibonacci(n-1) + recurse_fibonacci(n-2))
}
}
nterms = as.integer(readline(prompt="How many terms? "))

if(is.na(nterms)) {
print("nter a positive integer")
} else {
print("Fibonacci sequence:")
for(i in 0:(nterms-1)) {
print(recurse_fibonacci(i))
}
}

Fibonacci <- numeric(10)
Fibonacci[1] <- Fibonacci[2] <- 1
for (i in 3:10) Fibonacci[i] <- Fibonacci[i - 2] + Fibonacci[i - 1]
print("First 10 Fibonacci numbers:")
print(Fibonacci)

fibb <- function (n) {
if (n < 3) {
return(c(0,1)[n])
} else {
return(fibb(n - 2) + fibb(n -1))
}
}
fibb(5)
ap=read.csv("D:/sahubackup/GL/AirPassengers.csv")
start=head(ap,1)
start
end=tail(ap,1)
end
library(tseries)
library(forecast)

## dd was never defined; assuming the passenger counts sit in column 2 of ap,
## build the monthly time series first:
dd <- ts(ap[,2], start = c(1949, 1), frequency = 12)
boxplot(dd~cycle(dd), xlab="Date", ylab = "Passenger Numbers (1000's)",
        main ="Monthly Air Passengers Boxplot from 1949 to 1961")
library(stringr)
library(rvest)
library(xml2)
url="https://www.dezyre.com/data-science-in-r-programming-tutorial/r-
tutorial-importing-data-from-web"
t_link=read_html(url)
t_link
transcript=t_link %>%html_nodes("#main-content") %>% html_text()
markers=str_locate_all(transcript,pattern="R|JSON")
transcript
head(markers)
library(XML)   ## readHTMLTable() comes from XML, so load it before the call
production_data = readHTMLTable(url, which=2)   ## note: readHTMLTable() cannot fetch https pages directly

u = "http://en.wikipedia.org/wiki/World_population"
tables = readHTMLTable(u)
names(tables)
tables[[2]]
tmp = tables[[2]]

for(i in 1:5)
{
for(j in 1:2)
{
print(i*j);
}
}
movies=read.csv("D:/sahubackup/GL/ml-latest-small/movies.csv")
movies
df_genres=data.frame(movies[,3])
df_genres
i=length(movies$title)   ## nrow() returns NULL for a vector; length() counts its elements
i
str(movies)
nr=nrow(movies)
nr
mat_mov_gen=data.matrix(movies,rownames.force = NA)
mat_mov_gen
genres <- as.data.frame(movies$genres, stringsAsFactors=FALSE)
genres
library(data.table)
genres2 <- as.data.frame(tstrsplit(genres[,1], '[|]',
type.convert=TRUE),
stringsAsFactors=FALSE)
genres2
colnames(genres2) <- c(1:10)
colnames(genres2)
genre_list <- c("Action", "Adventure", "Animation", "Children",
"Comedy", "Crime","Documentary", "Drama", "Fantasy",
"Film-Noir", "Horror", "Musical", "Mystery","Romance",
"Sci-Fi", "Thriller", "War", "Western")
genre_matrix <- matrix(0, nrow(movies)+1, 18)   ## one spare row for the header written below (was hard-coded as 9743)
genre_matrix
genre_matrix[1,] <- genre_list
genre_matrix[1,]
colnames(genre_matrix) <- genre_list
colnames(genre_matrix)
for (i in 1:nrow(genres2)) {
for (c in 1:ncol(genres2)) {
genmat_col = which(genre_matrix[1,] == genres2[i,c])
genre_matrix[i+1,genmat_col] <- 1
}
}
genre_matrix2 <- as.data.frame(genre_matrix[-1,],
stringsAsFactors=FALSE)
genre_matrix2
for (c in 1:ncol(genre_matrix2)) {
genre_matrix2[,c] <- as.integer(genre_matrix2[,c])
}
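
## A compact alternative to the header-row trick above: build the 0/1 matrix
## directly from genres2, one column per genre in genre_list (same result):
genre_matrix2 <- as.data.frame(
  sapply(genre_list, function(g) as.integer(rowSums(genres2 == g, na.rm = TRUE) > 0)))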
years <- as.data.frame(movies$title, stringsAsFactors=FALSE)
library(data.table)
substrRight <- function(x, n){
substr(x, nchar(x)-n+1, nchar(x))}
yt=movies$title
yt
class(yt)
ytc=as.character(yt)
ytc
class(ytc)
years <- as.data.frame(substr(substrRight(substrRight(ytc, 6),5),1,4))
years
search_matrix <- cbind(movies[,1], substr(movies[,2],1,nchar(ytc)-6),
years, genre_matrix2)
search_matrix
colnames(search_matrix) <- c("movieId", "title", "year", genre_list)
colnames(search_matrix)
write.csv(search_matrix, "search.csv")
search_matrix <- read.csv("search.csv", stringsAsFactors=FALSE)
search_matrix
subset(search_matrix, Action == 1 & year == 1995)$title
ratings=read.csv("D:/sahubackup/GL/ml-latest-small/ratings.csv")
links=read.csv("D:/sahubackup/GL/ml-latest-small/links.csv")
tags=read.csv("D:/sahubackup/GL/ml-latest-small/tags.csv")
binaryratings <- ratings
binaryratings
for (i in 1:nrow(binaryratings)){
if (binaryratings[i,3] > 3){
binaryratings[i,3] <- 1
}
else{
binaryratings[i,3] <- -1
}
}
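
## the same recoding without the row-by-row loop:
binaryratings$rating <- ifelse(ratings$rating > 3, 1, -1)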

library(reshape2)   ## dcast() used below
binaryratings2 <- dcast(binaryratings, movieId~userId, value.var = "rating",
                        na.rm=FALSE)
binaryratings2
for (i in 1:ncol(binaryratings2)){
binaryratings2[which(is.na(binaryratings2[,i]) == TRUE),i] <- 0
}
binaryratings2 = binaryratings2[,-1]
binaryratings2

#Remove rows that are not rated from movies dataset
movieIds <- length(unique(movies$movieId))
movieIds
ratingmovieIds <- length(unique(ratings$movieId))
ratingmovieIds #10325
movies2 <- movies[-which((movies$movieId %in% ratings$movieId) ==
FALSE),]
movies2
rownames(movies2) <- NULL
rownames(movies2)

#Remove rows that are not rated from genre_matrix2
genre_matrix3 <- genre_matrix2[-which((movies$movieId %in%
ratings$movieId) == FALSE),]
rownames(genre_matrix3) <- NULL
#Calculate dot product for User Profiles
result = matrix(0, 18, ncol(binaryratings2)) # 18 genres, one column per user/rater (668 in the original run)
for (c in 1:ncol(binaryratings2)){
for (i in 1:ncol(genre_matrix3)){
result[i,c] <- sum((genre_matrix3[,i]) * (binaryratings2[,c]))
#ratings per genre
}
}
for (c in 1:ncol(result)){
for (i in 1:nrow(result)){
if (result[i,c] < 0){
result[i,c] <- 0
}
else {
result[i,c] <- 1
}
}
}
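
## the same thresholding in one vectorized step:
result <- ifelse(result < 0, 0, 1)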
library(reshape2)
#Create ratings matrix. Rows = userId, Columns = movieId
ratingmat <- dcast(ratings, userId~movieId, value.var = "rating",
na.rm=FALSE)
ratingmat <- as.matrix(ratingmat[,-1]) #remove userIds
library(recommenderlab)
install.packages("registry")
library(registry)
ratingmat <- as(ratingmat, "realRatingMatrix")
# Determine how similar the first four users are with each other
# create similarity matrix
similarity_users <- similarity(ratingmat[1:4, ],
method = "cosine",
which = "users")
as.matrix(similarity_users)
image(as.matrix(similarity_users), main = "User similarity")
# compute similarity between
# the first four movies
similarity_items <- similarity(ratingmat[, 1:4], method =
"cosine", which = "items")
as.matrix(similarity_items)
image(as.matrix(similarity_items), main = "Item similarity")

# Exploring values of ratings:
vector_ratings <- as.vector(ratingmat@data)
unique(vector_ratings) # what are unique values of ratings

table_ratings <- table(vector_ratings) # what is the count of each rating value
table_ratings

library(ggplot2)
views_per_movie <- colCounts(ratingmat) # count views for each movie

table_views <- data.frame(movie = names(views_per_movie),
                          views = views_per_movie) # create dataframe of views
table_views <- table_views[order(table_views$views,
                                 decreasing = TRUE), ] # sort by number of views

ggplot(table_views[1:6, ], aes(x = movie, y = views)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_x_discrete(labels=subset(movies2, movies2$movieId ==
                                   table_views$movie)$title) +
  ggtitle("Number of views of the top movies")

#Visualizing the matrix:
image(ratingmat, main = "Heatmap of the rating matrix") # hard to read - too many dimensions
image(ratingmat[1:10, 1:15], main = "Heatmap of the first rows and
columns")
image(ratingmat[rowCounts(ratingmat) > quantile(rowCounts(ratingmat),
0.99),
colCounts(ratingmat) > quantile(colCounts(ratingmat),
0.99)],
main = "Heatmap of the top users and movies")

#Normalize the data
ratingmat_norm <- normalize(ratingmat)
image(ratingmat_norm[rowCounts(ratingmat_norm) >
quantile(rowCounts(ratingmat_norm), 0.99),
colCounts(ratingmat_norm) >
quantile(colCounts(ratingmat_norm), 0.99)],
main = "Heatmap of the top users and movies")

#Create UBCF Recommender Model. UBCF stands for User-Based Collaborative Filtering
recommender_model <- Recommender(ratingmat_norm,
                                 method = "UBCF",
                                 param=list(method="Cosine",nn=30))

model_details <- getModel(recommender_model)
model_details$data

recom <- predict(recommender_model,
                 ratingmat[1],
                 n=10) #Obtain top 10 recommendations for 1st user in dataset
recom

#recc_matrix <- sapply(recom@items,
#                      function(x){ colnames(ratingmat)[x] })
#dim(recc_matrix)

recom_list <- as(recom, "list") #convert recommenderlab object to a readable list

#Obtain recommendations
recom_result <- matrix(0,10)
for (i in 1:10){
recom_result[i] <- as.character(subset(movies,
movies$movieId ==
as.integer(recom_list[[1]][i]))$title)
}
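
## a sketch of the same lookup without the loop, via match():
recom_result <- as.character(movies$title[match(as.integer(recom_list[[1]]),
                                                movies$movieId)])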

# Evaluation:
evaluation_scheme <- evaluationScheme(ratingmat,
                                      method="cross-validation",
                                      k=5, given=3,
                                      goodRating=5) #k=5 means 5-fold cross validation; given=3 means a Given-3 protocol
evaluation_results <- evaluate(evaluation_scheme,
method="UBCF",
n=c(1,3,5,10,15,20))
eval_results <- getConfusionMatrix(evaluation_results)[[1]]
eval_results
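
## recommenderlab can also plot the evaluation results directly, e.g. as an
## annotated ROC curve over the n values tried above:
plot(evaluation_results, annotate = TRUE)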

##star triangle
for(i in 1:5)
{
  cat(strrep("*", i), "\n")   ## the original nested loop just printed "*" ten times, not a triangle
}

pascalTriangle <- function(h) {
for(i in 0:(h-1)) {
s <- ""
for(k in 0:(h-i)) s <- paste(s, " ", sep="")
for(j in 0:i) {
s <- paste(s, sprintf("%3d ", choose(i, j)), sep="")
}
print(s)
}
}
pascalTriangle(5)
ap=read.csv("D:/sahubackup/GL/AirPassengers.csv")
start=head(ap,1)
start
end=tail(ap,1)
end
frequency(ap)
findfrequency(ap)
class(ap)
tsap=ts(ap[,2],start=c(1949,1),end=c(1960,12),frequency=12)   ## monthly data, so frequency 12 (not 365); counts assumed in column 2
tsap
class(tsap)
findfrequency(tsap)
frequency(tsap)
plot(ap)
train <- ap[1:132,]   ## first 132 rows (months); ap does not have 132 columns
dim(ap)
library(caTools)
set.seed(123)
split=sample.split(ap[,2],SplitRatio=0.8)   ## sample.split() expects a vector; note a random split ignores time order
train=subset(ap,split==T)
test=subset(ap,split==F)
train
test
train_ts=ts(train, start=1949, frequency=12)
test_ts=ts(test, start=1960, frequency=12)
train_ts
test_ts
library(ggplot2) #Data Visualisation
library(ggfortify) #Data Visualisation
library(forecast)
decomposedres <- decompose(tsap)   ## decompose() needs a ts object, not the raw data frame
plot(decomposedres)
mean_baseline <- meanf(train_ts, h=12)
plot(mean_baseline, type="l")
lines(ap)
accuracy(mean_baseline, test)
sma <- ma(train_ts, order=12)
plot(sma, xlim=c(1949, 1960), ylim=c(0, 600), col="red")
lines(train)
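
## a seasonal-naive forecast (forecast::snaive) is often a fairer baseline
## than the overall mean for a strongly seasonal series like this one:
snaive_baseline <- snaive(train_ts, h=12)
plot(snaive_baseline)
accuracy(snaive_baseline, test)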
pnorm(20,0.1,1)
pnorm(0.997,0.996,0.0033)
pnorm(15,0.6,15)
pnorm(40,65.16,10)-pnorm(50,65.16,10)
qnorm(0.99, mean = 65.16, sd=10)
pnorm(0.998,0.9563,0.0189)-pnorm(0.997,0.9563,0.0189)
1-dpois(0,lambda=3)
dpois(2,lambda=3)+dpois(3,lambda=3)+dpois(4,lambda=3)
dpois(6,lambda=4)
dpois(1,0.15)+dpois(0,0.15)
dpois(10,lambda=10)
## NB: pnorm's 4th argument is lower.tail, so the trailing 100 was silently
## coerced to TRUE; if a sample of n=100 was intended, use sd = 0.02/sqrt(100)
pnorm(20.08,20.05,0.02)-pnorm(20.03,20.05,0.02)
pnorm(20.01,20.05,0.02)
pnorm(3.69,3.25,0.6)-pnorm(2.75,3.25,0.6)
dbinom(0,3,1/6)
dbinom(6,9,0.6)
dbinom(6,10,0.45)
pnorm(178000, 168000, 6324.55) - pnorm(158000, 168000, 6324.55)
ap=read.csv("D:/sahubackup/GL/Food Nutrition.csv")
library(ggplot2)
#par(margin(5,5,1,5),cex.lab=1.2, cex.axis=0.9)

par(mfrow=c(1,3))
plot(ap$Protein_.g., col="blue", pch=19)   ## plot() has no fill argument
barplot(ap$Carbohydrt_.g.,col="red")
pie(ap$Water_.g., main="Piechart", radius=1)

sub_data=ap[which(ap$Water_.g.>=30.0),]
sub_data
library(dplyr)
by_shr=group_by(ap, Shrt_Desc)   ## use the bare column name inside group_by()
by_shr
attach(ap)
sub_data1=subset(ap, Water_.g.>30.0)
sub_data1
sub_data1_ord=arrange(sub_data1,desc(Water_.g.))
sub_data1_ord
sub_data1_fil=filter(sub_data1, Protein_.g.>30,Lipid_Tot_.g.>26 )
sub_data1_fil
par(mar=c(5,5,2,5), cex.lab=1.2, cex.axis=0.9)
with(sub_data1, plot(Protein_.g., Water_.g., type='p',
                     lwd=2, col="blue",   ## ylim=c(0,20) clipped every point (Water > 30 here), so it is dropped
                     xlab="Protein_.g",
                     ylab="Water_.g."))
boxplot(sub_data1$Water_.g.)
boxplot(sub_data1$Protein_.g.)
scatter.smooth(sub_data1$Protein_.g.,sub_data1$Water_.g.)
add = function(a,b)
{
a+b
}
add(3,4)
f_c=function(f)
{
  (f-32)*5/9   ## Fahrenheit to Celsius; the original (9/5)*(f+32) is neither direction of the conversion
}
f_c(32)
for(i in 1:2)
{
for(j in 101:110)
{
print(i+j)
}
}
sq=function(a,b)
{
a*a+b*b
}
sq(3,4)


s=apply(sub_data1[,3:5],2,mean)
s
s1=lapply(sub_data1[,3:5],mean)
s1
s2=sapply(sub_data1[,3:5],mean)
s2

s3 = c("This", "is", "a", "test", "for", "sapply", "function")
sapply(s3, nchar)
tapply(sub_data1$Water_.g.,sub_data1$Shrt_Desc, mean)
install.packages("plot3D")
library(plot3D)
detach(ap)
data=read.csv("D:/sahubackup/GL/iris.csv")
head(data)
x <- sep.l <- data$sepal.length   ## use the freshly read "data", not iris
y <- pet.l <- data$petal.length
z <- sep.w <- data$sepal.width
## the "..." placeholders from the help-file signatures are dropped so these run;
## row names are used as example labels
scatter3D(x, y, z, colvar = z)
text3D(x, y, z, labels = rownames(data), add = FALSE)
points3D(x, y, z)
lines3D(x, y, z)
scatter2D(x, y)
text2D(x, y, labels = rownames(data))
pnorm(44,40,3,lower.tail = FALSE)
qnorm(0.9087,40,3)
qnorm(0.025,0,1)
qnorm(0.975)
qnorm(0.025)
qnorm(0.95,0,1)
qnorm(0.95,0,1,lower.tail = FALSE)
qnorm(0.005,0,1)
qnorm(0.01,0,1,lower.tail = FALSE)
qnorm(0.01,0,1)
qnorm(0.95,0,1,lower.tail = FALSE)
pnorm(-1.25,0,1)
pnorm(2.5,0,1,lower.tail = FALSE)
pnorm(3.16,lower.tail = FALSE)

## variant of the Fibonacci prompt with if instead of while: this computes
## only the third term, so it only "works" for nterms == 3
nterms=as.integer(readline(prompt="how many numbers="))
n1=0
n2=1
count=2
if(is.na(nterms))
{
print("enter a positive num=")
}else{
if(nterms==1){print(n1)
}else{
print(n1)
print(n2)
if(count==nterms)
{
nth=n1+n2
print(nth)
n1=n2
n2=nth
count=count+1
}
}
}
len <- 10
fibvals <- numeric(len)
fibvals[1] <- 1
fibvals[2] <- 1
for (i in 3:len) {
fibvals[i] <- fibvals[i-1]+fibvals[i-2]
}

nterms=as.integer(readline(prompt="how many numbers="))   ## readline() returns character; numeric(len) needs an integer
len=nterms
fibvals <- numeric(len)
fibvals[1] <- 1
fibvals[2] <- 1
for (i in 3:len) {
fibvals[i] <- fibvals[i-1]+fibvals[i-2]
}
print(fibvals)

############ SmartEDA ############
install.packages("ISLR")
library("ISLR")
install.packages("SmartEDA")
library("SmartEDA")
## Load sample dataset from ISLR package
Carseats= ISLR::Carseats
# Overview of the data - Type = 1
ExpData(data=Carseats,type=1)

# Structure of the data - Type = 2
ExpData(data=Carseats,type=2)
ExpNumStat(Carseats,by="A",gp=NULL,Qnt=seq(0,1,0.1),MesofShape=2,Outlier=TRUE,round=2,Nlim=10)
plot1 <- ExpNumViz(Carseats,target=NULL,nlim=10,Page=c(2,2),sample=8)
plot1[[1]]
ExpCTable(Carseats,Target=NULL,margin=1,clim=10,nlim=NULL,round=2,bin=NULL,per=T)
plot2 <- ExpCatViz(Carseats,target=NULL,col="slateblue4",clim=10,margin=2,Page = c(2,1),sample=4)
plot2[[1]]
summary(Carseats[,"Price"])
ExpNumStat(Carseats,by="A",gp="Price",Qnt=seq(0,1,0.1),MesofShape=1,Outlier=TRUE,round=2)
#Note: sample=8 means 8 randomly selected scatter plots
#Note: nlim=4 means only numeric variables with more than 4 unique values are included
plot3 <- ExpNumViz(Carseats,target="Price",nlim=4,scatter=FALSE,fname=NULL,col="green",Page=c(2,2),sample=8)
plot3[[1]]
#Note: sample=4 means 4 randomly selected scatter plots
plot31 <- ExpNumViz(Carseats,target="US",nlim=4,scatter=TRUE,fname=NULL,Page=c(2,1),sample=4)
plot31[[1]]
##bin=4: discretised into 4 categories based on quantiles
ExpCTable(Carseats,Target="Price",margin=1,clim=10,nlim=NULL,round=2,bin=4,per=F)
ExpNumStat(Carseats,by="GA",gp="Urban",Qnt=seq(0,1,0.1),MesofShape=2,Outlier=TRUE,round=2)
plot4 <- ExpNumViz(Carseats,target="Urban",type=1,nlim=NULL,fname=NULL,col=c("darkgreen","springgreen3","springgreen1"),Page=c(2,2),sample=8)
plot4[[1]]
ExpCTable(Carseats,Target="Urban",margin=1,clim=10,nlim=NULL,round=2,bin=NULL,per=F)
ExpCatStat(Carseats,Target="Urban",result="IV",clim=10,nlim=5,bins=10,Pclass="Yes",plot=FALSE,top=20,Round=2)
et4 <- ExpCatStat(Carseats,Target="Urban",result="Stat",clim=10,nlim=5,bins=10,Pclass="Yes",plot=FALSE,top=20,Round=2)
varimp <- ExpCatStat(Carseats,Target="Urban",result="Stat",clim=10,nlim=5,bins=10,Pclass="Yes",plot=TRUE,top=10,Round=2)
plot5 <- ExpCatViz(Carseats,target="Urban",fname=NULL,clim=5,col=c("slateblue4","slateblue1"),margin=2,Page = c(2,1),sample=2)
plot5[[1]]
options(width = 150)
CData = ISLR::Carseats
qqp <- ExpOutQQ(CData,nlim=10,fname=NULL,Page=c(2,2),sample=4)
qqp[[1]]
ExpParcoord(CData,Group=NULL,Stsize=NULL,Nvar=c("Price","Income","Advertising","Population","Age","Education"))
ExpParcoord(CData,Group="ShelveLoc",Stsize=c(10,15,20),Nvar=c("Price","Income"),Cvar=c("Urban","US"))
ExpParcoord(CData,Group="ShelveLoc",Nvar=c("Price","Income"),Cvar=c("Urban","US"),scale=NULL)
ExpParcoord(CData,Group="US",Nvar=c("Price","Income"),Cvar=c("ShelveLoc"),scale="std")
ExpParcoord(CData,Group="ShelveLoc",Stsize=c(10,15,20),Nvar=c("Price","Income","Advertising","Population","Age","Education"))
ExpParcoord(CData,Group="US",Stsize=c(15,50),Cvar=c("ShelveLoc","Urban"))
ExpCustomStat(Carseats,Cvar="Urban",Nvar=c("Age","Price"),stat=c("mean","count"),gpby=TRUE,dcast=F)
ExpCustomStat(Carseats,Cvar="Urban",Nvar=c("Age","Price"),stat=c("mean","count"),gpby=TRUE,dcast=T)
ExpCustomStat(Carseats,Cvar=c("Urban","ShelveLoc"),Nvar=c("Age","Price","Advertising","Sales"),stat=c("mean"),gpby=FALSE,dcast=T)

## gridExtra helpers for combining plots:
## - grid.arrange() and arrangeGrob() arrange multiple ggplots on one page
## - marrangeGrob() arranges multiple ggplots over multiple pages
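
## A minimal sketch of grid.arrange() (assuming gridExtra is installed),
## reusing the Carseats copy CData from above:
library(gridExtra)
library(ggplot2)
p1 <- ggplot(CData, aes(x = Price)) + geom_histogram(binwidth = 10)
p2 <- ggplot(CData, aes(x = Income, y = Price)) + geom_point()
grid.arrange(p1, p2, ncol = 2)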