R Note

The document reads in CSV data on insurance claims, selects specific features for analysis, cleans the data, generates summary statistics, and performs additional analysis, including grouping ages into buckets and generating frequency tables. It then saves the output to a CSV file and creates various plots of the data, including boxplots and scatter plots of features.

# Read data from CSV file

claim_data <- read.csv("D:/sahubackup/GL/Car-2.csv", stringsAsFactors = TRUE) ## keep strings as factors (the pre-R 4.0 default), since factor levels are edited below

## Look at the first few rows.


head(claim_data)

## Choose 5 features only to work on.


work_data <- claim_data[,c("AGE", "MSTATUS", "SEX", "EDUCATION", "RED_CAR")]

## Look at the structure


str(work_data)

## Look at the attributes of work_data$MSTATUS


attributes(work_data$MSTATUS)

## Correct the levels of MSTATUS


levels(work_data$MSTATUS) <- c("Yes", "No")

## Correct the levels using fct_collapse() from the forcats package


library(forcats)
work_data$SEX <- fct_collapse(work_data$SEX, F = c("z_F"))
work_data$SEX
## Now the structure of the data is correct.
## Let's look at the summary statistics
summary(work_data)

## Let's find out how many people aged under 18 are filing claims
work_data[ which(work_data$AGE < 18), ]
work_data
## Let's do some more analysis
## Group the age into buckets
## Add a new variable agegroup with these buckets
work_data$agegroup <- cut(work_data$AGE,
                          breaks = c(0, 35, 50, 100),
                          labels = c("less than 35", "35 to 50", "more than 50"))
work_data$agegroup
## Now see the summary data with the new field added
summary(work_data)

## Generate the frequency tables of RED_CAR and MSTATUS for agegroup


red_car_stats <- table(work_data$agegroup, work_data$RED_CAR)
red_car_stats
mstatus_stats <- table(work_data$agegroup, work_data$MSTATUS)
mstatus_stats
total_cars <- table(work_data$agegroup)
total_cars
## Combine the RED_CAR and MSTATUS into a dataframe
output <- cbind(total_cars, red_car_stats[,2], mstatus_stats[,1])
output <- data.frame(output)

## Check the attribute of the output variable


attributes(output)

## Update the column names of the features


colnames(output) <- c("Total_Cars", "Red_Cars", "Marital_Status")

## Print the output


output

output$red_car_percent <- (output$Red_Cars/output$Total_Cars * 100)


output$red_car_percent <- round(output$red_car_percent, 2)

## Print the output


output

## We can save the output as CSV


write.csv(output, "output.csv")
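## write.csv adds row names as an unnamed first column by default;
## row.names = FALSE keeps the file to just the named columns:
write.csv(output, "output.csv", row.names = FALSE)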

## boxplot for one variable (work_data$AGE)


## The boxplot marks outliers as individual points
boxplot(work_data$AGE)
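## boxplot.stats() lists the same outliers numerically:
boxplot.stats(work_data$AGE)$out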

## Plot more than one variable


plot(work_data$SEX, work_data$AGE)
plot(work_data$EDUCATION, work_data$AGE)

## Example of ggplot
library(ggplot2)
ggplot(data = work_data, mapping = aes(x = SEX, y = AGE)) +
  geom_boxplot(aes(colour = EDUCATION), outlier.colour = "red")

## Uber data analysis


uber=read.csv("D:/sahubackup/GL/Uber Dataset.csv")
uber
dim(uber)
anyNA(uber)
sum(is.na(uber))
sapply(uber, function(x) sum(is.na(x)))
uber$borough = as.factor(replace(as.character(uber$borough), is.na(uber$borough), "Unknown"))
table(uber$borough)
install.packages("lubridate")
library(lubridate)
uber$start_date = strptime(uber$pickup_dt,'%Y-%m-%d %H:%M')
uber$start_date
uber$start_month = month(uber$start_date)
uber$start_month
uber$start_day = day(uber$start_date)
uber$start_hour = hour(uber$start_date)
uber$wday = weekdays(uber$start_date)
uber = uber[,-14] ## drop the POSIXlt start_date column
uber
unique(uber[which(uber$hday=="Y"),c("start_day","start_month")])
table(uber$hday) ## count of holiday vs non-holiday records
table(uber$hday,uber$start_month)
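## Share of records falling on holidays per month (column proportions, a quick sketch):
round(prop.table(table(uber$hday, uber$start_month), margin = 2), 2)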
names(uber)
## Uni-Variate Analysis
boxplot(uber$spd)
hist(uber$spd)
unique(uber[, c('start_month', 'start_day')]) ## unique() on a data frame takes no by argument
plot(aggregate(pickups~hday,data=uber, mean), type="b")

## Bi-Variate analysis
library(corrplot)
corrplot(cor(uber[,4:12]))
plot(uber$spd, uber$pickups, xlab = "speed", ylab = "pickup", main = "pickup vs speed")
abline(lm(uber$pickups~uber$spd))
plot(aggregate(pickups~start_month,data=uber, sum), type="b")
library(dplyr)   ## for filter() and %>%
library(ggplot2)
uber %>%
  filter(., start_month != 2) %>%
  ggplot(aes(x=start_day, y=pickups)) + geom_bar(stat='identity')
plot(aggregate(pickups~start_hour,data=uber, sum), type="b")
ggplot(aes(x = reorder(wday, pickups), y = pickups), data = uber) +
geom_bar(aes(fill=pickups), width=0.5, stat = "identity") +
coord_flip()
ggplot(uber, aes(start_hour, pickups)) +
geom_jitter(alpha = 0.3, aes(colour = borough)) +
geom_smooth(aes(color = borough))
ggplot(uber, aes(start_hour, borough)) +
geom_jitter( alpha = 0.4, aes(color = pcp24 > 0)) +
geom_smooth(aes(color = pcp24 > 0))

## Coffee
install.packages("reshape", type="source")
install.packages("reshape2", type="source")
library(reshape)
library(reshape2)
library(ggplot2)
library(plyr)
library(grid)
install.packages("gridExtra")
library(gridExtra)

coffee=read.csv("D:/sahubackup/GL/Coffee-1.csv")
coffee
dim(coffee)
attach(coffee)
library(lattice)   ## histogram() comes from lattice
histogram(Days_between_Purchase)

count_Brand<-count(coffee$Brand)
count_Brand
data_num <- as.data.frame(apply(coffee, 2, as.numeric))
data_num
ggplot(count_Brand, aes(x, freq)) + geom_bar(stat = "identity")  ## plyr::count() names its columns x and freq

library(corrplot)
corcoffee=cor(data_num)  ## cor() needs numeric columns, so use the converted copy
corcoffee
corrplot(corcoffee)
cor(Price_per_Packet,Income)
library(ggplot2)
ggplot(coffee, aes(x = Days_between_Purchase)) + geom_density()

par(mfrow=c(3,2))
brand <- count(coffee$Brand)      ## frequency tables for the barplots below
edu <- count(coffee$Education)    ## assumes an Education column in the data
barplot(brand$freq,names.arg=brand$x,main="BRAND")
barplot(edu$freq,names.arg=edu$x,main="Education")

cbind(count(coffee$sec), (count(coffee$sec))/sum(count(coffee$sec)$freq))

cast(count(coffee[,c("sec","price_per_packet")], c("sec", "price_per_packet")), sec ~ price_per_packet)
coffee_new<-subset(coffee,Days_between_Purchase>1)
summary(coffee_new$Days_between_Purchase)
ggplot(coffee_new,aes(Brand,Days_between_Purchase))+geom_boxplot()

cost<-coffee[,c("Brand","Price_per_Packet")]
cost<-count(cost, c("Brand", "Price_per_Packet"))
cost<-cast(cost,Brand ~ Price_per_Packet)


qplot(coffee$Days_between_Purchase,data=coffee)

ggplot(coffee,aes(Brand,Days_between_Purchase))+geom_boxplot()

quantile(coffee$Days_between_Purchase)
q1<-6    ## 25th percentile from the quantile() output above
q3<-17   ## 75th percentile
iqr<-q3-q1
upper<-q3+(iqr*1.5)
upper
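## The same upper fence computed directly from quantile() and IQR():
q <- quantile(coffee$Days_between_Purchase)
unname(q["75%"] + 1.5 * IQR(coffee$Days_between_Purchase))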

coffee_new<-subset(coffee,Days_between_Purchase<upper)
summary(coffee_new$Days_between_Purchase)

ggplot(coffee_new,aes(x=Brand,y=Days_between_Purchase))+geom_boxplot()

## Frequency tables for the remaining demographics (Age and Price_Conscious column names assumed)
age <- count(coffee$Age)
sec <- count(coffee$sec)
income <- count(coffee$Income)
price_cons <- count(coffee$Price_Conscious)
barplot(age$freq,names.arg=age$x,main="Age")
barplot(sec$freq,names.arg=sec$x,main="Social Economic Status")
barplot(income$freq,names.arg=income$x,main="Income")
barplot(price_cons$freq,names.arg=price_cons$x,main="Price Conscious")

/////////////HeartDisease/////////////
install.packages("car")
install.packages("tidyr")
install.packages("caret")
install.packages("broom")
install.packages("ROCR")
library(ggplot2)
library(car)
library(dplyr)
library(lattice)
library(tidyr)
library(caret)
library(MASS)
library(broom)
library(ROCR)

heart_data <- read.csv("D:/sahubackup/GL/heart.csv")


heart_data
str(heart_data)
head(heart_data)
names(heart_data)

heart_data$sex <- as.character(heart_data$sex)


heart_data$sex <- ifelse(heart_data$sex=="0",'female','male')
heart_data$sex

heart_data$cp <- factor(heart_data$cp)


heart_data$cp
heart_data$age
heart_data$chol
summary(heart_data)
attach(heart_data)
class(heart_data$cp)
class(heart_data$age)
ggplot(heart_data, aes(x=cp, y=age)) + geom_boxplot()
ggplot(heart_data, aes(x=cp, y=chol)) + geom_boxplot()
ggplot(heart_data, aes(x=cp, y=chol, fill=sex)) + geom_bar(stat = "identity")
histogram(~ age, data = heart_data) ## lattice's histogram() has no binwidth argument
ggplot(heart_data, aes(x = age)) + geom_density(col="red")

ggplot(heart_data, aes(x = age)) + geom_histogram(bins = 30, fill = "dodgerblue4") +
  theme_bw() + theme_classic() + ggtitle("age distribution") + ylab("number of people")
ggplot(heart_data, aes(x = age)) + geom_density(fill = "dodgerblue4") +
  ggtitle("age distribution") + ylab("number of people")
boxplot(heart_data$age, main = "boxplot of age for normality check", col = "dodgerblue4")
qqPlot(heart_data$age,main ="normality check for age",grid = F)
#sex

ggplot(heart_data, aes(x = sex)) + geom_bar(width = 0.2, fill = "green") +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.5) +
  theme_bw() + theme_classic() + ylab("number of count")

table(heart_data$cp) ## chest pain is the cp column in heart.csv
ggplot(heart_data,aes(x = cp)) +
geom_bar(width =0.2,fill ="red") +
geom_text(stat = 'count',aes(label =..count..),vjust = -0.5)
#rest_bp
class(trestbps)
ggplot(heart_data, aes(x = "", y = trestbps)) +
  geom_boxplot(colour = "dodgerblue2") +
  ggtitle("boxplot of rest_bp")   ## colour and the title belong outside aes()
ggplot(heart_data, aes(x = cp, y = trestbps)) +
  geom_boxplot(colour = "dodgerblue2") +
  ggtitle("boxplot of rest_bp by cp")
ggplot(heart_data, aes(trestbps)) + geom_histogram(bins = 20, fill = "green") +
  theme_bw() + theme_classic() + ggtitle("resp_bp")
ggplot(heart_data, aes(trestbps)) + geom_density(fill = "dodgerblue4") +
  theme_bw() + theme_classic() + ggtitle("density plot of resp_bp")

#chol
ggplot(heart_data, aes(x = "", y = chol)) +
  geom_boxplot(colour = "dodgerblue2") +
  ggtitle("boxplot of chol")
ggplot(heart_data,aes(chol)) +
geom_histogram(bins =20,fill ="green") +
ggtitle("chol")
ggplot(heart_data,aes(chol)) +
geom_density(fill ="dodgerblue4") +
theme_bw() + theme_classic()+ggtitle("density plot of chol")
## fbs is the fasting blood sugar column in heart.csv; title.center is replaced with the equivalent theme() call
table(heart_data$fbs)
ggplot(heart_data, aes(x = factor(fbs))) + geom_bar(width = 0.1, fill = "green") +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.5) +
  theme_bw() + theme_classic() + ylab("number of count") + ggtitle("blood sugar") +
  theme(plot.title = element_text(hjust = 0.5))
ggplot(heart_data, aes(factor(fbs))) + geom_bar(width = 0.2, fill = "dodgerblue4") +
  theme_bw() + theme_classic() +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.2) +
  ggtitle("barplot of fasting blood sugar") +
  theme(plot.title = element_text(hjust = 0.5))

## rest_ecg is named restecg in heart.csv
ggplot(heart_data, aes(factor(restecg))) + geom_bar(width = 0.2, fill = "dodgerblue4") +
  theme_bw() + theme_classic() +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.2) +
  ggtitle("barplot of rest_ecg") +
  theme(plot.title = element_text(hjust = 0.5))

#max heart-rate (named thalach in heart.csv)
ggplot(heart_data, aes(thalach)) + geom_histogram(fill = "dodgerblue4", alpha = 0.5) +
  theme_bw() + theme_classic()
ggplot(heart_data, aes(thalach)) + geom_density(fill = "red", alpha = 0.5) +
  theme_bw() + theme_classic()
boxplot(heart_data$thalach, col = "lightblue", notch = T,
        main = "boxplot of the maximum heart rate")

## exercise angina is named exang in heart.csv
ggplot(heart_data, aes(factor(exang))) + geom_bar(width = 0.2, fill = "dodgerblue4") +
  theme_bw() + theme_classic() +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.2) +
  ggtitle("barplot of exercise angina")

ggplot(heart_data, aes(factor(slope))) + geom_bar(width = 0.2, fill = "dodgerblue4") +
  theme_bw() + theme_classic() +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.2) +
  ggtitle("barplot of slope")

ggplot(heart_data, aes(factor(thal))) + geom_bar(width = 0.2, fill = "dodgerblue4") +
  theme_bw() + theme_classic() +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.2) +
  ggtitle("barplot of thal")

##Correlation
heart_data$sex
class(heart_data$sex)
str(heart_data)
heart_data_wcp = heart_data[, sapply(heart_data, is.numeric)] ## keep numeric columns only; cor() cannot handle sex (character) or cp (factor)
heart_data_wcp
str(heart_data_wcp)
cor(heart_data_wcp)
install.packages("corrplot")
library(corrplot)
corl=cor(heart_data_wcp)
corl
corrplot(corl, method = "pie", type = "lower")
round(table(heart_data$target)/nrow(heart_data), digits = 2) ## So in our dataset 54% have heart disease while the rest do not.
##MR
library(caTools)
set.seed(123)
split = sample.split(heart_data$target, SplitRatio = 0.8)
training_set = subset(heart_data, split == TRUE)
training_set
test_set = subset(heart_data, split == FALSE)
test_set

regressor = lm(formula = target ~ ., data = training_set)
summary(regressor)

regressor2 = lm(formula = target ~ cp, data = training_set)
summary(regressor2)

regressor2 = lm(formula = target ~ ca + cp, data = training_set)
summary(regressor2)

regressor3 = lm(formula = target ~ cp, data = training_set)
summary(regressor3)

# Predicting the Test set results


y_pred = predict(regressor3, newdata = test_set)
y_pred
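## A quick check of fit on the test data (RMSE of the linear model, a sketch):
sqrt(mean((test_set$target - y_pred)^2))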

library(ggplot2)
ggplot() +
  geom_point(aes(x = training_set$cp, y = training_set$target), colour = 'red') +
  geom_line(aes(x = training_set$cp, y = predict(regressor3, newdata = training_set), group = 1),
            colour = 'blue') +   ## group = 1 so a line is drawn across the factor levels
  ggtitle('cp vs target (Training set)') +
  xlab('cp') +
  ylab('target')

library(ggplot2)
ggplot() +
  geom_point(aes(x = test_set$cp, y = test_set$target), colour = 'red') +
  geom_line(aes(x = training_set$cp, y = predict(regressor3, newdata = training_set), group = 1),
            colour = 'blue') +
  ggtitle('cp vs target (Test set)') +
  xlab('cp') +
  ylab('target')

# Fitting Random Forest Regression to the dataset


install.packages('randomForest')
library(randomForest)
set.seed(1234)
class(heart_data$target)
heart_data$target <- as.factor(heart_data$target)
class(heart_data$target)

sample.index <- sample(2, nrow(heart_data), replace = T, prob = c(0.6, 0.4))

heart.train_RF <- heart_data[sample.index == 1,]
heart.test_RF <- heart_data[sample.index == 2,]
heart.train_RF
heart.test_RF
regressor = randomForest(target ~ sex, data = heart_data,
                         ntree = 10) ## formula interface: heart_data[2] is the character sex column, which the x/y interface cannot handle
attributes <- names(heart_data)
attributes <- attributes[!attributes %in% c("target")]
attributes1 <- paste(attributes, collapse = "+") # saves the column names separated by a plus sign
formula.rf <- as.formula(paste("target", attributes1, sep = " ~ "))
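## Sanity check (the assembled formula should read target ~ age + sex + ...):
formula.rf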

model.rf <- randomForest(formula.rf, heart.train_RF, ntree = 1000, importance = TRUE)
plot(model.rf)

# Variable Importance Table


var.imp <- data.frame(importance(model.rf, type=2))
var.imp
# make row names as columns
var.imp$Variables <- row.names(var.imp)
var.imp[order(var.imp$IncNodePurity, decreasing = T),]

# Predicting a new result with Random Forest Regression
# NOTE: the remainder of this block is template code for a Position_Salaries-style
# dataset (columns Level and Salary); it will not run against heart_data as-is.


y_pred = predict(regressor, data.frame(Level = 6.5))

# Visualising the Random Forest Regression results (higher resolution)

library(ggplot2)
x_grid = seq(min(dataset$Level), max(dataset$Level), 0.01)
ggplot() +
geom_point(aes(x = dataset$Level, y = dataset$Salary),
colour = 'red') +
  geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),
            colour = 'blue') +
ggtitle('Truth or Bluff (Random Forest Regression)') +
xlab('Level') +
ylab('Salary')

full.mod <- glm(target ~ ., data = training_set, family = binomial)
summary(full.mod)
#checking the model accuracy
prob <- full.mod %>% predict(test_set,type ="response")
predicted.class1 <- ifelse(prob>0.5,1,0)
mean(predicted.class1==test_set$target)
#accuracy =0.85
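## The same comparison laid out as a confusion matrix (a quick sketch):
table(predicted = predicted.class1, actual = test_set$target)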
#stepwise logistic regression in R
step.model <- full.mod %>% stepAIC(trace = F)
summary(step.model)
prob.step <- step.model %>% predict(test_set,type ="response")
predicted.class2 <- ifelse(prob.step>0.5,1,0)
mean(predicted.class2==test_set$target)

model_check <- glm(target ~ ., data = heart_data, family = binomial)
prob.check <- predict(model_check, type = "response")
my_data <- heart_data %>% select_if(is.numeric)
predictors <- colnames(my_data)
my_data <- my_data %>% mutate(logit = log(prob.check/(1-prob.check))) %>%
  gather(key = "predictors", value = "predicted.value", -logit)

#plotting the graph for checking linearity


ggplot(my_data, aes(x = logit, y = predicted.value)) + geom_point() +
  geom_smooth(method = "loess") +
  theme_classic() + theme_bw() + facet_wrap(~predictors, scale = "free_y")

/////////////////// CarSeats//////////////////////
rm(list = ls())
install.packages("MASS")
install.packages("psych")
install.packages("Boruta")
library(Boruta)
################################## Cleaning the R Environment and Loading Libraries ##################################
rm(list = ls())
setwd("C:/OldStuff/")
wants <- c("readxl", "plyr", "data.table", "dplyr", "tidyr",
"stringi", "stringr", "lubridate",
"BLPestimatoR", "dummies", "zoo", "car", "caret", "MASS",
"caTools", "Boruta",
"lmtest")
has <- wants %in% rownames(installed.packages())
if(any(!has)) install.packages(wants[!has])
lapply(wants, require, character.only = TRUE)
rm("wants","has")

carSeats <- read.csv("Carseats.csv")

## User defined functions


calculateRMSE <- function(df, actual, predicted)
{
MSE <- sum((df[, actual] - df[, predicted])^2)/nrow(df)
RMSE <- round(sqrt(MSE), 3)
RMSE
}

## Variable Selection using Stepwise Regression


variableSelectionUsingStepWiseRegression <- function(modelData, targetVar) {
  baseFormula <- as.formula(paste0(targetVar, " ~ 1"))
  base.mod <- lm(baseFormula, data = modelData)  # base intercept-only model
  allFormula <- as.formula(paste0(targetVar, " ~ ."))
  all.mod <- lm(allFormula, data = modelData)    # full model with all predictors
  stepMod <- step(base.mod, scope = list(lower = base.mod, upper = all.mod),
                  direction = "both", trace = 1, steps = 1000)  # perform the stepwise algorithm
  shortlistedVars <- setdiff(names(stepMod$model), targetVar)
  shortlistedVars
}

calculateRSquared <- function(model, y) {


#y <- as.numeric(model$x)
y <- y[!is.na(y)]
id <- which(!is.na(y))
moy <- mean(y)
N <- length(y)
fittedVal <- y[id] - as.numeric(model$residuals)[id]
p <- length(model$coefficients)-1
SSres <- sum((y-fittedVal)^2, na.rm=TRUE)
SStot <-sum((y-moy)^2, na.rm=TRUE)
RSquared <- round((1-(SSres/SStot)), 4)
AdjustedRsquared <- round((1-(((1-RSquared)*(N-1))/(N-p-1))), 4)
modelAIC <- model$aic
return(data.frame(RSquared, AdjustedRsquared))
}

## Plotting a Histogram on the Age variable to decide on the number of Bins


Age <- carSeats$Age
hist_Age <- hist(Age, breaks = 5, xlim = c(min(Age), max(Age)), col = "Steelblue3", right = F)

# Inference:- We can see that there is a steep jump from the first bin to the second bin,
# third to fourth bin and fourth to fifth bin.
# Therefore we will bin the Age variable into 4 bins: 0-29, 30-49, 50-59, 60+

## Binning the Age variable as per the inference from the histogram
carSeats$ageGroup <- cut(carSeats$Age, breaks = c(0, 29, 49, 59, 1000),
                         labels = c("lessthan30", "30to49", "50to59", "60+"))
carSeats$ageGroup <- as.character(carSeats$ageGroup)

## Log Transformation of Population


carSeats$log_Population <- log(carSeats$Population)

## Making education as a factor variable


carSeats$Education <- as.factor(carSeats$Education)

## Keeping only relevant variables in the data


delVars <- c("Population", "Age") # deleting Population and Age since we are using transformations of them
carSeats <- carSeats[, !colnames(carSeats) %in% delVars]

## Variable Selection using Boruta, which is based on the Random Forest Algorithm

set.seed(456)
boruta <- Boruta(Sales~., data = carSeats, doTrace = 2)
print(boruta)
vars_Boruta <- getSelectedAttributes(boruta, withTentative = F)
vars_Boruta

## Variable Selection using Stepwise Regression


vars_Stepwise <- variableSelectionUsingStepWiseRegression(carSeats, "Sales")
vars_Stepwise

# If we compare the results from Boruta and Stepwise, we can see that there are 7
# common variables chosen by both methods. The Stepwise method chooses one
# additional variable.

## Splitting the data into Train and Test


sample_size <- floor(0.8 * nrow(carSeats))
set.seed(123)
trainIndicator <- sample(seq_len(nrow(carSeats)), size = sample_size)
trainData <- carSeats[trainIndicator, ]
testData <- carSeats[-trainIndicator, ]

## Approach 1: Model Fitting using the variables selected by the Stepwise method

targetVar <- "Sales"
Formula <- as.formula(paste0(targetVar, " ~ ", paste(vars_Stepwise, collapse = " + ")))
Fit_Stepwise <- lm(Formula, data = trainData)
summary(Fit_Stepwise)

# Checking for Multicollinearity using VIF


vif(Fit_Stepwise) # GVIF<2 suggests that the model doesn't suffer from Multicollinearity

# Checking for Heteroscedasticity using BP Test


bptest(Fit_Stepwise) # p-value>0.05 suggests that the residuals are Homoscedastic

# Checking for Auto Correlation


dwtest(Fit_Stepwise) # DW ~ 2 suggests that there is not much auto-correlation among error terms

# Checking variable importance


varImportance_Stepwise <- varImp(Fit_Stepwise)
varImportance_Stepwise$Vars <- rownames(varImportance_Stepwise)
rownames(varImportance_Stepwise) <- NULL
varImportance_Stepwise$Overall <- round(varImportance_Stepwise$Overall, 2)
varImportance_Stepwise <- varImportance_Stepwise[order(-varImportance_Stepwise$Overall), c("Vars", "Overall")]

## Approach 2: Model Fitting using the variables selected by the Boruta method


targetVar <- "Sales"
Formula <- as.formula(paste0(targetVar, " ~ ", paste(vars_Boruta, collapse = " + ")))
Fit_Boruta <- lm(Formula, data = trainData)
summary(Fit_Boruta)

# Checking for Multicollinearity using VIF


vif(Fit_Boruta) # GVIF<2 suggests that the model doesn't suffer from Multicollinearity

# Checking for Heteroscedasticity using BP Test


bptest(Fit_Boruta) # p-value>0.05 suggests that the residuals are Homoscedastic

# Checking for Auto Correlation


dwtest(Fit_Boruta) # DW ~ 2 suggests that there is not much auto-correlation among error terms

# Checking variable importance


varImportance_Boruta <- varImp(Fit_Boruta)
varImportance_Boruta$Vars <- rownames(varImportance_Boruta)
rownames(varImportance_Boruta) <- NULL
varImportance_Boruta$Overall <- round(varImportance_Boruta$Overall, 2)
varImportance_Boruta <- varImportance_Boruta[order(-varImportance_Boruta$Overall), c("Vars", "Overall")]

## Final Model: Fitting using only significant variables


targetVar <- "Sales"
vars <- c("CompPrice", "Income", "Advertising", "Price", "ShelveLoc", "ageGroup")
Formula <- as.formula(paste0(targetVar, " ~ ", paste(vars, collapse = " + ")))
Fit <- lm(Formula, data = trainData)
summary(Fit)

# Checking for Multicollinearity using VIF


vif(Fit) # GVIF<2 suggests that the model doesn't suffer from Multicollinearity

# Checking for Heteroscedasticity using BP Test


bptest(Fit) # p-value>0.05 suggests that the residuals are Homoscedastic

# Checking for Auto Correlation


dwtest(Fit) # DW ~ 2 suggests that there is not much auto-correlation among error terms

# Checking variable importance


varImportance <- varImp(Fit)
varImportance$Vars <- rownames(varImportance)
rownames(varImportance) <- NULL
varImportance$Overall <- round(varImportance$Overall, 2)
varImportance <- varImportance[order(-varImportance$Overall), c("Vars", "Overall")]

## Checking for RMSE on the Train Data


trainData$Sales_Pred_Stepwise <- fitted(Fit_Stepwise)
trainData$Sales_Pred_Boruta <- fitted(Fit_Boruta)
trainData$Sales_Pred_FinalModel <- fitted(Fit)

trainRMSE_Stepwise <- calculateRMSE(trainData, "Sales", "Sales_Pred_Stepwise")
trainRMSE_Boruta <- calculateRMSE(trainData, "Sales", "Sales_Pred_Boruta")
trainRMSE_FinalModel <- calculateRMSE(trainData, "Sales", "Sales_Pred_FinalModel")

## Prediction on the test data using all 3 models


pred_Stepwise <- predict(Fit_Stepwise, testData)
testData$Sales_Pred_Stepwise <- pred_Stepwise

pred_Boruta <- predict(Fit_Boruta, testData)
testData$Sales_Pred_Boruta <- pred_Boruta
pred_FinalModel <- predict(Fit, testData)
testData$Sales_Pred_FinalModel <- pred_FinalModel

testRMSE_Stepwise <- calculateRMSE(testData, "Sales", "Sales_Pred_Stepwise")
testRMSE_Boruta <- calculateRMSE(testData, "Sales", "Sales_Pred_Boruta")
testRMSE_FinalModel <- calculateRMSE(testData, "Sales", "Sales_Pred_FinalModel")

## Preparing Model Diagnostics Output for Comparison


outputData_Stepwise <- calculateRSquared(Fit_Stepwise, trainData[, targetVar])
outputData_Stepwise$Train_RMSE <- trainRMSE_Stepwise
outputData_Stepwise$Test_RMSE <- testRMSE_Stepwise
outputData_Stepwise$VarSelectionMethod <- "Stepwise"

outputData_Boruta <- calculateRSquared(Fit_Boruta, trainData[, targetVar])
outputData_Boruta$Train_RMSE <- trainRMSE_Boruta
outputData_Boruta$Test_RMSE <- testRMSE_Boruta
outputData_Boruta$VarSelectionMethod <- "Boruta"

outputData_FinalModel <- calculateRSquared(Fit, trainData[, targetVar])
outputData_FinalModel$Train_RMSE <- trainRMSE_FinalModel
outputData_FinalModel$Test_RMSE <- testRMSE_FinalModel
outputData_FinalModel$VarSelectionMethod <- "FinalModel"

outputData <- rbind(outputData_FinalModel, outputData_Stepwise, outputData_Boruta)
rm(outputData_Stepwise, outputData_Boruta, outputData_FinalModel)

///////////////// CardioFitness////////////
r<-"hello"
r
w="C://Users//00002998//CardioGoodFitness.csv"
ds=read.csv(w, header=TRUE)
ds
library(caTools)
set.seed(123)
split=sample.split(ds$Income, SplitRatio = 2/3)
training_set=subset(ds,split==TRUE)
test_set=subset(ds,split==FALSE)
training_set
dim(ds)
dim(training_set)
names(ds)
str(ds)
ds[1:10,]
ds[1:10,"Product"]
summary(ds)
table(ds$Product)
table(ds$Product, ds$Gender)
boxplot(ds$Age~ds$Product, horizontal=TRUE, col=c("Green","Red"))
attach(ds)
table(Product, Gender)
summary(ds)
by(ds, INDICES=Product, FUN=summary)
install.packages("rpivotTable")
library(rpivotTable)
rpivotTable(ds)
library(lattice)
histogram(~Miles|factor(Product),data=ds)
cor(Miles,Usage)
Model=lm(Miles~Usage, data=ds)
summary(Model)
pie(table(Product, Gender))
pie(table(Product))
plot(density(Income))
file="C://Users//00002998//CardioGoodFitness.csv"
MyDataset=read.csv(file,header=TRUE)
MyDataset
plot(Income, Gender)
plot(Income, Product)
plot(ds)
install.packages("party")
library("party")
ds_ctree=ctree(Product ~ ., data = ds)   ## the original call was incomplete; assuming a fit on all predictors
file="C://Users//00002998//iris_flowers_new.csv"
iris=read.csv(file,1)
iris
## Column names follow iris_flowers_new.csv (lowercase, with a 'flower' label column)
iris_ctree <- ctree(flower ~ sepal.length + sepal.width + petal.length, data=iris)
iris_ctree
plot(iris_ctree)
attach(iris)
plot(jitter(sepal.length), jitter(sepal.width))
smoothScatter(sepal.length, sepal.width)
distMatrix <- as.matrix(dist(iris[,1:4]))
heatmap(distMatrix)
pdf("iris.pdf")           ## open a PDF graphics device
heatmap(distMatrix)
dev.off()                 ## close the device so the file is written
set.seed(1234)
split=sample.split(iris$flower, SplitRatio = 2/3)   ## sample.split expects the label vector
training_set=subset(iris,split==TRUE)
test_set=subset(iris,split==FALSE)
training_set
myFormula=flower ~ sepal.length + sepal.width + petal.length
myf_ctree=ctree(myFormula, data=training_set)
table(predict(myf_ctree), training_set$flower)
print(myf_ctree)
plot(myf_ctree)
plot(myf_ctree,type="simple")
test_pred=predict(myf_ctree, newdata = test_set)
table(test_pred, test_set$flower)
setwd ("C:/Users/00002998")
setwd ("C:/Users/00002998/R Programming")
getwd()
setwd("D:/sahubackup/GL/R Programming")
getwd()

//////////////ANOVA///////////////
library(plot3D)
iris=read.csv("D:/sahubackup/GL/iris.csv")   ## this CSV uses lowercase column names and a 'variety' label
head(iris)
x <- sep.l <- iris$sepal.length
x
y <- pet.l <- iris$petal.length
z <- sep.w <- iris$sepal.width
scatter3D(x, y, z, clab = c("Sepal", "Width (cm)"))
scatter3D(x, y, z, bty = "f", colkey = FALSE, main ="bty= 'f'")
scatter3D(x, y, z, bty = "g", colkey = FALSE, main ="bty= 'g'")
# User defined
scatter3D(x, y, z, pch = 18, bty = "u", colkey = FALSE,
main ="bty= 'u'", col.panel ="steelblue", expand =0.4,
col.grid = "darkblue")
## Other plot3D helpers (function signatures, for reference only):
## text3D(x, y, z, labels, colvar = NULL, add = FALSE)
## points3D(x, y, z, ...)
## lines3D(x, y, z, ...)
## scatter2D(x, y, colvar = NULL, col = NULL, add = FALSE)
## text2D(x, y, labels, colvar = NULL, col = NULL, add = FALSE)
library(ggplot2)
qplot(iris$sepal.length, iris$sepal.width, col="red", fill="iris$sepal.width")
ggplot(iris, aes(x=sepal.length))+
  geom_histogram(aes(y=iris$sepal.width), binwidth=5, col="black", fill="red")
library(dplyr)
tbl_df(iris)
glimpse(iris)
View(iris)
iris %>%
group_by(variety) %>%
summarise(avg = mean(sepal.width)) %>%
arrange
library(tidyr)
gather(iris, "new", "n", 2:4)
slice(iris, 10:15)
summarise_each(iris, funs(mean))
count(iris, variety, wt = sepal.length)
summarise(iris, avg = mean(sepal.length))
group_by(iris, variety)
library(stringr)
str_detect(iris$variety, "z")
library(MASS)
data=Cars93
data
attach(data)
names(data)
ggplot(data, aes(x=Price))+
  geom_histogram(binwidth=5, col="red", fill="blue")  ## geom_bar() no longer accepts binwidth

ggplot(data, aes(x=RPM))+
geom_histogram(aes(y=..density..), col="red", fill="Black")+
geom_density(alpha=.2, fill = "pink")

ggplot(data, aes(x=Weight))+
  geom_histogram(aes(y=..density..), binwidth = 2, colour = "black", fill = "white")+
  geom_density(alpha=.2, fill = "pink")

ggplot(data, aes(x = Type, fill = Type)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5)

ggplot(data, aes(x = Type, y=Price, fill = Type)) +
  geom_boxplot() +
  guides(fill = FALSE)

ggplot(data, aes(x = Horsepower, y = MPG.city, colour = Cylinders)) +
  geom_point()

ggplot(data, aes(x = Horsepower, y = MPG.city)) +
  geom_point() +
  facet_wrap( ~ Cylinders, ncol = 3)
View(Cars93)

df <- data_frame(x.to = c( 2, 3, 3, 2,-2,-3,-3,-2),
                 y.to = c( 3, 2,-2,-3,-3,-2, 2, 3),
                 x = 0,
                 y = 0,
                 x_gt_y = abs(x.to) > abs(y.to),
                 xy_sign = sign(x.to*y.to) == 1,
                 x_gt_y_equal_xy_sign = x_gt_y == xy_sign)
df

ggplot(df) +
  geom_segment(aes(x = x, y = y, xend = x.to, yend = y.to, color = x_gt_y, linetype = !xy_sign),
               arrow = arrow(length = unit(0.25,"cm"))) +
  coord_equal()

ggplot() +
  geom_curve(data = df %>% filter(x_gt_y_equal_xy_sign),
             aes(x = x, y = y, xend = x.to, yend = y.to, color = x_gt_y_equal_xy_sign),
             curvature = 0.75, angle = -45,
             arrow = arrow(length = unit(0.25,"cm"))) +
  geom_curve(data = df %>% filter(!x_gt_y_equal_xy_sign),
             aes(x = x, y = y, xend = x.to, yend = y.to, color = x_gt_y_equal_xy_sign),
             curvature = -0.75, angle = 45,
             arrow = arrow(length = unit(0.25,"cm"))) +
  coord_equal() +
  theme(legend.position = "bottom") +
  xlim(-4, 4) + ylim(-4, 4)
ggplot(df) +
  geom_curve(aes(x = x, y = y, xend = x.to, yend = y.to, color = x_gt_y_equal_xy_sign),
             curvature = 0.75, angle = -45,
             arrow = arrow(length = unit(0.25,"cm"))) +
  coord_equal() +
  theme(legend.position = "bottom") +
  xlim(-4, 4) + ylim(-4, 4)

df2 <- data.frame(supp=rep(c("VC", "OJ"), each=3),
                  dose=rep(c("D0.5", "D1", "D2"),2),
                  len=c(6.8, 15, 33, 4.2, 10, 29.5))

head(df2)

ggplot(data=df2, aes(x=dose, y=len, group=supp)) +
  geom_line(linetype="dashed", color="blue", size=1.2)+
  geom_point()

data_summary <- function(data, varname, groupnames){
  require(plyr)
  summary_func <- function(x, col){
    c(mean = mean(x[[col]], na.rm=TRUE),
      sd = sd(x[[col]], na.rm=TRUE))
  }
  data_sum <- ddply(data, groupnames, .fun=summary_func, varname)
  data_sum <- rename(data_sum, c("mean" = varname))
  return(data_sum)
}

df3 <- data_summary(ToothGrowth, varname="len", groupnames=c("supp", "dose"))
head(df3)

ggplot(df3, aes(x=dose, y=len, group=supp, color=supp)) +
  geom_errorbar(aes(ymin=len-sd, ymax=len+sd), width=.1) +
  geom_line() + geom_point()+
  scale_color_brewer(palette="Paired")+theme_minimal()

# Use position_dodge to move overlapped errorbars horizontally


ggplot(df3, aes(x=dose, y=len, group=supp, color=supp)) +
geom_errorbar(aes(ymin=len-sd, ymax=len+sd), width=.1,
position=position_dodge(0.05)) +
geom_line() + geom_point()+
scale_color_brewer(palette="Paired")+theme_minimal()
View(ToothGrowth)

## The gather() examples below assume a wide data frame DF with columns
## Group, Year, Qtr.1, Qtr.2, Qtr.3, Qtr.4
long_DF <- DF %>% gather(Quarter, Revenue, Qtr.1:Qtr.4)
head(long_DF, 24)

## Equivalent ways of specifying the columns to gather:
DF %>% gather(Quarter, Revenue, Qtr.1:Qtr.4)
DF %>% gather(Quarter, Revenue, -Group, -Year)
DF %>% gather(Quarter, Revenue, 3:6)
DF %>% gather(Quarter, Revenue, Qtr.1, Qtr.2, Qtr.3, Qtr.4)
square.it=function(x)
{
  square=x*x
  return(square)   ## was return(Square): R is case-sensitive
}
hist(c(3, 5, 10, 10, 11, 12, 12, 14, 14, 14, 19))
pnorm(100,0.03)

#Kaggle DS LR
train=read.csv("D:/sahubackup/GL/LR/Kaggle/train.csv")
train
test=read.csv("D:/sahubackup/GL/LR/Kaggle/test.csv")
test
library(ggplot2)
numberofNAs=length(which(is.na(train)==T))
if(numberofNAs>0)
{
cat('Number of missing values found: ', numberofNAs)
cat('\nRemoving missing values...')
train = train[complete.cases(train), ]
}
#par(mfrow=c(2,1), size=)
boxplot(train$x)
boxplot(test$x)
boxplot(train$x, main='X', sub=paste('Outliers: ', boxplot.stats(train$x)$out))
boxplot(test$x, main='X', sub=paste('Outliers: ', boxplot.stats(test$x)$out))

av=read.csv("D:/sahubackup/GL/LR/av.csv")
av

levels(av$group)
acgrp=ordered(av$group, levels=c("ctrl","trt1","trt2"))
acgrp
library(dplyr)
grp=group_by(av,group) %>%
  summarise(count=n(),
            mean=mean(weight, na.rm = TRUE),
            sd=sd(weight, na.rm = TRUE))
grp=av$group
grp
factor.grp=factor(grp)
factor.grp
attach(av)
wday = c("Mon","Fri","Mon","Wed","Wed","Sat")
factor.wday = factor(wday)
factor.wday

library(ggplot2)
ggplot(av, aes(x=group, y=weight, fill=group))+
  geom_boxplot()+
  scale_x_discrete(limits = c("ctrl", "trt1", "trt2"))+  ## geom_boxplot() has no order argument
  theme_classic() +
  theme(legend.position = "none")
boxplot(av)
boxplot(weight~group)
av
acgrp
names(av)
ggplot(av, aes(x = group, y = weight)) +   ## Simple box plot of weight by group
  geom_boxplot()

res.aov <- aov(weight ~ group, data = av)


res.aov
summary(res.aov)

TukeyHSD(res.aov)
library(multcomp)
summary(glht(res.aov, linfct = mcp(group = "Tukey")))
pairwise.t.test(av$weight, av$group,
p.adjust.method = "BH")
library(car)
leveneTest(weight ~ group, data = av)

#two way anova


tanv=read.csv("D:/sahubackup/GL/ml-latest-small/tav.csv")
tanv
attach(tanv)
unique(dose)
table(supp,dose)

ggplot(tanv, aes(x = dose, y = len, color=supp)) +   ## Simple box plot of len by dose and supp
  geom_boxplot()

ggplot(tanv, aes(x = interaction(supp, dose), y = len, color=supp))+
  geom_boxplot()   ## one box per supp-dose combination (the len ~ supp * dose idea)

res.aov2 <- aov(len ~ supp + dose, data = tanv)


summary(res.aov2)
res.aov3 <- aov(len ~ supp * dose, data = tanv)
res.aov3 <- aov(len ~ supp + dose + supp:dose, data = tanv)
summary(res.aov3)

library(leaps)       ## regsubsets() comes from the leaps package
swissdata <- swiss   ## assuming the built-in swiss dataset
models <- regsubsets(Fertility~., really.big=TRUE, method="exhaustive",
                     data = swissdata, nvmax = 5)
res.sum <- summary(models)
data.frame(
Adj.R2 = which.max(res.sum$adjr2),
CP = which.min(res.sum$cp),
BIC = which.min(res.sum$bic)
)

get_model_formula <- function(id, object, outcome){
  models <- summary(object)$which[id,-1]
  predictors <- names(which(models == TRUE))
  predictors <- paste(predictors, collapse = "+")
  as.formula(paste0(outcome, "~", predictors))
}
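
## get_cv_error is not defined in these notes; a minimal sketch using caret's
## 5-fold cross-validation (returns the CV RMSE for a given model formula):
library(caret)
get_cv_error <- function(model.formula, data){
  set.seed(1)
  train.control <- trainControl(method = "cv", number = 5)
  cv <- train(model.formula, data = data, method = "lm", trControl = train.control)
  cv$results$RMSE
}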
model.ids <- 1:5
library(purrr)   ## for map()
cv.errors <- map(model.ids, get_model_formula, models, "Fertility") %>%
  map(get_cv_error, data = swissdata) %>%
  unlist()
cv.errors
## Sketch from an exercise: PL, Advertisement and Sales are assumed to exist
Price=factor(PL, levels=c(1,2,3), labels=c("high","med","low"))
interaction.plot(Price, Advertisement, Sales, col=c("red","yellow","blue"))
library(MASS)
library(reshape2)
data("Boston", package="MASS")
data <- Boston
data
View(Boston)
pairs(Boston)
bosmelt <- melt(Boston, id="crim")
ggplot(bosmelt, aes(x=value, y=crim))+
facet_wrap(~variable, scales="free")+
geom_point()

## confint on a fitted simple linear model from another exercise (SLM, coefficient PerOcc):
confint(SLM, "PerOcc")

///////////////ANOVA-DentalHardness/////////
my_data=read.csv("D:/sahubackup/GL/Dental Hardness.csv")
attach(my_data)
View(my_data)

my_data$dentist<-factor(my_data$dentist)
my_data$method<-factor(my_data$method)
my_data$alloy<-factor(my_data$alloy)
my_data$temperature<-factor(my_data$temperature)

hist(my_data[my_data$temperature==1500,]$hardness)
hist(my_data[my_data$temperature==1600,]$hardness)
hist(my_data[my_data$temperature==1700,]$hardness)

shapiro.test(my_data[my_data$temperature==1500,]$hardness)$p.value
shapiro.test(my_data[my_data$temperature==1600,]$hardness)$p.value
shapiro.test(my_data[my_data$temperature==1700,]$hardness)$p.value
str(my_data)

library(car)
leveneTest(my_data$hardness~my_data$temperature)

aov1 <- aov(my_data$hardness~my_data$temperature)
summary(aov1)
power.anova.test(groups=3, n=30, between.var = 41089, within.var = 20792, sig.level = 0.05)
kruskal.test(my_data$hardness~my_data$temperature)
t.test(my_data$hardness, mu=720, alternative="two.sided", conf.level=0.95)
wilcox.test(my_data$hardness, mu=720)
power.t.test(n=90, delta=-21.778, sd=145.7678, alternative="two.sided", sig.level=0.05)
power.t.test(delta=-21.778, sd=145.7678, power=.8, alternative="two.sided", sig.level=0.05)

t.test(my_data[my_data$alloy==1,]$hardness, my_data[my_data$alloy==2,]$hardness, paired = FALSE)
wilcox.test(my_data[my_data$alloy==1,]$hardness, my_data[my_data$alloy==2,]$hardness, paired = FALSE)
pooledSD <- (((45-1)*(14688.12)+(45-1)*(25886.43))/(45+45-2))^0.5
pooledSD
power.t.test(n=45, delta=-68.58, sd=pooledSD, alternative="two.sided", sig.level=0.05)
power.t.test(power=0.8, delta=-68.58, sd=142.4334, alternative="two.sided", sig.level=0.05)

#Test
aov1 <- aov(my_data$hardness~my_data$method)
summary(aov1)

hist(my_data[my_data$method==1,]$hardness)
hist(my_data[my_data$method==2,]$hardness)
hist(my_data[my_data$method==3,]$hardness)

shapiro.test(my_data[my_data$method==1,]$hardness)$p.value
shapiro.test(my_data[my_data$method==2,]$hardness)$p.value
shapiro.test(my_data[my_data$method==3,]$hardness)$p.value
# 2 are normal distribution and 1 is Not normal
leveneTest(my_data$hardness~my_data$method)
#variances are not equal
#both fail, so going for NonParametric
kruskal.test(my_data$hardness~my_data$method)

hist(my_data[my_data$dentist==1,]$hardness)
hist(my_data[my_data$dentist==2,]$hardness)
hist(my_data[my_data$dentist==3,]$hardness)
hist(my_data[my_data$dentist==4,]$hardness)
hist(my_data[my_data$dentist==5,]$hardness)

shapiro.test(my_data[my_data$dentist==1,]$hardness)$p.value
shapiro.test(my_data[my_data$dentist==2,]$hardness)$p.value
shapiro.test(my_data[my_data$dentist==3,]$hardness)$p.value
shapiro.test(my_data[my_data$dentist==4,]$hardness)$p.value
shapiro.test(my_data[my_data$dentist==5,]$hardness)$p.value
# 4 are normal distribution and 1 is Not normal
leveneTest(my_data$hardness~my_data$dentist)
#variances are not equal
#both fail, so going for NonParametric
kd=kruskal.test(my_data$hardness~my_data$dentist)
kd   ## print the htest result directly; summary() is not informative for htest objects

//////////////Mean Imputation (dataset assumed from a preprocessing exercise)//////////////
dataset$Age <- ifelse(is.na(dataset$Age),
                      ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)),
                      dataset$Age)

//////////////Missing Value/////////////////////
## ===============================================================================================================
## EXPLORATORY DATA ANALYTICS
## ===============================================================================================================

## REFERENCES:
## An Introduction to Data Cleaning with R - Edwin de Jonge and Mark Van Der Loo
## https://cran.r-project.org/doc/contrib/de_Jonge+van_der_Loo-Introduction_to_data_cleaning_with_R.pdf

data = airquality

## Explore Structure and Summary of Input data


str(data)
summary(data)

## ===============================================================================================================
## Univariate Analysis
## ===============================================================================================================
par(mfrow = c(2,1))

hist(data$Ozone, main = "Ozone Distribution", xlab = "Ozone")


boxplot(data$Ozone, horizontal = TRUE)
boxplot.stats(data$Ozone)

hist(data$Solar.R, main = "Solar.R Distribution", xlab = "Solar.R")


boxplot(data$Solar.R, horizontal = TRUE)
boxplot.stats(data$Solar.R)

hist(data$Wind, main = "Wind Distribution", xlab = "Wind")


boxplot(data$Wind, horizontal = TRUE)
boxplot.stats(data$Wind)

hist(data$Temp, main = "Temp Distribution", xlab = "Temp")


boxplot(data$Temp, horizontal = TRUE)
boxplot.stats(data$Temp)

## ===============================================================================================================
## Bivariate Analysis
## ===============================================================================================================

plot(data)

## ===============================================================================================================
## EXPLORATORY DATA ANALYTICS - MISSING VALUES TREATMENT
## ===============================================================================================================

## Options Available:
## 1. Remove records having missing values
## 2. Impute values

## For now, we will omit Day and Month which are categorical variables
data1 = data[-c(5,6)]
summary(data1)

## Randomly insert 10 missing values in Wind and Temp columns


n = nrow(data1)
n
set.seed(100)
for(i in 3:ncol(data1)) {
data1[sample(1:n, 10, replace = FALSE), i] = NA
}

summary(data1) ## Note 10 Missing values introduced in Wind and Temp

## Let us examine the rows with missing values - Incomplete Rows


data1[!complete.cases(data1),]

attach(data1)

## GUIDELINES:
## A safe maximum threshold for missing values in a particular column is 5%.
## If missing data for a column > 5%, we need to consider leaving out that variable.

## Build a function to calculate the percentage of missing values in Columns and Rows

pMiss = function(x){
sum(is.na(x))/length(x)*100
}

## Find Percentage of missing values in each column


col_miss = apply(data1,2,pMiss) ## 2 is for Columns
col_miss

## OBSERVATIONS:
## Ozone has nearly 25% missing values

## Find Percentage of missing values in each Row


row_miss = apply(data1,1,pMiss) ## 1 is for Rows
row_miss

## OBSERVATIONS:
## Row 5 has 50% missing variables - will not be of much value
data1[5,]

## Identify rows with high missing values


high_miss_rows = data1[row_miss > 25,]
high_miss_rows ## 10 Rows have more than 25% missing values

## Keep only the rows with less than 30% missing values
low_miss_rows = data1[row_miss < 30,]
low_miss_rows

## Using mice package


library(mice)
md.pattern(data1)

## IMPUTING MISSING VALUES USING mice PACKAGE


## If any variable contains missing values, the mice package regresses
## it over the other variables and predicts the missing values.
## Some of the available models in the mice package are:
##   * PMM (Predictive Mean Matching) - suitable for numeric variables
##   * logreg (Logistic Regression) - suitable for categorical variables with 2 levels
##   * polyreg (Bayesian polytomous regression) - suitable for categorical variables with two or more levels
##   * Proportional odds model - suitable for ordered categorical variables with two or more levels

## NOTE: FOR THIS EXERCISE, WE ARE USING THE data1 DATASET WITH 153 ROWS - NOT low_miss_rows!!!!
data_imputes = mice(data1, m = 5, maxit = 7, seed = 500)
## m: Number of times the model should run, maxit: Max number of iterations

summary(data_imputes)

## Methods mice used for imputing


data_imputes$method

## Since only numeric variables had missing values, mice used the pmm method

## What are the values determined for each variable?


data_imputes$imp

## Now let us first examine the values mice determined for Ozone
data_imputes$imp$Ozone

## Before inserting the values, let us look at rows 5, 10 and 25 - they all have missing values
data1[c(5,10,25),]

## Which of the 5 datasets created should we use?


stripplot(data_imputes, pch = 20, cex = 1.2)

## OBSERVATIONS:
## For Temp, Iterations 3 and 4 place most imputed values in the middle,
## which does not fit well with observed values - we can therefore ignore the 3rd imputed dataset

library(funModeling) ## Ref: https://blog.datascienceheroes.com/exploratory-data-analysis-data-preparation-with-funmodeling/

plot_num(data) ## Overall, Iteration 5 fits the original distribution the closest
## Impute Data using 'complete' function from mice package
imputed_data = complete(data_imputes, 5)

## Let us look at the same rows 5, 10 and 25


imputed_data[c(5,10,25),]

data[c(9,25,31,40,48,55,60), -(5:6)] ## Original Data without Month and Day columns

data1[c(9,25,31,40,48,55,60),] ## Data with Randomly inserted missing values

imputed_data[c(9,25,31,40,48,55,60),] ## Data with Imputation for missing values

summary(data)

summary(imputed_data)

## Inspecting distribution of original and imputed data


#xyplot(data_imputes, Ozone ~ Wind + Temp + Solar.R,
# pch = 18, cex =1)

densityplot(data_imputes)

## OBSERVATIONS:
## Red lines - Density of imputed data for each imputed dataset
## Blue line - Density of observed data
## We expect the Red and Blue distributions to be similar
## - Ozone and Wind has similar patterns for Red and Blue lines
## - Temp has similar patterns for Red and Blue lines - however the Observed data (Blue)
##   has more variation than some of the Imputed datasets
## - For Solar.R, imputed values for 4 datasets are close to Observed - can ignore the other imputed dataset

## ===============================================================================================================
## MISSING VALUE TREATMENT USING KNN METHOD FROM VIM PACKAGE
## ===============================================================================================================

summary(data[,3:4]) ## Dataframe with no missing values for Wind and Temp

summary(data1[,3:4]) ## Dataframe with missing values introduced for Wind and Temp
data1[!complete.cases(data1[,3:4]),]

library(VIM)

## Impute missing values using KNN method


data2 = kNN(data1)

summary(data2)

data[c(9,25,31,40,48,55,60), -(5:6)] ## Original Data without Month and Day columns

data1[c(9,25,31,40,48,55,60),] ## Data with Randomly inserted missing values

data2[c(9,25,31,40,48,55,60),1:4] ## Data with Imputation for missing values

plot_num(data[,1:4])

plot_num(data2)

## =====================================
## Working with Messy Data
## =====================================

## Let us create a dataset with outliers


age = c(21,2,18,221,34)
group = c("adult","child","adult","elderly","child")
height = c(6.0, 3,5.7,5, -7)
status = c("single", "married", "married","widowed", "married")
yearsmarried = c(-1,0,20,2,3)

## Build a dataframe using the vectors created above


people = data.frame(age,group,height,status,yearsmarried)
people

## OBSERVATIONS:
## yearsmarried cannot be negative
## A 2 year old child cannot be married
## An 18 year old adult cannot be married for 20 years
## 221 year old married for 2 years???!!!
## 34 year old Child who is -7 ft tall??

library(editrules)

## Set Age Rule


E_age = editset(c("age >= 0", "age <= 120"))

## Which records violate the Age Rule?


violatedEdits(E_age, people) ## Record 4 violates the second age rule <= 120

## All Rules can be maintained on an external Text File


E_file = editfile("EDA_Rules_Edit.txt")

rule_violations = violatedEdits(E_file, people)

rule_violations

plot(rule_violations)

## Graph showing interconnection between Variables and Restrictions


plot(E_file)

## OBSERVATIONS:
## - Two cases of Categorical violations involving Group and Status
##   - If status == 'married', group should be 'adult' or 'elderly'
##   - Rule violated in records 2 and 5
## - Two cases of Mixed Rules violations involving Status, YearsMarried and Age
##   - If status == 'married', age - yrsMarried >= 17
##   - Rule violated in records 2 and 3

## ===============================================================================================================
## WORKING WITH DIFFERENT UNITS
## ===============================================================================================================

name = c("A","B","C","D","E")
height = c(170.00,1.74, 70.00, 168.00, 5.91)
unit = c("cm","m","inch","cm","ft")

physical = data.frame(name,height,unit)
physical

library(deducorrect)

## Convert all values into Meters ("m")


R = correctionRules("Length_Convertions.txt")
R

## Apply correction rules to data


cor = correctWithRules(R, physical)
cor

cor$corrected
## *****************************************
## Working with Dates - Also covered in Intro to R
## *****************************************

## In R, Dates and Times are captured using POSIXct (Continuous Time - number of seconds) and POSIXlt (List Time)
## Base Date: 1 January 1970

Sys.time()

class(Sys.time())

time.list = as.POSIXlt(Sys.time())
unlist(time.list)

y <- strptime("01/02/2018",format="%d/%m/%Y")
y

weekdays(y) ## Find day of the week

y$wday ## Thursday is fourth day of the week

## R is clever with dates!!


start_end_dates = c("2016 2 Mon", "2017 6 Fri", "2018 10 Tue") ## Mon of Week 2, Fri of Week 6 and Tue of Week 10
strptime(start_end_dates, format = "%Y %W %a")

## Difference between two dates


difftime("2014-02-06", "2016-08-15")
as.numeric(difftime("2014-02-06", "2016-08-15"))
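## difftime can also report the gap in other units, e.g. weeks:
difftime("2014-02-06", "2016-08-15", units = "weeks")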

## Generating a sequence of dates from 2015-11-04 to 2015-11-15, incrementing by 1 day


dates.seq = seq(as.POSIXlt("2015-11-04"), as.POSIXlt("2015-11-15"), "1 day")
dates.seq
class(dates.seq)

dates.seq1 = seq(as.POSIXlt("2015-11-04"), by = "day", length = 11)


dates.seq1
class(dates.seq1)

## Working with Dates using Lubridate package

library(lubridate)

dates = c("15/12/2013", "15 December 13", "It happened on 15 02 '13")

dmy(dates) ## All dates above converted to common format!!

## How does R know whether it is 1913 or 2013??


## Years 00 to 68 will be 20xx
## Years 69 to 99 will be 19xx
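
## e.g. a two-digit year of 68 pivots to 20xx, while 69 pivots to 19xx:
dmy("01/01/68") ## 2068-01-01
dmy("01/01/69") ## 1969-01-01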

## Other limitations
dmy("15 Feb 2018")

dmy("15 Febr 2018") ## Error, since the POSIX standard expects Feb and not Febr

## ===============================================================================================================
## CHARACTER MANIPULATION USING stringr PACKAGE
## ===============================================================================================================

## CHARACTER MANIPULATION INCLUDES:


## - Remove pre-pending or trailing white spaces
## - Pad strings to certain width
## - Transform to upper/lower case
## - Search for strings containing certain patterns (substrings)
## - Approximate matching procedures based on string distances

library(stringr)

## Remove white spaces before and after text


str_trim(" hello world ")

## Remove white spaces left of text


str_trim(" hello world ", side = "left")

## Remove white spaces right of text


str_trim(" hello world ", side = "right")

## Add spaces before text


str_pad("hello world", width = 20, side = "left", pad = " ") # width is the total length including padding

## Add zeros before numbers


str_pad(112, width = 6, side = "left", pad = 0) # Padding numbers for fields like IDs

## Convert string to ALL CAPS


toupper("hello world")

## Convert string to all lower


tolower('HELLO WORLD')

## ===============================================================================================================
## Approximate String Matching
## ===============================================================================================================

gender = c("M", "male","F", "Female", "fem.")

## Find all values with "m" in gender


grepl("m", gender) # Gives logical output

grep("m", gender) # Gives row or position number

## Ignore case "M" and "m" should be treated the same


grepl("m", gender, ignore.case = TRUE) # Gives logical output

grep("m", gender, ignore.case = TRUE) # Gives row or position number

## Look for any value that starts with "M" or "m"


grepl("^m", gender, ignore.case = TRUE) # Notice ^ before the search parameter

## Working with Special Characters


gender = c("M", "male","F", "Female", "fem.","Male**","F+","male/")

grepl("+", gender, fixed = TRUE) # Search for '+'
grepl(".", gender, fixed = TRUE) # Search for '.'
grepl("*", gender, fixed = TRUE) # Search for '*'
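
## Putting the patterns to work - a sketch that standardises the messy labels to M/F
## (it assumes every male label starts with "m", as in the vector above):
ifelse(grepl("^m", gender, ignore.case = TRUE), "M", "F")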

////////////////PCA///////////////////////////
library(nFactors)
cs = read.csv("Carseats.csv")   ## assuming the Carseats data used in the CarSeats section above
attach(cs)
csi=cs[,c("Sales","CompPrice","Income","Advertising","Population","Price","Age","Education")]
csi
ev = eigen(cor(csi)) # get eigenvalues
ev
EigenValue=ev$values
EigenValue
Factor=c(1,2,3,4,5,6,7,8)
Scree=data.frame(Factor,EigenValue)
plot(Scree,main="Scree Plot", col="Blue",ylim=c(0,4))
lines(Scree,col="Red")
library(psych)
Unrotate=principal(csi, nfactors=3, rotate="none")
print(Unrotate,digits=3)
UnrotatedProfile=plot(Unrotate,row.names(Unrotate$loadings))
Rotate=principal(csi,nfactors=3,rotate="varimax")
print(Rotate,digits=3)
RotatedProfile=plot(Rotate,row.names(Rotate$loadings),cex=1.0)
///////////////Practice////////////
library(plot3D)
data=read.csv("D:/sahubackup/GL/iris.csv")
head(data)
x <- sep.l <- iris$sepal.length
x
y <- pet.l <- iris$petal.length
z <- sep.w <- iris$sepal.width
scatter3D(x, y, z, clab = c("Sepal", "Width (cm)"))
scatter3D(x, y, z, bty = "f", colkey = FALSE, main ="bty= 'f'")
scatter3D(x, y, z, bty = "g", colkey = FALSE, main ="bty= 'g'")
# User defined
scatter3D(x, y, z, pch = 18, bty = "u", colkey = FALSE,
main ="bty= 'u'", col.panel ="steelblue", expand =0.4,
col.grid = "darkblue")
text3D(x, y, z, labels, colvar = NULL, add = FALSE)
points3D(x, y, z, ...)
lines3D(x, y, z, ...)
scatter2D(x, y, colvar = NULL, col = NULL, add = FALSE)
text2D(x, y, labels, colvar = NULL, col = NULL, add = FALSE)
library(ggplot2)
qplot(iris$sepal.length,iris$sepal.width, col="red",
fill="iris$sepal.width")
ggplot(iris, aes(x=sepal.length))+
geom_histogram(aes(y=iris$sepal.width), binwidth=5,
col="black",fill="red")
library(dplyr)
tbl_df(iris)
glimpse(iris)
View(iris)
iris %>%
group_by(variety) %>%
summarise(avg = mean(sepal.width)) %>%
arrange
library(tidyr)
gather(iris, "new", "n", 2:4)
slice(iris, 10:15)
summarise_each(iris, funs(mean))
count(iris, variety, wt = sepal.length)
summarise(iris, avg = mean(sepal.length))
group_by(iris, variety)
library(stringr)
str_detect(iris$variety, "z")
library(MASS)
data=Cars93
data
attach(data)
names(data)
ggplot(data, aes(x=Price))+
geom_bar(binwidth=5, col="red", fill="blue")
ggplot(data, aes(x=RPM))+
geom_histogram(aes(y=..density..), col="red", fill="Black")+
geom_density(alpha=.2, fill = "pink")

ggplot(data, aes(x=Weight))+
geom_histogram(aes(y=..density..),binwidth = 2, colour = "black",
fill = "white")+
geom_density(alpha=.2, fill = "pink")

ggplot(data, aes(x = Type, y=Price,fill = Type)) +


geom_text(stat = "count")

ggplot(data, aes(x = Type, y=Price,fill = Type)) +


geom_boxplot() +
guides(fill = FALSE)

ggplot(data, aes(x = Horsepower, y = MPG.city, colour = Cylinders)) +


geom_point()

ggplot(data, aes(x = Horsepower, y = MPG.city)) +


geom_point() +
facet_wrap( ~ Cylinders, ncol = 3)
View(Cars93)

df <- data_frame(x.to = c( 2, 3, 3, 2,-2,-3,-3,-2),


y.to = c( 3, 2,-2,-3,-3,-2, 2, 3),
x = 0,
y = 0,
x_gt_y = abs(x.to) > abs(y.to),
xy_sign = sign(x.to*y.to) == 1,
x_gt_y_equal_xy_sign = x_gt_y == xy_sign)
df

ggplot(df) +
geom_segment(aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y, linetype = !xy_sign),
arrow = arrow(length = unit(0.25,"cm"))) +
coord_equal()

ggplot() +
geom_curve(data = df %>% filter(x_gt_y_equal_xy_sign),
aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y_equal_xy_sign),
curvature = 0.75, angle = -45,
arrow = arrow(length = unit(0.25,"cm"))) +
geom_curve(data = df %>% filter(!x_gt_y_equal_xy_sign),
aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y_equal_xy_sign),
curvature =-0.75, angle = 45,
arrow = arrow(length = unit(0.25,"cm"))) +
coord_equal() +
theme(legend.position = "bottom") +
xlim(-4, 4) + ylim(-4,4)
ggplot(df) +
geom_curve(aes(x = x, y = y, xend = x.to, yend = y.to, color =
x_gt_y_equal_xy_sign),
curvature = 0.75, angle = -45,
arrow = arrow(length = unit(0.25,"cm"))) +
coord_equal() +
theme(legend.position = "bottom") +
xlim(-4, 4) + ylim(-4,4)

df2 <- data.frame(supp=rep(c("VC", "OJ"), each=3),


dose=rep(c("D0.5", "D1", "D2"),2),
len=c(6.8, 15, 33, 4.2, 10, 29.5))

head(df2)

ggplot(data=df2, aes(x=dose, y=len, group=supp)) +


geom_line(linetype="dashed", color="blue", size=1.2)+
geom_point()

data_summary <- function(data, varname, groupnames){


require(plyr)
summary_func <- function(x, col){
c(mean = mean(x[[col]], na.rm=TRUE),
sd = sd(x[[col]], na.rm=TRUE))
}
data_sum<-ddply(data, groupnames, .fun=summary_func,
varname)
data_sum <- rename(data_sum, c("mean" = varname))
return(data_sum)
}

df3 <- data_summary(ToothGrowth, varname="len",


groupnames=c("supp", "dose"))
head(df3)

ggplot(df3, aes(x=dose, y=len, group=supp, color=supp)) +


geom_errorbar(aes(ymin=len-sd, ymax=len+sd), width=.1) +
geom_line() + geom_point()+
scale_color_brewer(palette="Paired")+theme_minimal()

# Use position_dodge to move overlapped errorbars horizontally
ggplot(df3, aes(x=dose, y=len, group=supp, color=supp)) +
geom_errorbar(aes(ymin=len-sd, ymax=len+sd), width=.1,
position=position_dodge(0.05)) +
geom_line() + geom_point()+
scale_color_brewer(palette="Paired")+theme_minimal()
View(ToothGrowth)
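
## DF is never defined in these notes; a minimal sketch of the assumed wide
## table (Group, Year, then the four quarterly revenue columns 3:6) so the
## gather() calls below can run. The values here are made up.
library(dplyr)
library(tidyr)
DF <- data.frame(Group = rep(1:2, each = 2),
                 Year  = rep(c(2006, 2007), 2),
                 Qtr.1 = c(15, 12, 22, 10),
                 Qtr.2 = c(16, 13, 22, 14),
                 Qtr.3 = c(19, 27, 24, 20),
                 Qtr.4 = c(17, 23, 20, 16))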

## gather() reshapes the wide quarterly table to long form
long_DF <- DF %>% gather(Quarter, Revenue, Qtr.1:Qtr.4)
head(long_DF, 24)

## the four calls below are equivalent ways of selecting the Qtr columns
DF %>% gather(Quarter, Revenue, Qtr.1:Qtr.4)
DF %>% gather(Quarter, Revenue, -Group, -Year)
DF %>% gather(Quarter, Revenue, 3:6)
DF %>% gather(Quarter, Revenue, Qtr.1, Qtr.2, Qtr.3, Qtr.4)

square.it=function(x)
{
  square=x*x
  return(square)   ## was return(Square); R names are case-sensitive
}
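## quick check of the fixed function:
square.it(4)   ## 16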
hist(c(3, 5, 10, 10, 11, 12, 12, 14, 14, 14, 19))
pnorm(100,0.03)
#Kaggle DS LR
train=read.csv("D:/sahubackup/GL/LR/Kaggle/train.csv")
train
test=read.csv("D:/sahubackup/GL/LR/Kaggle/test.csv")
test
library(ggplot2)
numberofNAs=sum(is.na(train))   ## simpler equivalent of length(which(is.na(train)==T))
if(numberofNAs>0)
{
cat('Number of missing values found: ', numberofNAs)
cat('\nRemoving missing values...')
train = train[complete.cases(train), ]
}
#par(mfrow=c(2,1), size=)
boxplot(train$x)
boxplot(test$x)
boxplot(train$x, main='X', sub=paste('Outliers: ',
boxplot.stats(train$x)$out))
boxplot(test$x, main='X', sub=paste('Outliers: ',
boxplot.stats(test$x)$out))
av=read.csv("D:/sahubackup/GL/LR/av.csv")
av
levels(av$group)
acgrp=ordered(av$group, levels=c("ctrl","trt1","trt2"))
acgrp
library(dplyr)
grp=group_by(av,group) %>%
  summarise(count=n(),
            mean=mean(weight, na.rm = TRUE),   ## the sd= argument was accidentally nested inside mean()
            sd=sd(weight, na.rm = TRUE))
grp=av$group
grp
factor.grp=factor(grp)
factor.grp
attach(av)
grp = c("Mon","Fri","Mon","Wed","Wed","Sat")
factor.wday = factor(wday)
factor.wday

library(ggplot2)
ggplot(av, aes(x=group, y=weight, fill=group))+
  geom_boxplot()+
  scale_x_discrete(limits = c("ctrl", "trt1", "trt2"))+   ## geom_boxplot() has no order argument; order the x axis instead
  theme_classic() +
  theme(legend.position = "none")
boxplot(av)
boxplot(weight~group)
av
acgrp   ## was "avgrp", which is never defined
names(av)
ggplot(av, aes(x = group, y = weight)) + ## simple box plot of weight by treatment group
  geom_boxplot()

res.aov <- aov(weight ~ group, data = av)
res.aov
summary(res.aov)

TukeyHSD(res.aov)
library(multcomp)
summary(glht(res.aov, linfct = mcp(group = "Tukey")))
pairwise.t.test(av$weight, av$group,
p.adjust.method = "BH")
library(car)
leveneTest(weight ~ group, data = av)

#two way anova
tanv=read.csv("D:/sahubackup/GL/ml-latest-small/tav.csv")
tanv
attach(tanv)
unique(dose)
table(supp,dose)

ggplot(tanv, aes(x = factor(dose), y = len, color = supp)) + ## box plot of len by dose and supplement (dose coerced to a factor)
  geom_boxplot()

## one box per supp:dose combination; geom_boxplot() takes no formula
ggplot(tanv, aes(x = interaction(supp, dose), y = len, color = supp)) +
  geom_boxplot()


res.aov2 <- aov(len ~ supp + dose, data = tanv)
summary(res.aov2)
res.aov3 <- aov(len ~ supp * dose, data = tanv)   ## supp*dose expands to the line below
res.aov3 <- aov(len ~ supp + dose + supp:dose, data = tanv)
summary(res.aov3)
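
## A quick visual check of the supp:dose interaction (base stats::interaction.plot);
## dose is coerced to a factor in case it was read as numeric:
with(tanv, interaction.plot(factor(dose), supp, len, col = c("red", "blue"), lty = 1))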
library(leaps)   ## regsubsets() comes from the leaps package
models <- regsubsets(Fertility~., really.big=TRUE, method="exhaustive",
                     data = swissdata, nvmax = 5)   ## swissdata: presumably a copy of the built-in swiss data
res.sum <- summary(models)
data.frame(
Adj.R2 = which.max(res.sum$adjr2),
CP = which.min(res.sum$cp),
BIC = which.min(res.sum$bic)
)

get_model_formula <- function(id, object, outcome){
  models <- summary(object)$which[id, -1]
  predictors <- names(which(models == TRUE))
  predictors <- paste(predictors, collapse = "+")
  as.formula(paste0(outcome, "~", predictors))
}
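
## get_cv_error() is not defined anywhere in these notes; a minimal sketch
## (assuming the caret package) that returns the 5-fold cross-validated RMSE
## of a linear model for a given formula:
library(caret)
get_cv_error <- function(model.formula, data){
  set.seed(1)
  cv <- train(model.formula, data = data, method = "lm",
              trControl = trainControl(method = "cv", number = 5))
  cv$results$RMSE
}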
model.ids <- 1:5
library(purrr)   ## map() comes from purrr
cv.errors <- map(model.ids, get_model_formula, models, "Fertility") %>%   ## %>% must end a line, not start one
  map(get_cv_error, data = swissdata) %>%
  unlist()
cv.errors
Price=factor(PL, levels=c(1,2,3), labels=c("high","med","low"))   ## PL, Advertisement, Sales assumed attached earlier; labels must be quoted
interaction.plot(Price, Advertisement, Sales, col=c("red","yellow","blue"))   ## was misspelt "interraction.plot"
library(MASS)
library(reshape2)
data=data("Boston", package="MASS")
data
View(Boston)
pairs(Boston)
bosmelt <- melt(Boston, id="crim")
ggplot(bosmelt, aes(x=value, y=crim))+
facet_wrap(~variable, scales="free")+
geom_point()

confint(SLM, "PerOcc")   ## SLM: assumed to be an lm() fit with a PerOcc term from an earlier session

############ Practice1 ############
install.packages("ggplot2")
library(lattice)
## t.test() needs a vector of observations, not a single number; e.g. with a made-up sample:
t.test(rnorm(30, mean = 0.3, sd = 0.05), mu = 0.29)

seandsd <- function(x){
  seresult <- sd(x)/sqrt(length(x))   ## standard error of the mean; semean() was an undefined function
  sdresult <- sd(x)
  # Store results in a vector with names
  vec <- c(seresult, sdresult)
  names(vec) <- c("SE","SD")
  return(vec)
}
x <- rnorm(100, mean=20, sd=4)
x
seandsd(x)

mylist <- list(a=1:10, txt=c("hello","world"),
               dfr=data.frame(x=c(2,3,4), y=c(5,6,7)))
mylist

install.packages("olsrr")
pnorm(0.8,100,10,1)

df = data.frame(group = c("Group 1","Group 1","Group 2","Group 2","Group 2"),
                subgroup = c("A","A","A","A","B"),
                value = c(2, 2.5, 1, 2, 1.5))
df
sum1=aggregate(value~group,FUN=sum,data=df)
sum1
library(dplyr)
df %>% group_by(group) %>% summarise(value=sum(value)) %>%
as.data.frame()
df %>% group_by(group) %>% summarize(value = mean(value)) %>%
as.data.frame()
df %>% group_by(group) %>% summarize(value = sum(value[value>2])) %>%
as.data.frame()
install.packages("devtools")
xmat <- cbind(rnorm(100, -3), rnorm(100, -1), rnorm(100, 1),
rnorm(100, 3))
xmat
plot(xmat[,1], type='l')
lines(xmat[,2], col="red")
lines(xmat[,3], col="green")
lines(xmat[,4], col="blue")
matplot(xmat, type='l')

nterms=as.integer(readline(prompt="how many numbers="))
n1=0
n2=1
count=2
if(is.na(nterms))
{
print("enter a positive num=")
}else{
if(nterms==1){print(n1)
}else{
print(n1)
print(n2)
while(count<nterms)
{
nth=n1+n2
print(nth)
n1=n2
n2=nth
count=count+1
}
}
}

recurse_fibonacci <- function(n) {
if (n <= 1) {
return(n)
} else {
return(recurse_fibonacci(n-1) + recurse_fibonacci(n-2))
}
}
nterms = as.integer(readline(prompt="How many terms? "))

if(is.na(nterms)) {
print("nter a positive integer")
} else {
print("Fibonacci sequence:")
for(i in 0:(nterms-1)) {
print(recurse_fibonacci(i))
}
}

Fibonacci <- numeric(10)
Fibonacci[1] <- Fibonacci[2] <- 1
for (i in 3:10) Fibonacci[i] <- Fibonacci[i - 2] + Fibonacci[i - 1]
print("First 10 Fibonacci numbers:")
print(Fibonacci)

fibb <- function (n) {
if (n < 3) {
return(c(0,1)[n])
} else {
return(fibb(n - 2) + fibb(n -1))
}
}
fibb(5)
ap=read.csv("D:/sahubackup/GL/AirPassengers.csv")
start=head(ap,1)
start
end=tail(ap,1)
end
library(tseries)
library(forecast)

## dd was never defined; assuming the passenger counts sit in column 2 of ap,
## build the monthly time series first:
dd <- ts(ap[,2], start = c(1949, 1), frequency = 12)
boxplot(dd~cycle(dd), xlab="Date", ylab = "Passenger Numbers (1000's)",
        main ="Monthly Air Passengers Boxplot from 1949 to 1961")
library(stringr)
library(rvest)
library(xml2)
url="https://www.dezyre.com/data-science-in-r-programming-tutorial/r-
tutorial-importing-data-from-web"
t_link=read_html(url)
t_link
transcript=t_link %>%html_nodes("#main-content") %>% html_text()
markers=str_locate_all(transcript,pattern="R|JSON")
transcript
head(markers)
library(XML)   ## readHTMLTable() comes from XML, so load it before the call
production_data = readHTMLTable(url, which=2)   ## note: readHTMLTable() cannot fetch https pages directly

u = "http://en.wikipedia.org/wiki/World_population"
tables = readHTMLTable(u)
names(tables)
tables[[2]]
tmp = tables[[2]]

for(i in 1:5)
{
for(j in 1:2)
{
print(i*j);
}
}
movies=read.csv("D:/sahubackup/GL/ml-latest-small/movies.csv")
movies
df_genres=data.frame(movies[,3])
df_genres
i=length(movies$title)   ## nrow() returns NULL for a vector; length() counts its elements
i
str(movies)
nr=nrow(movies)
nr
mat_mov_gen=data.matrix(movies,rownames.force = NA)
mat_mov_gen
genres <- as.data.frame(movies$genres, stringsAsFactors=FALSE)
genres
library(data.table)
genres2 <- as.data.frame(tstrsplit(genres[,1], '[|]',
type.convert=TRUE),
stringsAsFactors=FALSE)
genres2
colnames(genres2) <- c(1:10)
colnames(genres2)
genre_list <- c("Action", "Adventure", "Animation", "Children",
"Comedy", "Crime","Documentary", "Drama", "Fantasy",
"Film-Noir", "Horror", "Musical", "Mystery","Romance",
"Sci-Fi", "Thriller", "War", "Western")
genre_matrix <- matrix(0, nrow(movies)+1, 18)   ## one spare row for the header written below (was hard-coded as 9743)
genre_matrix
genre_matrix[1,] <- genre_list
genre_matrix[1,]
colnames(genre_matrix) <- genre_list
colnames(genre_matrix)
for (i in 1:nrow(genres2)) {
for (c in 1:ncol(genres2)) {
genmat_col = which(genre_matrix[1,] == genres2[i,c])
genre_matrix[i+1,genmat_col] <- 1
}
}
genre_matrix2 <- as.data.frame(genre_matrix[-1,],
stringsAsFactors=FALSE)
genre_matrix2
for (c in 1:ncol(genre_matrix2)) {
genre_matrix2[,c] <- as.integer(genre_matrix2[,c])
}
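
## A compact alternative to the header-row trick above: build the 0/1 matrix
## directly from genres2, one column per genre in genre_list (same result):
genre_matrix2 <- as.data.frame(
  sapply(genre_list, function(g) as.integer(rowSums(genres2 == g, na.rm = TRUE) > 0)))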
years <- as.data.frame(movies$title, stringsAsFactors=FALSE)
library(data.table)
substrRight <- function(x, n){
substr(x, nchar(x)-n+1, nchar(x))}
yt=movies$title
yt
class(yt)
ytc=as.character(yt)
ytc
class(ytc)
years <- as.data.frame(substr(substrRight(substrRight(ytc, 6),5),1,4))
years
search_matrix <- cbind(movies[,1], substr(movies[,2],1,nchar(ytc)-6),
years, genre_matrix2)
search_matrix
colnames(search_matrix) <- c("movieId", "title", "year", genre_list)
colnames(search_matrix)
write.csv(search_matrix, "search.csv")
search_matrix <- read.csv("search.csv", stringsAsFactors=FALSE)
search_matrix
subset(search_matrix, Action == 1 & year == 1995)$title
ratings=read.csv("D:/sahubackup/GL/ml-latest-small/ratings.csv")
links=read.csv("D:/sahubackup/GL/ml-latest-small/links.csv")
tags=read.csv("D:/sahubackup/GL/ml-latest-small/tags.csv")
binaryratings <- ratings
binaryratings
for (i in 1:nrow(binaryratings)){
if (binaryratings[i,3] > 3){
binaryratings[i,3] <- 1
}
else{
binaryratings[i,3] <- -1
}
}
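
## the same recoding without the row-by-row loop:
binaryratings$rating <- ifelse(ratings$rating > 3, 1, -1)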

library(reshape2)   ## dcast() used below
binaryratings2 <- dcast(binaryratings, movieId~userId, value.var = "rating",
                        na.rm=FALSE)
binaryratings2
for (i in 1:ncol(binaryratings2)){
binaryratings2[which(is.na(binaryratings2[,i]) == TRUE),i] <- 0
}
binaryratings2 = binaryratings2[,-1]
binaryratings2

#Remove rows that are not rated from movies dataset
movieIds <- length(unique(movies$movieId))
movieIds
ratingmovieIds <- length(unique(ratings$movieId))
ratingmovieIds #10325
movies2 <- movies[-which((movies$movieId %in% ratings$movieId) ==
FALSE),]
movies2
rownames(movies2) <- NULL
rownames(movies2)

#Remove rows that are not rated from genre_matrix2
genre_matrix3 <- genre_matrix2[-which((movies$movieId %in%
ratings$movieId) == FALSE),]
rownames(genre_matrix3) <- NULL
#Calculate dot product for User Profiles
result = matrix(0, 18, ncol(binaryratings2)) # 18 genres, one column per user/rater (668 in the original run)
for (c in 1:ncol(binaryratings2)){
for (i in 1:ncol(genre_matrix3)){
result[i,c] <- sum((genre_matrix3[,i]) * (binaryratings2[,c]))
#ratings per genre
}
}
for (c in 1:ncol(result)){
for (i in 1:nrow(result)){
if (result[i,c] < 0){
result[i,c] <- 0
}
else {
result[i,c] <- 1
}
}
}
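
## the same thresholding in one vectorized step:
result <- ifelse(result < 0, 0, 1)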
library(reshape2)
#Create ratings matrix. Rows = userId, Columns = movieId
ratingmat <- dcast(ratings, userId~movieId, value.var = "rating",
na.rm=FALSE)
ratingmat <- as.matrix(ratingmat[,-1]) #remove userIds
library(recommenderlab)
install.packages("registry")
library(registry)
ratingmat <- as(ratingmat, "realRatingMatrix")
# Determine how similar the first four users are with each other
# create similarity matrix
similarity_users <- similarity(ratingmat[1:4, ],
method = "cosine",
which = "users")
as.matrix(similarity_users)
image(as.matrix(similarity_users), main = "User similarity")
# compute similarity between
# the first four movies
similarity_items <- similarity(ratingmat[, 1:4], method =
"cosine", which = "items")
as.matrix(similarity_items)
image(as.matrix(similarity_items), main = "Item similarity")

# Exploring values of ratings:
vector_ratings <- as.vector(ratingmat@data)
unique(vector_ratings) # what are unique values of ratings

table_ratings <- table(vector_ratings) # what is the count of each rating value
table_ratings

library(ggplot2)
views_per_movie <- colCounts(ratingmat) # count views for each movie

table_views <- data.frame(movie = names(views_per_movie),
                          views = views_per_movie) # create dataframe of views
table_views <- table_views[order(table_views$views,
                                 decreasing = TRUE), ] # sort by number of views

ggplot(table_views[1:6, ], aes(x = movie, y = views)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_x_discrete(labels=subset(movies2, movies2$movieId ==
                                   table_views$movie)$title) +
  ggtitle("Number of views of the top movies")

#Visualizing the matrix:
image(ratingmat, main = "Heatmap of the rating matrix") # hard to read - too many dimensions
image(ratingmat[1:10, 1:15], main = "Heatmap of the first rows and
columns")
image(ratingmat[rowCounts(ratingmat) > quantile(rowCounts(ratingmat),
0.99),
colCounts(ratingmat) > quantile(colCounts(ratingmat),
0.99)],
main = "Heatmap of the top users and movies")

#Normalize the data
ratingmat_norm <- normalize(ratingmat)
image(ratingmat_norm[rowCounts(ratingmat_norm) >
quantile(rowCounts(ratingmat_norm), 0.99),
colCounts(ratingmat_norm) >
quantile(colCounts(ratingmat_norm), 0.99)],
main = "Heatmap of the top users and movies")

#Create UBCF Recommender Model. UBCF stands for User-Based Collaborative Filtering
recommender_model <- Recommender(ratingmat_norm,
                                 method = "UBCF",
                                 param=list(method="Cosine",nn=30))

model_details <- getModel(recommender_model)
model_details$data

recom <- predict(recommender_model,
                 ratingmat[1],
                 n=10) #Obtain top 10 recommendations for 1st user in dataset
recom

#recc_matrix <- sapply(recom@items,
#                      function(x){ colnames(ratingmat)[x] })
#dim(recc_matrix)

recom_list <- as(recom, "list") #convert recommenderlab object to a readable list

#Obtain recommendations
recom_result <- matrix(0,10)
for (i in 1:10){
recom_result[i] <- as.character(subset(movies,
movies$movieId ==
as.integer(recom_list[[1]][i]))$title)
}
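
## a sketch of the same lookup without the loop, via match():
recom_result <- as.character(movies$title[match(as.integer(recom_list[[1]]),
                                                movies$movieId)])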

# Evaluation:
evaluation_scheme <- evaluationScheme(ratingmat,
                                      method="cross-validation",
                                      k=5, given=3,
                                      goodRating=5) #k=5 means 5-fold cross validation; given=3 means a Given-3 protocol
evaluation_results <- evaluate(evaluation_scheme,
method="UBCF",
n=c(1,3,5,10,15,20))
eval_results <- getConfusionMatrix(evaluation_results)[[1]]
eval_results
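
## recommenderlab can also plot the evaluation results directly, e.g. as an
## annotated ROC curve over the n values tried above:
plot(evaluation_results, annotate = TRUE)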

##star triangle
for(i in 1:5)
{
  cat(strrep("*", i), "\n")   ## the original nested loop just printed "*" ten times, not a triangle
}

pascalTriangle <- function(h) {
for(i in 0:(h-1)) {
s <- ""
for(k in 0:(h-i)) s <- paste(s, " ", sep="")
for(j in 0:i) {
s <- paste(s, sprintf("%3d ", choose(i, j)), sep="")
}
print(s)
}
}
pascalTriangle(5)
ap=read.csv("D:/sahubackup/GL/AirPassengers.csv")
start=head(ap,1)
start
end=tail(ap,1)
end
frequency(ap)
findfrequency(ap)
class(ap)
tsap=ts(ap[,2],start=c(1949,1),end=c(1960,12),frequency=12)   ## monthly data, so frequency 12 (not 365); counts assumed in column 2
tsap
class(tsap)
findfrequency(tsap)
frequency(tsap)
plot(ap)
train <- ap[1:132,]   ## first 132 rows (months); ap does not have 132 columns
dim(ap)
library(caTools)
set.seed(123)
split=sample.split(ap[,2],SplitRatio=0.8)   ## sample.split() expects a vector; note a random split ignores time order
train=subset(ap,split==T)
test=subset(ap,split==F)
train
test
train_ts=ts(train, start=1949, frequency=12)
test_ts=ts(test, start=1960, frequency=12)
train_ts
test_ts
library(ggplot2) #Data Visualisation
library(ggfortify) #Data Visualisation
library(forecast)
decomposedres <- decompose(tsap)   ## decompose() needs a ts object, not the raw data frame
plot(decomposedres)
mean_baseline <- meanf(train_ts, h=12)
plot(mean_baseline, type="l")
lines(ap)
accuracy(mean_baseline, test)
sma <- ma(train_ts, order=12)
plot(sma, xlim=c(1949, 1960), ylim=c(0, 600), col="red")
lines(train)
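
## a seasonal-naive forecast (forecast::snaive) is often a fairer baseline
## than the overall mean for a strongly seasonal series like this one:
snaive_baseline <- snaive(train_ts, h=12)
plot(snaive_baseline)
accuracy(snaive_baseline, test)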
pnorm(20,0.1,1)
pnorm(0.997,0.996,0.0033)
pnorm(15,0.6,15)
pnorm(40,65.16,10)-pnorm(50,65.16,10)
qnorm(0.99, mean = 65.16, sd=10)
pnorm(0.998,0.9563,0.0189)-pnorm(0.997,0.9563,0.0189)
1-dpois(0,lambda=3)
dpois(2,lambda=3)+dpois(3,lambda=3)+dpois(4,lambda=3)
dpois(6,lambda=4)
dpois(1,0.15)+dpois(0,0.15)
dpois(10,lambda=10)
## NB: pnorm's 4th argument is lower.tail, so the trailing 100 was silently
## coerced to TRUE; if a sample of n=100 was intended, use sd = 0.02/sqrt(100)
pnorm(20.08,20.05,0.02)-pnorm(20.03,20.05,0.02)
pnorm(20.01,20.05,0.02)
pnorm(3.69,3.25,0.6)-pnorm(2.75,3.25,0.6)
dbinom(0,3,1/6)
dbinom(6,9,0.6)
dbinom(6,10,0.45)
pnorm(178000, 168000, 6324.55) - pnorm(158000, 168000, 6324.55)
ap=read.csv("D:/sahubackup/GL/Food Nutrition.csv")
library(ggplot2)
#par(margin(5,5,1,5),cex.lab=1.2, cex.axis=0.9)

par(mfrow=c(1,3))
plot(ap$Protein_.g., col="blue", pch=19)   ## plot() has no fill argument
barplot(ap$Carbohydrt_.g.,col="red")
pie(ap$Water_.g., main="Piechart", radius=1)

sub_data=ap[which(ap$Water_.g.>=30.0),]
sub_data
library(dplyr)
by_shr=group_by(ap, Shrt_Desc)   ## use the bare column name inside group_by()
by_shr
attach(ap)
sub_data1=subset(ap, Water_.g.>30.0)
sub_data1
sub_data1_ord=arrange(sub_data1,desc(Water_.g.))
sub_data1_ord
sub_data1_fil=filter(sub_data1, Protein_.g.>30,Lipid_Tot_.g.>26 )
sub_data1_fil
par(mar=c(5,5,2,5), cex.lab=1.2, cex.axis=0.9)
with(sub_data1, plot(Protein_.g., Water_.g., type='p',
                     lwd=2, col="blue",   ## ylim=c(0,20) clipped every point (Water > 30 here), so it is dropped
                     xlab="Protein_.g",
                     ylab="Water_.g."))
boxplot(sub_data1$Water_.g.)
boxplot(sub_data1$Protein_.g.)
scatter.smooth(sub_data1$Protein_.g.,sub_data1$Water_.g.)
add = function(a,b)
{
a+b
}
add(3,4)
f_c=function(f)
{
  (f-32)*5/9   ## Fahrenheit to Celsius; the original (9/5)*(f+32) is neither direction of the conversion
}
f_c(32)
for(i in 1:2)
{
for(j in 101:110)
{
print(i+j)
}
}
sq=function(a,b)
{
a*a+b*b
}
sq(3,4)


s=apply(sub_data1[,3:5],2,mean)
s
s1=lapply(sub_data1[,3:5],mean)
s1
s2=sapply(sub_data1[,3:5],mean)
s2

s3 = c("This", "is", "a", "test", "for", "sapply", "function")
sapply(s3, nchar)
tapply(sub_data1$Water_.g.,sub_data1$Shrt_Desc, mean)
install.packages("plot3D")
library(plot3D)
detach(ap)
data=read.csv("D:/sahubackup/GL/iris.csv")
head(data)
x <- sep.l <- data$sepal.length   ## use the freshly read "data", not iris
y <- pet.l <- data$petal.length
z <- sep.w <- data$sepal.width
## the "..." placeholders from the help-file signatures are dropped so these run;
## row names are used as example labels
scatter3D(x, y, z, colvar = z)
text3D(x, y, z, labels = rownames(data), add = FALSE)
points3D(x, y, z)
lines3D(x, y, z)
scatter2D(x, y)
text2D(x, y, labels = rownames(data))
pnorm(44,40,3,lower.tail = FALSE)
qnorm(0.9087,40,3)
qnorm(0.025,0,1)
qnorm(0.975)
qnorm(0.025)
qnorm(0.95,0,1)
qnorm(0.95,0,1,lower.tail = FALSE)
qnorm(0.005,0,1)
qnorm(0.01,0,1,lower.tail = FALSE)
qnorm(0.01,0,1)
qnorm(0.95,0,1,lower.tail = FALSE)
pnorm(-1.25,0,1)
pnorm(2.5,0,1,lower.tail = FALSE)
pnorm(3.16,lower.tail = FALSE)

## variant of the Fibonacci prompt with if instead of while: this computes
## only the third term, so it only "works" for nterms == 3
nterms=as.integer(readline(prompt="how many numbers="))
n1=0
n2=1
count=2
if(is.na(nterms))
{
print("enter a positive num=")
}else{
if(nterms==1){print(n1)
}else{
print(n1)
print(n2)
if(count==nterms)
{
nth=n1+n2
print(nth)
n1=n2
n2=nth
count=count+1
}
}
}
len <- 10
fibvals <- numeric(len)
fibvals[1] <- 1
fibvals[2] <- 1
for (i in 3:len) {
fibvals[i] <- fibvals[i-1]+fibvals[i-2]
}

nterms=as.integer(readline(prompt="how many numbers="))   ## readline() returns character; numeric(len) needs an integer
len=nterms
fibvals <- numeric(len)
fibvals[1] <- 1
fibvals[2] <- 1
for (i in 3:len) {
fibvals[i] <- fibvals[i-1]+fibvals[i-2]
}
print(fibvals)

############ SmartEDA ############
install.packages("ISLR")
library("ISLR")
install.packages("SmartEDA")
library("SmartEDA")
## Load sample dataset from ISLR package
Carseats= ISLR::Carseats
# Overview of the data - Type = 1
ExpData(data=Carseats,type=1)

# Structure of the data - Type = 2
ExpData(data=Carseats,type=2)
ExpNumStat(Carseats,by="A",gp=NULL,Qnt=seq(0,1,0.1),MesofShape=2,Outlier=TRUE,round=2,Nlim=10)
plot1 <- ExpNumViz(Carseats,target=NULL,nlim=10,Page=c(2,2),sample=8)
plot1[[1]]
ExpCTable(Carseats,Target=NULL,margin=1,clim=10,nlim=NULL,round=2,bin=NULL,per=T)
plot2 <- ExpCatViz(Carseats,target=NULL,col="slateblue4",clim=10,margin=2,Page = c(2,1),sample=4)
plot2[[1]]
summary(Carseats[,"Price"])
ExpNumStat(Carseats,by="A",gp="Price",Qnt=seq(0,1,0.1),MesofShape=1,Outlier=TRUE,round=2)
#Note: sample=8 means 8 randomly selected scatter plots
#Note: nlim=4 means only numeric variables with more than 4 unique values are included
plot3 <- ExpNumViz(Carseats,target="Price",nlim=4,scatter=FALSE,fname=NULL,col="green",Page=c(2,2),sample=8)
plot3[[1]]
#Note: sample=4 means 4 randomly selected scatter plots
plot31 <- ExpNumViz(Carseats,target="US",nlim=4,scatter=TRUE,fname=NULL,Page=c(2,1),sample=4)
plot31[[1]]
##bin=4: discretised into 4 categories based on quantiles
ExpCTable(Carseats,Target="Price",margin=1,clim=10,nlim=NULL,round=2,bin=4,per=F)
ExpNumStat(Carseats,by="GA",gp="Urban",Qnt=seq(0,1,0.1),MesofShape=2,Outlier=TRUE,round=2)
plot4 <- ExpNumViz(Carseats,target="Urban",type=1,nlim=NULL,fname=NULL,col=c("darkgreen","springgreen3","springgreen1"),Page=c(2,2),sample=8)
plot4[[1]]
ExpCTable(Carseats,Target="Urban",margin=1,clim=10,nlim=NULL,round=2,bin=NULL,per=F)
ExpCatStat(Carseats,Target="Urban",result="IV",clim=10,nlim=5,bins=10,Pclass="Yes",plot=FALSE,top=20,Round=2)
et4 <- ExpCatStat(Carseats,Target="Urban",result="Stat",clim=10,nlim=5,bins=10,Pclass="Yes",plot=FALSE,top=20,Round=2)
varimp <- ExpCatStat(Carseats,Target="Urban",result="Stat",clim=10,nlim=5,bins=10,Pclass="Yes",plot=TRUE,top=10,Round=2)
plot5 <- ExpCatViz(Carseats,target="Urban",fname=NULL,clim=5,col=c("slateblue4","slateblue1"),margin=2,Page = c(2,1),sample=2)
plot5[[1]]
options(width = 150)
CData = ISLR::Carseats
qqp <- ExpOutQQ(CData,nlim=10,fname=NULL,Page=c(2,2),sample=4)
qqp[[1]]
ExpParcoord(CData,Group=NULL,Stsize=NULL,Nvar=c("Price","Income","Advertising","Population","Age","Education"))
ExpParcoord(CData,Group="ShelveLoc",Stsize=c(10,15,20),Nvar=c("Price","Income"),Cvar=c("Urban","US"))
ExpParcoord(CData,Group="ShelveLoc",Nvar=c("Price","Income"),Cvar=c("Urban","US"),scale=NULL)
ExpParcoord(CData,Group="US",Nvar=c("Price","Income"),Cvar=c("ShelveLoc"),scale="std")
ExpParcoord(CData,Group="ShelveLoc",Stsize=c(10,15,20),Nvar=c("Price","Income","Advertising","Population","Age","Education"))
ExpParcoord(CData,Group="US",Stsize=c(15,50),Cvar=c("ShelveLoc","Urban"))
ExpCustomStat(Carseats,Cvar="Urban",Nvar=c("Age","Price"),stat=c("mean","count"),gpby=TRUE,dcast=F)
ExpCustomStat(Carseats,Cvar="Urban",Nvar=c("Age","Price"),stat=c("mean","count"),gpby=TRUE,dcast=T)
ExpCustomStat(Carseats,Cvar=c("Urban","ShelveLoc"),Nvar=c("Age","Price","Advertising","Sales"),stat=c("mean"),gpby=FALSE,dcast=T)

## gridExtra helpers for combining plots:
## - grid.arrange() and arrangeGrob() arrange multiple ggplots on one page
## - marrangeGrob() arranges multiple ggplots over multiple pages
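
## A minimal sketch of grid.arrange() (assuming gridExtra is installed),
## reusing the Carseats copy CData from above:
library(gridExtra)
library(ggplot2)
p1 <- ggplot(CData, aes(x = Price)) + geom_histogram(binwidth = 10)
p2 <- ggplot(CData, aes(x = Income, y = Price)) + geom_point()
grid.arrange(p1, p2, ncol = 2)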