Datamining 2
PRACTICAL FILE
Name: Swati Saini
Roll No.: 2102066
Course: B.Sc. (H) Computer Science
Code:-
# Ques 1: validate the `people` data frame against an editrules ruleset.
# NOTE(review): `people` is assumed to have been loaded earlier — confirm.
print(people)

library(editrules)

# Create a ruleset E that contains rules to check the following conditions:
#   1. Age is non-negative and at most 150.
#   2. Age must be greater than yearsmarried.
#   3. If age is less than 18 the agegroup should be child; if age is between
#      18 and 65 the agegroup should be adult; if age is 65 or more the
#      agegroup should be elderly.
E <- editset(expression(
  Age >= 0,
  Age <= 150,
  Age > yearsmarried,
  if (Age < 18) agegroup == 'child',
  # The adult rule required by the comment above was missing; added.
  if (Age >= 18 && Age < 65) agegroup == 'adult',
  if (Age >= 65) agegroup == 'elderly'
))
print(E)

# Flag which records violate which rules, then summarize and visualize.
ve <- violatedEdits(E, people)
summary(ve)
print(ve)
summary(ve, E)
plot(E)
plot(ve)
# NOTE(review): layout.circle comes from igraph — confirm it is attached.
plot(E, layout = layout.circle)
Output Screen:-
Ques 2:-
Rules.txt:-
Sepal.Length > 0
Sepal.Width > 0
Petal.Length > 0
Petal.Width > 0
Sepal.Length < 30
Code:-
# Ques 2: data-quality checks on `data`.
# NOTE(review): `data` is assumed to be a data frame loaded earlier
# (a dirty iris dataset, given the rules in Rules.txt) — confirm.
print(head(data, 10))
print(summary(data))
str(data)

# Identify fully-observed rows and report how many / what share are complete.
complete_cases <- complete.cases(data)  # was referenced but never created
as.numeric(complete_cases)
n_complete <- sum(complete_cases)
print(n_complete)
percent_complete <- 100 * n_complete / nrow(data)
print(percent_complete)                 # was computed but never displayed

# Treat infinite values as missing so they are handled like NA downstream.
is.na(data) <- sapply(data, is.infinite)
is.na(data)

library(editrules)
# iv) Read the rules from Rules.txt and determine how often each rule is
#     broken (violatedEdits). Also summarize and plot the result.
Rules <- editfile("Rules.txt")          # was referenced but never created
print(Rules)
rules_violations <- violatedEdits(Rules, data)  # was never created
print(rules_violations)
summary(rules_violations)
#par(mar=c(3,3,3,3))
plot(rules_violations)

# Outlier check on Sepal.Length via a boxplot and its statistics.
boxplot(data$Sepal.Length, horizontal = TRUE)
print(boxplot.stats(data$Sepal.Length))
Output Screen:-
Ques 3:-
Load the data from wine dataset. Check whether all attributes are standardized or not (mean is
0 and standard deviation is 1). If not, standardize the attributes. Do the same with Iris dataset.
Code:-
# Ques 3: check whether the wine and iris attributes are standardized
# (mean 0, sd 1); if not, standardize them with caret's preProcess.
library(caret)

# ---- WINE DATASET ----
df_wine <- read.csv("wine.csv", header = FALSE)
View(df_wine)
str(df_wine)
summary(df_wine)
apply(df_wine, 2, sd)  # sds differ from 1 => not standardized yet

wine_pre <- preProcess(df_wine[, ], method = c("center", "scale"))
wine_standard <- predict(wine_pre, df_wine[, ])
summary(wine_standard)       # means should now be ~0
apply(wine_standard, 2, sd)  # sds should now be 1

# ---- IRIS DATASET ----
data("iris")
View(iris)
summary(iris)
apply(iris[, 1:4], 2, sd)

# The preProcess object was missing before predict(); reconstructed here.
iris_pre <- preProcess(iris[, 1:4], method = c("center", "scale"))
iris_standard <- predict(iris_pre, iris[, 1:4])
summary(iris_standard)
apply(iris_standard, 2, sd)
Output Screen:-
Ques 4:-
Code:-
# Ques 4: association-rule mining on a receipts dataset with arules.
# NOTE(review): this fragment is incomplete — `receipt_df` is never created
# here; presumably it was read from a receipts/transactions file in a step
# that was lost. Verify against the original script.
View(receipt_df)
#Applying column names
id<- c(1:5)
# NOTE(review): `df` is not defined in this fragment; the column-naming step
# the comment above refers to appears to be missing.
print(df)
#after Preprocessing
head(receipt_df)
typeof(receipt_df)
library(arules)
library(arulesViz)
# NOTE(review): `rules` has not been generated at this point — an apriori()
# call (e.g. rules <- apriori(transactions, parameter = ...)) appears to be
# missing before this plot.
plot(rules)
# Order rules by support (descending) and inspect the strongest ones.
rules<- sort(rules, by="support", decreasing = T)
inspect(head(rules))
plot(rules)
inspect(head(rules))
Output Screen:-
Ques 5:-
Use Naive bayes, K-nearest, and Decision tree classification algorithms and build classifiers.
Divide the data set into training and test set. Compare the accuracy of the different classifiers
under the following situations:
5.1 a) Training set = 75%, Test set = 25%; b) Training set = 66.6% (2/3rd of
total), Test set = 33.3%.
5.2 Training set is chosen by i) hold out method ii) Random subsampling iii) Cross-Validation.
Compare the accuracy of the classifiers obtained.
5.3 Data is scaled to standard format.
Code:-
# Ques 5, part (a): hold-out split 75% train / 25% test; build Naive Bayes,
# kNN and decision-tree classifiers and compare their accuracy.
data(iris)
print(summary(iris))
print(head(iris, 5))

# library imports
library(caTools)
library(rpart)
#?rpart
library(e1071)
library(class)
library(rpart.plot)
library(caret)

# part a
# Hold-out Method: stratified split so each species keeps its proportion.
set.seed(143)
split <- sample.split(iris$Species, SplitRatio = 0.75)
training_set <- subset(iris, split == TRUE)
testing_set <- subset(iris, split == FALSE)
#summary(training_set)
#summary(testing_set)
# feature scaling
print(dim(training_set))
print(dim(testing_set))

# Naive Bayes — the fit/predict lines were missing from the fragment;
# reconstructed from the pattern used elsewhere in this file.
classifier_naive <- naiveBayes(Species ~ ., data = training_set)
print(classifier_naive)
y_pred <- predict(classifier_naive, newdata = testing_set)
print(y_pred)
# confusion matrix
cm <- table(testing_set$Species, y_pred)
print(cm)
print(confusionMatrix(cm))

# K-Nearest — the knn() call was truncated (only the cl= and k= arguments
# survived); reconstructed.
classifier_knn <- knn(train = training_set[, 1:4],
                      test = testing_set[, 1:4],
                      cl = training_set$Species,
                      k = 1)
print(classifier_knn)
# confusion matrix
cm <- table(testing_set$Species, classifier_knn)
print(cm)

# DECISION TREE
dtm <- rpart(Species ~ ., training_set, method = "class")
#plot(dtm)
p <- predict(dtm, testing_set, type = "class")
#print(confusionMatrix(testing_set[,5],p))
print(confusionMatrix(testing_set[, 5], p)$table)
print(confusionMatrix(testing_set[, 5], p)$overall["Accuracy"] * 100)
# Random subsampling (part a): repeat the 75/25 split 5 times and record the
# accuracy of each classifier on every repetition.
print("Random Subsampling")
dtm_acc <- list()
knn_acc <- list()
naive_acc <- list()
# NOTE(review): the original loop had no braces, so only the first statement
# repeated; braces added. The fixed set.seed was also inside the loop, which
# would make all five "random" subsamples identical — moved outside.
set.seed(123)
for (x in 1:5) {
  split <- sample.split(iris$Species, SplitRatio = 0.75)
  train_set_rndm <- subset(iris, split == TRUE)
  test_set_rndm <- subset(iris, split == FALSE)
  dim(test_set_rndm)
  dim(train_set_rndm)

  # Decision tree — fit line was missing; reconstructed.
  dtm_rndm <- rpart(Species ~ ., train_set_rndm, method = "class")
  #plot(dtm_rndm)
  #rpart.plot(dtm_rndm)
  p <- predict(dtm_rndm, test_set_rndm, type = "class")
  confusionMatrix(test_set_rndm[, 5], p)$table
  dtm_acc[x] <- confusionMatrix(test_set_rndm[, 5],
                                p)$overall['Accuracy'] * 100

  # kNN — fit line was missing; reconstructed.
  classifier_knn_rndm <- knn(train = train_set_rndm[, 1:4],
                             test = test_set_rndm[, 1:4],
                             cl = train_set_rndm$Species,
                             k = 1)
  # confusion matrix
  cm <- table(test_set_rndm$Species, classifier_knn_rndm)
  cm
  misclasserror <- mean(classifier_knn_rndm != test_set_rndm$Species)
  print(paste("Accuracy of knn model is :: ", 1 - misclasserror))
  knn_acc[x] <- confusionMatrix(test_set_rndm[, 5],
                                classifier_knn_rndm)$overall['Accuracy'] * 100

  # Naive Bayes — fit/predict lines were missing; reconstructed.
  classifier_naive <- naiveBayes(Species ~ ., data = train_set_rndm)
  predicted_y <- predict(classifier_naive, newdata = test_set_rndm)
  table(predicted_y)
  # confusion matrix
  cm <- table(test_set_rndm$Species, predicted_y)
  print(cm)
  confusionMatrix(cm)
  naive_acc[x] <- confusionMatrix(test_set_rndm$Species,
                                  predicted_y)$overall['Accuracy'] * 100
}
dtm_acc
knn_acc
naive_acc
# Cross-validation (part a): 10-fold CV via caret::train.
# NOTE(review): all three train() calls were truncated (unclosed parentheses,
# trControl argument lost); reconstructed with standard 10-fold CV.
print("Cross Validation")
cv_ctrl <- trainControl(method = "cv", number = 10)
# Naive Bayes
nb_model <- train(iris[, 1:4], iris[, 5], 'nb', trControl = cv_ctrl)
print(nb_model)
# KNN
knn_model <- train(iris[, 1:4], iris[, 5], 'knn', trControl = cv_ctrl)
print(knn_model)
# Decision Tree
Dtree_model <- train(iris[, 1:4], iris[, 5], 'rpart', trControl = cv_ctrl)
print(Dtree_model)
# part b
print("Part b start Training set = 66.6% (2/3rd of total), Test set = 33.3% ")
# Hold-out Method: stratified 2/3 train / 1/3 test split.
set.seed(143)
split <- sample.split(iris$Species, SplitRatio = 0.6666)
training_set <- subset(iris, split == TRUE)
testing_set <- subset(iris, split == FALSE)
# feature scaling
print(dim(training_set))
print(dim(testing_set))

# Naive Bayes — fit/predict lines were missing from the fragment;
# reconstructed from the pattern used in part (a).
classifier_naive <- naiveBayes(Species ~ ., data = training_set)
print(classifier_naive)
y_pred <- predict(classifier_naive, newdata = testing_set)
print(y_pred)
# confusion matrix
cm <- table(testing_set$Species, y_pred)
print(cm)
print(confusionMatrix(cm))

# K-Nearest — the knn() call was truncated (only cl= and k= survived);
# reconstructed.
classifier_knn <- knn(train = training_set[, 1:4],
                      test = testing_set[, 1:4],
                      cl = training_set$Species,
                      k = 1)
print(classifier_knn)
# confusion matrix
cm <- table(testing_set$Species, classifier_knn)
print(cm)

# DECISION TREE
dtm <- rpart(Species ~ ., training_set, method = "class")
#rpart.plot(dtm)
p <- predict(dtm, testing_set, type = "class")
print(confusionMatrix(testing_set[, 5], p))
# Random subsampling (part b): repeat the 66/34 split 5 times and record the
# accuracy of each classifier on every repetition.
print("Random Subsampling")
dtm_acc <- list()
knn_acc <- list()
naive_acc <- list()
# NOTE(review): as in part (a), the loop had no braces and a fixed seed
# inside the loop (identical resamples); braces added, seed moved outside.
set.seed(123)
for (x in 1:5) {
  split <- sample.split(iris$Species, SplitRatio = 0.66)
  train_set_rndm <- subset(iris, split == TRUE)
  test_set_rndm <- subset(iris, split == FALSE)
  dim(test_set_rndm)
  dim(train_set_rndm)
  train_scale_rndm <- scale(train_set_rndm[, 1:4])

  # Decision tree — fit line was missing; reconstructed.
  dtm_rndm <- rpart(Species ~ ., train_set_rndm, method = "class")
  #plot(dtm_rndm)
  #rpart.plot(dtm_rndm)
  p <- predict(dtm_rndm, test_set_rndm, type = "class")
  confusionMatrix(test_set_rndm[, 5], p)$table
  dtm_acc[x] <- confusionMatrix(test_set_rndm[, 5],
                                p)$overall['Accuracy'] * 100

  # kNN — fit line was missing; reconstructed. The accuracy print and
  # knn_acc assignment present in part (a) were also lost; restored.
  classifier_knn_rndm <- knn(train = train_set_rndm[, 1:4],
                             test = test_set_rndm[, 1:4],
                             cl = train_set_rndm$Species,
                             k = 1)
  # confusion matrix
  cm <- table(test_set_rndm$Species, classifier_knn_rndm)
  cm
  misclasserror <- mean(classifier_knn_rndm != test_set_rndm$Species)
  print(paste("Accuracy of knn model is :: ", 1 - misclasserror))
  knn_acc[x] <- confusionMatrix(test_set_rndm[, 5],
                                classifier_knn_rndm)$overall['Accuracy'] * 100

  # Naive Bayes — fit/predict lines were missing; reconstructed.
  classifier_naive <- naiveBayes(Species ~ ., data = train_set_rndm)
  predicted_y <- predict(classifier_naive, newdata = test_set_rndm)
  table(predicted_y)
  # confusion matrix
  cm <- table(test_set_rndm$Species, predicted_y)
  print(cm)
  confusionMatrix(cm)
  naive_acc[x] <- confusionMatrix(test_set_rndm$Species,
                                  predicted_y)$overall['Accuracy'] * 100
}
dtm_acc
knn_acc
naive_acc
# Cross-validation (part b): 10-fold CV via caret::train.
# NOTE(review): the three train() calls were truncated (unclosed parentheses,
# trControl argument lost); reconstructed with standard 10-fold CV.
print("Cross Validation")
cv_ctrl <- trainControl(method = "cv", number = 10)
# Naive Bayes
print("Naive bayes-Cross Validation-part b")
nb_model <- train(iris[, 1:4], iris[, 5], 'nb', trControl = cv_ctrl)
print(nb_model)
# KNN
knn_model <- train(iris[, 1:4], iris[, 5], 'knn', trControl = cv_ctrl)
print(knn_model)
# DECISION TREE
Dtree_model <- train(iris[, 1:4], iris[, 5], 'rpart', trControl = cv_ctrl)
print(Dtree_model)
Output Screen:-
PLOTS:-
Ques 6:-
Use Simple Kmeans, DBScan, Hierachical clustering algorithms for clustering. Compare the
performance of clusters by changing the parameters involved in the algorithms.
Code:-
# Ques 6: k-means, hierarchical and DBSCAN clustering on the HTRU2 dataset,
# comparing cluster assignments against the class label in column 9.
df <- read.csv("datasets/HTRU2/HTRU_2.csv")
print(summary(df))
#View(df)
str(df)
library(ggplot2)

# Standardize the 8 feature columns (column 9 is the class label).
# NOTE(review): the scaling line was missing from the fragment; reconstructed.
dfScaled <- scale(df[, 1:8])
summary(dfScaled)

# ---- K-MEANS ----
set.seed(234)
# The kmeans fit was missing; reconstructed. HTRU2 has 2 classes, so k = 2.
kmeans_model <- kmeans(dfScaled, centers = 2)
print(kmeans_model)
print(kmeans_model$cluster)
# Cross-tabulate clusters against the true label (cm was never created).
cm <- table(df[, 9], kmeans_model$cluster)
print(cm)
#library(caret)
#print(confusionMatrix(cm))

# ---- HIERARCHICAL ----
d <- dist(dfScaled)
fitH <- hclust(d, "ward.D2")
print(fitH)
plot(fitH)
rect.hclust(fitH, k = 3, border = "blue")
cluster <- cutree(fitH, 3) # cut the tree at the desired number of clusters
cluster
table(df[, 9], cluster)

# ---- DBSCAN ----
#install.packages("dbscan",dependencies = TRUE)
library(dbscan)
# Choose eps from the k-NN distance "elbow" plot. NOTE(review): the plot call
# preceding abline() was missing; reconstructed with kNNdistplot.
kNNdistplot(dfScaled, k = 5)
abline(h = 0.7, col = "red", lty = 2)  # fixed OCR typo: `Ity` -> `lty`
fitD <- dbscan(dfScaled, eps = 0.7, minPts = 5)
print(fitD)
plot(df, col = fitD$cluster)
table(df[, 9], fitD$cluster)
Output Screen:-
Plots:-