ADBMS Journal
INDEX
4. Implementation of ETL transformations with Pentaho: copy data from a source (table/Excel/Oracle) and store it in a target (table/Excel/Oracle); adding a sequence; calculator; concatenation of two fields; splitting of a field; number range; string operations; sorting data; merge join; transformations on tables; data validations on table data
-- CUBE: aggregates at (emp_name, branch), per emp_name, per branch, plus a grand total
SELECT emp_name, branch, SUM(emp_sal)
FROM employee1
GROUP BY CUBE(emp_name, branch)
ORDER BY emp_name, branch;

-- ROLLUP: aggregates at (emp_name, branch) and per emp_name, plus a grand total
SELECT emp_name, branch, SUM(emp_sal)
FROM employee1
GROUP BY ROLLUP(emp_name, branch)
ORDER BY emp_name, branch;
Q1) Create an ADT name_type with the attributes fname, mname and lname to store name details.
Display books having a price in the range of 1000 to 2500 and published after January 2018.
Q4) Create an object table hostel of hostel_type (with attributes host_no, host_name and type (boys/girls)).
Create a table Student with attributes sid, sname, gender, year and hostel_info referencing the object table.
Display the details of all girl students.
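A minimal SQL sketch for these exercises, assuming Oracle's object-relational syntax; the books table, its column names, and the gender coding are illustrative assumptions, not from the journal:

CREATE OR REPLACE TYPE name_type AS OBJECT (
  fname VARCHAR2(20),
  mname VARCHAR2(20),
  lname VARCHAR2(20)
);
/

-- illustrative: assumes a books table with price and pub_date columns
SELECT * FROM books
WHERE price BETWEEN 1000 AND 2500
  AND pub_date > DATE '2018-01-31';

CREATE OR REPLACE TYPE hostel_type AS OBJECT (
  host_no   NUMBER,
  host_name VARCHAR2(30),
  type      VARCHAR2(5)    -- 'boys' or 'girls'
);
/

CREATE TABLE hostel OF hostel_type;

CREATE TABLE student (
  sid         NUMBER,
  sname       VARCHAR2(30),
  gender      CHAR(1),     -- assumed coding: 'F' / 'M'
  year        NUMBER,
  hostel_info REF hostel_type SCOPE IS hostel
);

-- details of all girl students
SELECT s.sid, s.sname, s.year, DEREF(s.hostel_info).host_name AS hostel
FROM student s
WHERE s.gender = 'F';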
1. Click on Input and drag and drop Table input onto the canvas.
2. Double-click the Table input icon, configure it, and click OK.
3. Click Run.
4. When the run finishes successfully, check the output in SQL*Plus by querying the table name you gave in Table output.
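For example, if the Table output step was pointed at a table named emp_target (a hypothetical name), the check in SQL*Plus would be:

SELECT * FROM emp_target;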
Aim: Implementation of ETL transformation with Pentaho like Adding sequence.
Aim: Implementation of ETL transformation with Pentaho like calculator.
Calculator
1. Double-click on Table input, write the SQL SELECT statement for your source table, and click OK.
2. Preview -> Close -> OK.
3. Double-click on Calculator, configure it, and click OK.
4. Double-click on Table output and get all the fields.
5. Execute -> OK, then Run.
Aim: Implementation of ETL transformation with Pentaho like splitting of a field.
Split
1. Click Preview data, then Close.
2. Select the transformation "SPLIT".
3. Add the changes as follows and click OK.
Output
Aim: Implementation of ETL transformation with Pentaho like Number Range.
Number Range
1. Set up the input table.
2. Preview the data.
3. Configure the Number Range transformation as follows.
4. Check the SQL output.
Aim: Implementation of ETL transformation with Pentaho like Sorting Data.
Sorting
1. Drag and drop Table input, Table output, Sort rows and Add sequence from the Transform section, then click OK.
2. Double-click on Add sequence and click OK.
3. Double-click on Table output, give a name to your output table, then click Execute and OK.
4. Run the transformation.
5. Check the output in SQL*Plus.
Output
Merge join
1. Create a new transformation and drag and drop the Data grid, Sort rows and Merge join steps.
2. Add all the values in the columns and click on Preview, then OK.
3. Double-click on the Stu_sort Sort rows step, add the column on which you want to perform sorting, and click OK.
4. Double-click on the marks_sort Sort rows step and add the column on which you want to perform sorting.
NOTE: We can perform an inner join, left outer join, right outer join or full outer join by selecting the join type.
5. Run the transformation (inner join).
6. Run the transformation (left outer join).
Data validation
Click OK; the validation name then appears in the left-hand panel. Double-click it to configure the validation.
Creating Variables
X <- c(2.3,5.9,4.5,6.2,8.5)          # or
assign("X", c(2.3,5.9,4.5,6.2,8.5))  # or
c(2.3,5.9,4.5,6.2,8.5) -> X
Printing a variable
X         #auto printing
print(X)  #explicit printing
Creating vector
#using c() function
X<-c(1,2,3,4,5)
#using vector() function
Y<-vector("numeric",4)   # vector() takes a mode and a length, not the element values
Doing calculation
#calculator
2+3
a<-4-3
a
2^2
5*5
5/6
log(2)
sqrt(4)
factorial(5)
exp(8)
mode(5)
x<-c(2,6,4,9)
y<-c(1,6,4)
x+y   # warning: longer object length is not a multiple of shorter object length
y<-c(1,6,4,7)
x+y
x-y
x*y
x/y
Creating vector
#create vector
a<-c(1,2,3)
typeof(a)
aa<-c(12.3,34.6)
aa
(as<-c(1,2))
print(as)
al<-vector(logical,10)            # error: the mode must be a quoted string
al<-vector(mode = "logical",10)   # correct form
al
Creating data frame
#data frame
student_id<-c(1,2,3)
student_name<-c("k","l","h")
df<-data.frame(student_id,student_name)
df
df$student_id
df$student_name
nrow(df)
ncol(df)
names(df)
Creating matrix
#matrix
m<-matrix(c(1,2,3,4,5,6,7,8,9),nrow = 3,ncol = 3)
m
dim(m)
attributes(m)
m<-matrix(c(1,2,3,4,5,6,7,8,9),nrow = 3,ncol = 3,byrow = T)
m
x1<-c(1,2,3)
y1<-c(4,5,6)
c<-cbind(x1,y1)   # note: this shadows the built-in c() function; a different name is safer
c
r<-rbind(x1,y1)
r
c*2
m1<-matrix(c(11,12,13,14,15,16,17,18,19),nrow = 3,ncol = 3,byrow = TRUE)
m1
m
m+m1
m-m1
m*m1
m/m1
typeof(m)
t(m)
t(m1)
Creating list
l1<-list(1,22.3,"d",T,2+1i)
l1
typeof(l1)
class(l1)
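A brief assumed extension (not in the original) showing how the list's elements are accessed:

l1[[1]]   # first element: 1
l1[[3]]   # third element: "d"
str(l1)   # compact overview of each element's type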
x<-c(1,2,3,NA,4,4,NA)            # a vector with NAs to impute
x[is.na(x)]<-mean(x,na.rm = T)   # mean imputation
x
x<-c(1,2,3,NA,4,4,NA)            # reset before median imputation
x[is.na(x)]<-median(x,na.rm = T)
x
#Package Hmisc implements many imputation methods; a few examples:
library(Hmisc)
x = c(1,2,3,NA,4,4,NA)
# mean imputation - from package, mention name of function to be used
x <- impute(x, fun = mean)
x
#or
#median imputation (reset x first, since the NAs were already filled above)
x = c(1,2,3,NA,4,4,NA)
x <- impute(x, fun = median)
x
#Categorical Data: factors are variables in R which take on a limited number of different values; such variables are often referred to as categorical variables.
#Convert Character into Factor(categorical data)
# Create gender vector
gender_vector <- c("Male", "Female", "Female", "Male", "Male")
class(gender_vector)
# Convert gender_vector to a factor
factor_gender_vector <-factor(gender_vector)
class(factor_gender_vector)
# Create Ordinal categorical vector
day_vector <- c('evening', 'morning', 'afternoon', 'midday', 'midnight', 'evening')
# Convert `day_vector` to a factor with ordered level
factor_day <- factor(day_vector, order = TRUE, levels =c('morning', 'midday', 'afternoon', 'evening', 'midnight'))
# Print the new variable
factor_day
# Convert Numeric to Factor
# Creating vectors
age <- c(40, 49, 48, 40, 67, 52, 53)
salary <- c(103200, 106200, 150200, 10606, 10390, 14070, 10220)
gender <- c("male", "male", "transgender", "female", "male", "female", "transgender")
# Creating data frame named employee
employee<- data.frame(age, salary, gender)
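The conversion step itself does not appear here; a minimal sketch using cut() to bin the numeric age column into a factor (the break points and labels are illustrative assumptions):

# bin age into groups; breaks and labels are made up for illustration
employee$age_group <- cut(employee$age,
                          breaks = c(30, 45, 55, 70),
                          labels = c("young", "middle", "senior"))
class(employee$age_group)   # "factor"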
set.seed(2)
x <- 1:10
class(x)
typeof(x)
y <- 2*x + rnorm(10)   # a response variable was missing; create one with noise
model <- lm(y ~ x)     # fit a simple linear regression
class(model)
typeof(model)
#using predict()
a<-data.frame(x=10)
a
pred<-predict(model,a)
pred
plot(model)            # diagnostic plots for the fitted model
PRACTICAL NO-4
Association Rules (Apriori)
output
## $`1`
## [1] "bread"  "butter" "eggs"   "milk"
##
## $`2`
## [1] "beer"   "bread"  "cheese" "chips"  "mayo"   "soda"
##
## $`3`
## [1] "bread"   "butter"  "eggs"    "milk"    "oranges"
##
## $`4`
## [1] "bread"  "butter" "eggs"   "milk"   "soda"
##
## $`5`
## [1] "buns"    "chips"   "beer"    "mustard" "pickels" "soda"
##
## $`6`
## [1] "bread"     "butter"    "chocolate" "eggs"      "milk"
# loading arules library
library(arules)
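The code that builds the `trans` object is not shown in the journal; a minimal sketch that reconstructs transactions like the six listed above (the printed apriori output mentions 15 transactions, so this is only a partial reconstruction):

# rebuild the six transactions printed above as an arules 'transactions' object
items <- list(
  c("bread","butter","eggs","milk"),
  c("beer","bread","cheese","chips","mayo","soda"),
  c("bread","butter","eggs","milk","oranges"),
  c("bread","butter","eggs","milk","soda"),
  c("buns","chips","beer","mustard","pickels","soda"),
  c("bread","butter","chocolate","eggs","milk")
)
trans <- as(items, "transactions")
inspect(trans)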
output
## Loading required package: Matrix
##
## Attaching package: 'arules'
##
## The following objects are masked from 'package:base':
##
##     abbreviate, write
rules = apriori(trans, parameter=list(support=0.5, confidence=0.9,maxlen=3,minlen=2))
output
##
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.9    0.1    1 none FALSE            TRUE       5     0.5      2
##  maxlen target  ext
##       3  rules TRUE
##
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
##
## Absolute minimum support count: 7
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[15 item(s), 15 transaction(s)] done [0.00s].
## sorting and recoding items ... [4 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3
## Warning in apriori(trans, parameter = list(support = 0.5, confidence = 0.9, :
##   Mining stopped (maxlen reached). Only patterns up to a length of 3 returned!
## done [0.00s].
## writing ... [11 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
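The journal stops after the rules are generated; a short assumed follow-up to actually look at them:

# examine the mined rules, strongest associations first
inspect(sort(rules, by = "lift"))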
PRACTICAL NO-5
Clustering: k-means
install.packages("stats")
library(stats)
library(dplyr)
library(ggplot2)
mydata<-select(iris,c(1,2,3,4))
model<-kmeans(mydata,3)
model
model$cluster
cluster(size)
model$size
table(model$cluster,iris$Species)
model$cluster <- as.factor(model$cluster)
ggplot(iris, aes(Petal.Length, Petal.Width, color = model$cluster))+ geom_point()
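The journal fixes k = 3; an assumed extension (not in the original) sketching the elbow method for choosing k:

# total within-cluster sum of squares for k = 1..10; look for the "elbow"
wss <- sapply(1:10, function(k) kmeans(mydata, k)$tot.withinss)
plot(1:10, wss, type = "b", xlab = "k", ylab = "Total within-cluster SS")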
Agglomerative
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
library(ggplot2)
ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) + geom_point()
[Scatter plot: Petal.Length (x) vs Petal.Width (y), points coloured by Species: setosa, versicolor, virginica]
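The kmeans call that produced the output below is missing from the journal; a sketch consistent with the printed cluster means, which involve only the two petal columns (the seed and nstart values are assumptions):

set.seed(20)   # assumed; any seed gives similar clusters
irisCluster <- kmeans(iris[, 3:4], 3, nstart = 20)
irisCluster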
## K-means clustering with 3 clusters of sizes 52, 48, 50
##
## Cluster means:
##   Petal.Length Petal.Width
## 1     4.269231    1.342308
## 2     5.595833    2.037500
## 3     1.462000    0.246000
##
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2
## [149] 2 2
##
## Within cluster sum of squares by cluster:
## [1] 13.05769 16.29167  2.02200
##  (between_SS / total_SS =  94.3 %)
##
## Available components:
table(irisCluster$cluster, iris$Species)
##
## setosa versicolor virginica
## 1 0 48 4
## 2 0 2 46
## 3 50 0 0
[Scatter plot: Petal.Length (x) vs Petal.Width (y), points coloured by irisCluster$cluster (1, 2, 3)]
#Agglomerative Clustering
head(iris)
clusters <- hclust(dist(iris[, 3:4]))
plot(clusters)
clusterCut <- cutree(clusters, 3)
table(clusterCut, iris$Species)
clusters <- hclust(dist(iris[, 3:4]), method = 'average')
plot(clusters)
clusterCut <- cutree(clusters, 3)
table(clusterCut, iris$Species)
ggplot(iris, aes(Petal.Length, Petal.Width, color = iris$Species)) +
  geom_point(alpha = 0.4, size = 3.5) +
  geom_point(col = clusterCut) +
  scale_color_manual(values = c('black', 'red', 'green'))
PRACTICAL NO-6
Classification
Implementation and analysis of Classification algorithms like
1. Naive Bayesian,
2. K-Nearest Neighbor
Naive Bayes
• Based on Bayes' theorem
• Predicts based on probabilities estimated from the training data

P(B|A) = P(A|B) P(B) / P(A)

This gives the posterior probability of 'B' given 'A', using the prior probability of 'B', the prior probability of 'A', and the conditional probability of 'A' given 'B'.
• Takes a two-step approach:
– Calculates the posterior probability of the class given the input, for every class
– Assigns the class with the highest posterior probability
• Well suited when the dimensionality of the input is high; widely used for document classification
• Also good for multiclass classification
• Works well with small datasets too, but the assumption that the predictor variables are independent should hold
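A tiny worked example of the formula (the numbers are illustrative assumptions, not from the journal):

# P(spam) = 0.3, P("free"|spam) = 0.6, P("free") = 0.25  (made-up values)
p_spam <- 0.3
p_free_given_spam <- 0.6
p_free <- 0.25
p_spam_given_free <- p_free_given_spam * p_spam / p_free
p_spam_given_free   # 0.72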
# loading libraries e1071 and klaR
library(e1071)
library("klaR")
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
unique(iris$Species)
[Pairs plot of Sepal.Length, Sepal.Width, Petal.Length and Petal.Width]
# training a naive Bayes model
index = sample(nrow(iris), floor(nrow(iris) * 0.7)) #70/30 split.
train = iris[index,]
test = iris[-index,]
xTrain = train[,-5] # predictors only (drop the outcome column)
yTrain = train$Species # outcome only
xTest = test[,-5]
yTest = test$Species
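The journal jumps from the data split straight to the resampling output below; the training call itself is missing. A minimal sketch using caret's train() with method = "nb", which is what the fL/usekernel/adjust tuning parameters in the output come from (the use of caret here is an assumption):

library(caret)
# 10-fold cross-validated naive Bayes, matching the output below
model <- train(xTrain, yTrain, method = "nb",
               trControl = trainControl(method = "cv", number = 10))
model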
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 94, 94, 96, 94, 95, 96, ...
## Resampling results across tuning parameters:
##
##   usekernel  Accuracy   Kappa
##   FALSE      0.9401515  0.9092949
##   TRUE       0.9401515  0.9092949
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE
## and adjust = 1.
## table() gives frequency table, prop.table() gives freq% table.
prop.table(table(predict(model$finalModel,xTest)$class,yTest))
##             yTest
##                  setosa versicolor  virginica
##   setosa     0.31111111 0.00000000 0.00000000
##   versicolor 0.00000000 0.31111111 0.00000000
##   virginica  0.00000000 0.04444444 0.33333333
K-Nearest Neighbour
df <- data(iris) ##load data
head(iris) ## see the structure
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
##Generate a random sample of row indices covering 90% of the rows in the dataset.
ran <- sample(1:nrow(iris), 0.9 * nrow(iris))
##the normalization function is created
nor <- function(x) { (x - min(x))/(max(x) - min(x)) }
##Run normalization on the first 4 columns of the dataset because they are the predictors
iris_norm <- as.data.frame(lapply(iris[,c(1,2,3,4)], nor))
summary(iris_norm)
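The journal's copy of this walkthrough stops at normalization; an assumed continuation that uses the `ran` split to fit k-NN (the lab version below does the equivalent):

library(class)
iris_train <- iris_norm[ran,]    # 90% training rows
iris_test <- iris_norm[-ran,]    # remaining 10%
pr <- knn(iris_train, iris_test, cl = iris[ran, 5], k = 13)
table(pr, iris[-ran, 5])         # confusion matrix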
*********************Practical done in lab*********************************
# Naïve Bayes
install.packages("e1071")
library(e1071)
View(iris)
ir=iris
train=ir[1:100,]
test=ir[101:150,]
test$Species
train$Species
model=naiveBayes(Species~.,data=train)   # fit the model (this call was missing in the journal)
pred=predict(model,test)
pred
table(pred)
table(test$Species)
table(train$Species)
# iris rows are ordered by species, so rows 1-100 hold no virginica; shuffle first
ir1=ir[sample(nrow(ir)),]
train=ir1[1:100,]
test=ir1[101:150,]
model=naiveBayes(Species~.,data=train)   # refit on the shuffled split
pred=predict(model,test)
table(pred)
table(train$Species)
table(test$Species)
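A short assumed follow-up to judge accuracy against the true labels:

# confusion matrix: predicted vs actual species
table(pred, test$Species)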
#KNN
table(iris$Species)
str(iris$Species)
head(iris)
#shuffle data
ir1=ir[sample(nrow(ir)),]
#check shuffling
head(ir1)
#create the normalization function
normalize<-function(x){
  return((x-min(x))/(max(x)-min(x)))
}
iris_n<-as.data.frame(lapply(ir1[,c(1,2,3,4)], normalize))
str(iris_n)
iris_train<-iris_n[1:129,]
iris_test<-iris_n[130:150,]
iris_train_target<-ir1[1:129,5]   # take labels from the shuffled ir1, not the original iris
#df<-as.data.frame(iris_train_target)
iris_test_target<-ir1[130:150,5]
library(class)
#model<-knn(iris_train,iris_test,cl=df,k=13)
dim(iris_train)
dim(iris_test)
#dim(df)   # df is commented out above, so this call would fail
model<-knn(iris_train,iris_test,cl=iris_train_target,k=13)
table(iris_test_target,model)
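An assumed one-liner to turn the confusion matrix into an overall accuracy figure:

# fraction of test rows where the predicted label matches the true label
mean(model==iris_test_target)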
PRACTICAL NO-7
Agglomerative clustering
#hierarchical clustering
#agglomerative
# hclust() ships with base R's stats package; no separate install is needed
library(stats)
#USArrests dataset
df<-USArrests
#preprocessing
#remove na values
df<-na.omit(df)
#scale
d<-scale(df)
head(d)
d<-dist(d,method="euclidean")
hc<-hclust(d,method="complete")
plot(hc)
plot(hc,cex=0.1,hang=-1)
hcd=as.dendrogram(hc)
plot(hcd,type="triangle")
plot(cut(hcd,h=75)$upper)
plot(cut(hcd,h=75)$lower[[2]])
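An assumed extension (not in the original): cut the tree into a fixed number of clusters and count the states per group:

# cut the dendrogram into 4 groups and tabulate group sizes
groups <- cutree(hc, k = 4)
table(groups)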