ADBMS Journal

This document is a certificate from St. Wilfred's College of Computer Science certifying that a student has successfully completed practical assignments in Advanced Database Management Systems for the academic year 2023-24. It includes an index of various practical implementations such as data partitioning, analytical queries, ETL transformations, and R programming basics. Additionally, it outlines specific SQL commands and examples related to the practical assignments.


ST. WILFRED’S COLLEGE OF COMPUTER SCIENCE

Bappa Sitaram Road, near Municipal Garden, Sanghvi Nagar, Hatkesh, Mira Road East, Thane,
Mira Bhayandar, Maharashtra 401107

Date:

CERTIFICATE

This is to certify that Mr./Ms. _________________________

Roll No. ________, a student of FYMCA Semester I, has successfully
completed the full-semester practicals/assignments of the subject Advanced
Database Management Systems for the academic year 2023 – 24.

Subject In-Charge Principal

External Examiner
INDEX

Sr. No.   Title                                                                          Sign

1.   Implementation of data partitioning through range and list partitioning

2.   Implementation of analytical queries:
     1) ROLLUP, 2) CUBE, 3) FIRST, 4) LAST, 5) LEAD, 6) LAG, 7) RANK, 8) DENSE_RANK

3.   Implementation of ORDBMS concepts: ADT and REFERENCE

4.   Implementation of ETL transformations with Pentaho, such as copying data from a source
     (table/Excel/Oracle) to a target (table/Excel/Oracle), adding a sequence, calculator,
     concatenation of two fields, splitting of fields, number range, string operations,
     sorting data, merge join, transformations on tables, and data validation on table data

5.   Introduction to R programming and data acquisition: installing packages, loading packages,
     data types, checking the type of a variable, printing variables and objects, cbind-ing and
     rbind-ing, reading and writing data, setwd(), getwd(), data(), rm(), attaching and detaching
     data, reading data from the console, loading data from different data sources (CSV, Excel)

6.   Implementation of data preprocessing techniques such as naming and renaming variables,
     adding a new variable, dealing with missing data, dealing with categorical data, and
     data reduction using subsetting

7.   Implementation and analysis of linear regression through the graphical method

8.   Implementation and analysis of classification algorithms such as naive Bayesian,
     k-nearest neighbour, ID3 and C4.5

9.   Implementation and analysis of the Apriori algorithm using market basket analysis

10.  Implementation and analysis of clustering algorithms such as K-means and agglomerative
Practical No 1
Aim: Implementation of Data Partitioning through
range and list Partitioning.

create table employee1(
  emp_no   number,
  emp_name varchar(20),
  branch   varchar(20),
  emp_sal  number
)
partition by range(emp_sal)(
  partition p1 values less than (6000),
  partition p2 values less than (9000),
  partition p3 values less than (11999)
);

insert into employee1 values(1,'mayuri','mumbai',8000);


insert into employee1 values(2,'manasi','pune',5000);
insert into employee1 values(3,'belesh','thane',10000);
insert into employee1 values(4,'aniket','nagpur',4500);
insert into employee1 values(5,'vikram','agara',2500);
select * from employee1 PARTITION(p1);
select * from employee1 PARTITION(p2);
select * from employee1 PARTITION(p3);

select sum(emp_sal), emp_no from employee1 group by emp_no;

select * from employee1 order by emp_sal;

select * from employee1 order by emp_sal desc;

create table student1(name varchar(20), age number)
partition by range(age)(
  partition a1 values less than (20),
  partition a2 values less than (30)
);

insert into student1 values('mayuri',20);


insert into student1 values('manasi',15);
insert into student1 values('belesh',25);
insert into student1 values('aniket',18);
insert into student1 values('vikram',29);
insert into student1 values('yash',12);
select * from student1 PARTITION(a1);

select * from student1 PARTITION(a2);


Practical No 2
Aim: Implementation of Analytical Queries like
RollUp,CUBE,First ,Last, Lead, Lag, Rank & Dense
Rank

select emp_name, branch, emp_sal,
       rank() over (order by emp_sal) "Rank"
from employee1;

select emp_name, branch, emp_sal,
       dense_rank() over (order by emp_sal) "Rank"
from employee1;

select emp_name, emp_no, emp_sal, branch,
       min(emp_sal) keep (dense_rank first order by emp_sal) over (partition by branch) "lowest",
       max(emp_sal) keep (dense_rank last order by emp_sal) over (partition by emp_no) "highest"
from employee1
order by emp_no, emp_sal;

select emp_name,branch,sum(emp_sal)
from employee1
GROUP BY cube(emp_name,branch)
ORDER BY emp_name,branch;
select emp_name,branch,sum(emp_sal)
from employee1
GROUP BY rollup(emp_name,branch)
ORDER BY emp_name,branch;
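
The aim also lists LEAD and LAG, which the queries above do not cover. A minimal sketch on the same employee1 table (the aliases Next_Sal and Prev_Sal are illustrative):

select emp_name, branch, emp_sal,
       lead(emp_sal, 1) over (order by emp_sal) "Next_Sal",
       lag(emp_sal, 1)  over (order by emp_sal) "Prev_Sal"
from employee1;

Here LEAD returns the salary of the following row and LAG the salary of the preceding row within the ORDER BY emp_sal ordering.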

create table product(
  prdno varchar(20),
  prodname varchar(25),
  rate number,
  qty_avilable number
)
partition by list(prdno)(
  partition pd1 values('p01','p02','p03','p04'),
  partition pd2 values('a01','a02','a03','a04'),
  partition pd3 values('b01','b02','b03','b04'),
  partition pd4 values('c01','c02','c03','c04')
);
insert into product values('b01','books',30,40);
select * from product partition(pd2);
select * from product partition(pd3);

Cricket

create table cricket1(name varchar(20), team varchar(20), run number);

insert into cricket1 values('mayuri','IND',120);
insert into cricket1 values('manasi','IND',150);
insert into cricket1 values('belesh','west indies',200);
insert into cricket1 values('aniket','australian',190);
insert into cricket1 values('vikram','london',100);
insert into cricket1 values('yash','australian',250);
insert into cricket1 values('rupesh','IND',250);
insert into cricket1 values('madhura','west indies',300);

select name,team,run,rank() over (order by run)"Rank" from cricket1;


select name,team,run,dense_rank() over (order by run)"DenseRank" from cricket1;

select name, team, run,
       min(run) keep (dense_rank first order by run) over (partition by team) "Lowest",
       max(run) keep (dense_rank last order by run) over (partition by team) "highest"
from cricket1 order by run, team;
create or replace type ptype as object(sizes number,color varchar(20),article varchar(20));

create table product1(prod_id number,prod_name ptype);

insert into product1 values(1,ptype(10,'red','adx123'));


insert into product1 values(2,ptype(30,'orange','rdg456'));
insert into product1 values(3,ptype(25,'blue','jhg576'));
insert into product1 values(4,ptype(40,'pink','ewd481'));

select p.prod_id,p.prod_name.sizes,p.prod_name.color,p.prod_name.article from product1 p;


select p.prod_id,p.prod_name.sizes from product1 p;

select p.prod_id,p.prod_name.color from product1 p;

select p.prod_id,p.prod_name.article from product1 p;


create table student(name varchar(20),age number,year varchar(20),marks number);

insert into student values('mayuri',21,'fymca',80);


insert into student values('manasi',25,'symca',85);
insert into student values('madhura',19,'fymca',75);
insert into student values('aniket',22,'tymca',90);
insert into student values('yash',24,'tymca',55);
insert into student values('rahul',27,'symca',64);
select name, age, year, marks,
       min(marks) keep (dense_rank first order by marks) over (partition by year) "first",
       max(marks) keep (dense_rank last order by marks) over (partition by year) "last"
from student order by marks, year;
Practical No 3
Aim: Implementation of ORDBMS concepts: Abstract Data Types (ADT) and REFERENCE.

Q1) Create an ADT name_type with the attributes fname, mname and lname to store name details.
Create a table customer with the attributes cust_no, cust_name of name_type, product and price.
Display the first names of all the customers.
Display the details of the customers who purchased the 'Laptop'.
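The journal gives no SQL for these questions. A minimal sketch for Q1, assuming Oracle object types; the sample rows are hypothetical:

create or replace type name_type as object(fname varchar(20), mname varchar(20), lname varchar(20));

create table customer(
  cust_no   number,
  cust_name name_type,
  product   varchar(25),
  price     number
);

insert into customer values(1, name_type('amit','r','shah'),  'Laptop',  55000);
insert into customer values(2, name_type('riya','s','mehta'), 'Printer', 8000);

-- first names of all the customers
select c.cust_name.fname from customer c;

-- details of the customers who purchased the 'Laptop'
select * from customer c where c.product = 'Laptop';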


Q2) Create an ADT date_type with the attributes day, month and year to store date details.
Create a table ordertest with the attributes ord_no, item, cust_name and order_date of date_type.
Display the total number of orders dispatched in the year 2018.
Display the orders in the descending order of month and year.
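A minimal sketch for Q2, again assuming Oracle object types and hypothetical sample data:

create or replace type date_type as object(day number, month number, year number);

create table ordertest(
  ord_no     number,
  item       varchar(25),
  cust_name  varchar(25),
  order_date date_type
);

insert into ordertest values(101, 'Laptop',  'amit', date_type(15, 3, 2018));
insert into ordertest values(102, 'Printer', 'riya', date_type(20, 7, 2017));

-- total number of orders dispatched in the year 2018
select count(*) from ordertest o where o.order_date.year = 2018;

-- orders in the descending order of month and year
select * from ordertest o order by o.order_date.month desc, o.order_date.year desc;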


Q3) Create an abstract datatype Book_type containing Bookno, Title, Author, pub_date and price.
Create a table Books of Book_type.
Display the details of the books written by the author 'Tanenbaum'.
Display the books having a price in the range of 1000 to 2500 and published after January 2018.
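A minimal sketch for Q3, assuming an Oracle object table and hypothetical sample data:

create or replace type book_type as object(
  bookno   number,
  title    varchar(30),
  author   varchar(30),
  pub_date date,
  price    number
);

create table books of book_type;

insert into books values(book_type(1, 'Computer Networks', 'Tanenbaum', date '2019-02-10', 1500));
insert into books values(book_type(2, 'Operating Systems', 'Tanenbaum', date '2017-06-05', 2800));
insert into books values(book_type(3, 'Database Concepts', 'Navathe',   date '2018-09-01', 1200));

-- books written by the author 'Tanenbaum'
select * from books where author = 'Tanenbaum';

-- books priced between 1000 and 2500 and published after January 2018
select * from books
where price between 1000 and 2500
  and pub_date > date '2018-01-31';
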
Q4) Create an object table hostel of hostel_type (with attributes host_no, host_name and type (boys/girls)).
Create a table Student with attributes sid, sname, gender, year and hostel_info referencing the object table.
Display the details of all girl students.
Display the students in hostel no 2.
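A minimal sketch for Q4, assuming Oracle object tables with REF columns; the attribute is named htype to avoid the SQL keyword TYPE, the student table is named student_h to avoid clashing with the student table created earlier, and the data values are hypothetical:

create or replace type hostel_type as object(host_no number, host_name varchar(25), htype varchar(10));

create table hostel of hostel_type;

create table student_h(
  sid         number,
  sname       varchar(25),
  gender      varchar(10),
  year        varchar(10),
  hostel_info ref hostel_type scope is hostel
);

insert into hostel values(hostel_type(1, 'Sahyadri', 'boys'));
insert into hostel values(hostel_type(2, 'Godavari', 'girls'));

insert into student_h
  select 1, 'riya', 'female', 'fymca', ref(h) from hostel h where h.host_no = 2;
insert into student_h
  select 2, 'amit', 'male', 'symca', ref(h) from hostel h where h.host_no = 1;

-- details of all girl students
select * from student_h where gender = 'female';

-- students in hostel no 2
select s.sid, s.sname, s.hostel_info.host_name
from student_h s
where s.hostel_info.host_no = 2;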


Practical No 4
Aim: Implementation of ETL transformation with pentaho like Copy data from source
(table/excel/oracle) and store it to target (table/excel/oracle)

1. Open a new transformation.
2. Click on Input and drag and drop Table Input onto the canvas.
3. Double click on the table_input icon.
4. Click on New to create a connection.
5. Click on Test, then click on OK.
6. Click on Get SQL select statement, select your table and click on OK.
7. Click on Preview; the preview window opens. Click on Close and then OK.
8. The data input step is done. Now create a step for the output: go to Output and drag & drop table_output.

9. Create a hop (connection) between the two steps.
10. Double click on table_output. Give the name of the target table in which the input data will be stored, then select Truncate table and Specify database fields.
11. Click on SQL, click on Execute, then OK and Close. Go to Database fields, click on Get fields, then click on OK.
12. Click on Run.
13. Once the transformation finishes successfully, check the output in SQL*Plus by querying the table name you gave in table_output.
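For example, if the target table were named emp_target (a hypothetical name), the check in SQL*Plus would simply be:

select * from emp_target;
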
Aim: Implementation of ETL transformation with Pentaho like Adding sequence.
Number range
Input table

Preview data
Number Range transformation as follows

Run the transformation


Output

Sql output
Aim: Implementation of ETL transformation with Pentaho like calculator.
Calculator

Double click on table input


Get the Connection.

Click ok
Then go in SQL Select statement and select your table
Click ok
Then Preview->close->OK
Double Click on Calculator

Click OK
Double click on table Output.
Get all the Fields.
Execute-> OK
Then Run

Now Let’s see the output in SQL PLUS


Aim: Implementation of ETL transformation with Pentaho like Splitting of Two
fields.

Split
Click preview data

Click close
Select the transformation “SPLIT”
Add the changes as follows
Click ok
Output
Aim: Implementation of ETL transformation with Pentaho like Number Range.
Number range
Input table
Preview data
Number Range transformation as follows

Run the transformation


Output

Sql output
Aim: Implementation of ETL transformation with Pentaho like Sorting Data.
Sorting
Drag & Drop Input & output table, sort rows and add sequence from transformation.

Double click on Input table.


And create connection.

Select your table from Get SQL Select statement.


Click ok
Double click on sort rows.
Click on get fields.

Click ok
Double click on Add Sequence.
Click ok.
Double click on table output and give the name to your output table

click on
execute and then ok
And then Run.
Let’s see our output in SQL PLUS.

Aim: Implementation of ETL transformation with Pentaho like CSV File.


CSV file
From the input table select “CSV INPUT TABLE”
Make the following changes
Browse for a csv file
Output table

Run transformation
Output
Merge-join
1. Create new transformation and Drag & Drop data grid, sort and Merge join.

2. Double Click on Student data grid table. Enter Columns name.


Then Click on Data.

3. Add all the values in columns and click on preview then OK.

4. Double click on Marks grid table. Enter the columns name.


5. Add all the values in columns and click on preview then OK.

6. Double click on the Stu_sort (Sort rows) step and add the column on which you want to perform the sort. Click OK.

7. Double click on the marks_sort (Sort rows) step and add the column on which you want to perform the sort.

8. Double click on Merge Join and enter the following data.

NOTE: - We can perform inner join, left outer join, right outer join and full outer join by selecting join type.
9. Run the transformation.(Inner join)
10. Run the transformation.(Left Outer join)

11. Run the transformation (Right Outer Join)


12. Run the transformation (Full Outer Join)
Validation
1. Create new transformation and Drag & Drop data grid table, data validation, 2 Dummies and table output table.
Connect them with each other.

2. Double Click on Cust_validate data grid table.

Enter the entire Columns name.

3. Click on data and enter all the values.


Click ok
4. Double click on Cust_validation and add the field on which you want to perform validation.

Click on New validation.

Click OK; the name of the validation then appears in the left-hand corner. Double click on it.

Add the validation values.

Click on OK.
5. Connect first dummy to main output of step connection.

6. Connect second dummy to Error handling of step connection.

7. Double Click on table_output and enter the following data.


8. Click on database fields.
9. Run the transformation.
PRACTICAL NO-1
Basics of R
Introduction to R
R was initially written by Robert Gentleman and Ross Ihaka—also known as “R & R” of the Statistics Department of the
University of Auckland.
R is a language and environment for statistical computing and graphics. It is a GNU project which is similar to the S
language and environment which was developed at Bell Laboratories (formerly AT&T, now Lucent Technologies) by John
Chambers and colleagues.
Comprehensive R Archive Network (CRAN) is a network of FTP and web servers around the world that store identical, up-to-date versions of the code and documentation for R.
Installing R and RStudio
https://cran.r-project.org/
https://rstudio.com/products/rstudio/download/
Setting the working directory
setwd("E:/R_practical")
Getting the working directory
getwd()
Installing packages
install.packages("packagename")
Viewing packages
To list all available packages
library()
To view the complete list of datasets use: library(help = "datasets")
installed.packages()
Loading a package
library(packagename)
To list all datasets
data()
To see the structure of an object
str(objectname)

Creating variables
X <- c(2.3,5.9,4.5,6.2,8.5)            # or
assign("X", c(2.3,5.9,4.5,6.2,8.5))    # or
c(2.3,5.9,4.5,6.2,8.5) -> X
Printing a variable
X          # auto printing
print(X)   # explicit printing

Data Types and Objects


# integer: a <- 2L ; check the type of a variable with typeof(a)
length(a)
attributes(a)   # check metadata
# double: b <- 2
Note: By default R treats any number as double, so to create an integer add L after the number.
# character: c <- "Hi" or c <- 'Hi'
# complex: d <- 3 + 2i
# logical: e <- T or e <- FALSE
Check whether a variable is double
is.double(c)
Convert a variable from one type to another
as.character(c)
Everything we create in R is treated as an object. To list objects use the command
objects() or ls()
To remove an object use the command
rm(objectname)

Creating vector
#using c() function
X<-c(1,2,3,4,5)
#using vector() function (signature: vector(mode, length))
Y<-vector(mode = "numeric", length = 4)
Doing calculation
#calculator
2+3
a<-4-3
a
2^2
5*5
5/6
log(2)
sqrt(4)
factorial(5)
exp(8)
mode(5)

x<-c(2,6,4,9)
y<-c(1,6,4)
x+y
y<-c(1,6,4,7)
x+y
x-y
x*y
x/y
Creating vector
#create vector
a<-c(1,2,3)
typeof(a)
aa<-c(12.3,34.6)
aa
(as<-c(1,2))
print(as)
al<-vector(logical,10)          # error: mode must be given as a quoted string
al<-vector(mode = "logical",10) # correct
al
Creating data frame
#data frame
student_id<-c(1,2,3)
student_name<-c("k","l","h")
df<-data.frame(student_id,student_name)
df
df$student_id
df$student_name
nrow(df)
ncol(df)
names(df)
Creating matrix
#matrix
m<-matrix(c(1,2,3,4,5,6,7,8,9),nrow = 3,ncol = 3)
m
dim(m)
attributes(m)
m<-matrix(c(1,2,3,4,5,6,7,8,9),nrow = 3,ncol = 3,byrow = T)
m
x1<-c(1,2,3)
y1<-c(4,5,6)
c<-cbind(x1,y1)
c
r<-rbind(x1,y1)
r
c*2
m1<-matrix(c(11,12,13,14,15,16,17,18,19),nrow = 3,ncol = 3,byrow = TRUE)
m1
m
m+m1
m-m1
m*m1
m/m1
typeof(m)
t(m)
t(m1)
Creating list
l1<-list(1,22.3,"d",T,2+1i)
l1
typeof(l1)
class(l1)

Loading data from different Data Source


Read csv file
dataT <- read.table("na_data.csv", sep =",", header = T)
dataT <- read.csv("marks.csv", sep =",", header = T)
Create a table
#create a matrix
smoke <- matrix(c(51,43,22,92,28,21,68,22,9),ncol=3,byrow=TRUE)
smoke
# give column names
colnames(smoke) <- c("High","Low","Middle")
rownames(smoke) <- c("current","former","never")
smoke <- as.table(smoke)
smoke
View(smoke)

Read Excel file


Method 1
install.packages("XLConnect")
library("XLConnect")
xlsample<-XLConnect::readWorksheetFromFile("Book1.xlsx",sheet=1)
data1<-xlsample[1:2,]
data1
Method 2
install.packages("readxl")
install.packages("writexl")
library("readxl")
library(writexl)
data2<-read_excel("Book1.xlsx",sheet = 1)
write_xlsx(data1,"samplexl.xlsx")
then check file samplexl.xlsx
PRACTICAL NO-2
Data Preprocessing Techniques
Aim: Implementation of Data preprocessing techniques like,
1. Naming and Renaming variables, adding a new variable.
2. Dealing with missing data.
3. Data reduction using subsetting
4. Dealing with categorical data.

1. Naming and Renaming variables, adding a new variable.


setwd("D:/R")
my_data<-mtcars
head(mtcars,5)
my_data<-my_data[1:6,1:5]
#renaming a column with dplyr::rename()
require(dplyr)
my_data<-rename(my_data,horse_power=hp)
# adding new variable
my_data$new_hp<-my_data$horse_power*0.5
colnames(my_data)
data <- read.table(file="D:/R/missing_col.csv", sep = ",")
data <- read.table(file="D:/R/missing_col.csv", sep =
",",col.names=c("Sno","NAME","SALARY","Date_Of_Joining","Department"))
data
output:
Sno NAME SALARY Date_Of_Joining Department
1 1 Rick 623.30 01/01/2012 IT
2 2 Dan 515.20 23/09/2013 Operations
3 3 Michelle 611.00 15/11/2014 IT
4 4 Ryan 729.00 11/05/2014 HR
5 NA Gary 843.25 27/03/2015 Finance
6 6 Nina NA 21/05/2013 IT
7 7 Simon 632.80 30/07/2013 Operations
8 8 Guru 722.50 17/06/2014 Finance
9 9 John NA 21/05/2012
10 10 Rock 600.80 30/07/2013 HR
11 11 Brad 1032.80 30/07/2013 Operations
12 12 Ryan 729.00 11/05/2014 HR
2. Dealing with missing data
Error Detection and Correction NA: Not Available - Known as missing values
Works as a place holder for something that is ‘missing’
Most basic operations (addition, subtraction, multiplication, etc.) in R deal with it without crashing and return NA if one of
the inputs is NA
is.na(VALUE) is used to check if the input value is NA or not.
Returns a TRUE/FALSE vector whereas in case of Excel like utilities for numeric computations it’s assumed to be 0
Using na.rm(): This will keep NA rows in data while removes them during calculation
e.g. V <- c(1,2,NA,3) median(V, na.rm = T)
Data reduction using subsetting
complete.cases()
Return a logical vector indicating which cases are complete, i.e., have no missing values.
Syntax: complete.cases(…)
It takes a sequence of vectors, matrices and data frames as arguments.
E.g. V <- c(1,2,NA,3)
V[complete.cases(V)]   # [1] 1 2 3
is.na() logical indexing: indicates which elements are missing,
e.g. naVals <- is.na(V); V[!naVals]   # [1] 1 2 3
Detect if there are any NAs: any(is.na(datan))
Identify positions of NAs: which(is.na(datan$v1))
Imputation: The process of estimating or deriving missing values
Missing data imputation is a statistical method that replaces missing data points with substituted values.
There are various methods for imputation
– Imputation of the mean – Imputation of the median – Imputation using linear regression models
• Package Hmisc implements many imputation methods
library(Hmisc)
x = c(1,2,3,NA,4,4,NA)
# mean imputation - from package, mention name of function to be used
x <- impute(x, fun = mean)
x
#median imputation
x <- impute(x, fun = median)
x
Exercise
1. How many missing values are in the built-in data set airquality?
2. Which variables are the missing values concentrated in?
3. How would you impute the mean or median for these values?
4. How would you omit all rows containing missing values?
Answers
#airquality
View(airquality)
#1
sum(is.na(airquality))
df1<-airquality
#2
colSums(is.na(df1))
#3
df1$Ozone[is.na(df1$Ozone)]<-mean(df1$Ozone,na.rm = T)
df1
#4
na.omit(df1)

3. Dealing with categorical data.


c1<-c("low","medium","high","medium","low")
c1<-factor(c1,levels =c("low","medium","high"))
c1

************ Practical done in lab *******************


setwd("D:/R")
my_data<-mtcars
head(mtcars,5)
my_data<-my_data[1:6,1:5]
#renaming a column with dplyr::rename()
require(dplyr)
my_data<-rename(my_data,horse_power=hp)
# adding new variable
my_data$new_hp<-my_data$horse_power*0.5
colnames(my_data)
data <- read.table(file="D:/R/missing_col.csv", sep = ",")
data <- read.table(file="D:/R/missing_col.csv", sep =
",",col.names=c("Sno","NAME","SALARY","Date_Of_Joining","Department"))
data
#Error Detection and Correction
#Error Detection and Correction NA: Not Available - Known as missing values
#Works as a place holder for something that is 'missing'
#Most basic operations (addition, subtraction, multiplication, etc.) in R deal with it without crashing and return NA if one
of the inputs is NA
#is.na(VALUE) is used to check if the input value is NA or not.
#Returns a TRUE/FALSE vector whereas in case of Excel like utilities for numeric computations it's assumed to be 0
# Operation with NA
NA + 4
#Create a vector V with 1 NA value
V <- c(1,2,NA,3)
# Median with and without NA (remove NA)
median(V)
# On removing NAs
median(V, na.rm = T)
# is.na(): indicates which elements are missing.
# Apply is.na() to vector
is.na(V)
# Removing the NA values by using logical indexing
naVals <- is.na(V)
# Get values that are not NA
V[!naVals]
#Detect if there are any NAs
any(is.na(data))
#Identify positions of NAs
which(is.na(data$SALARY))

#is.na() in data frame


# data frame with missing data
df <- data.frame(col1 = c(1:3, NA),
col2 = c("this", NA,"is", "text"),
col3 = c(TRUE, FALSE, TRUE, TRUE),
col4 = c(2.5, 4.2, 3.2, NA))
is.na(df)
is.na(df$col4)
# identify location of NAs
which(is.na(df))
# identify count of NAs
sum(is.na(df))
#compute the total missing values in each column
colSums(is.na(df))

#record missing values


# Subsetting with complete cases - values that are not NA
#complete.cases in a vector
complete.cases(V)
V[complete.cases(V)]
# Subsetting a data frame with complete cases
complete.cases(df)
# subset with complete.cases to get complete cases
df[complete.cases(df), ]
# or subset with `!` operator to get incomplete cases
df[!complete.cases(df), ]
datan <- read.table(file="D:/R/missing_col.csv",sep=",",na.strings = "")
dataCompleteCases <- datan[complete.cases(datan),]

#na.omit() to omit all rows containing missing values.


na.omit(df)

#Imputation The process of estimating or deriving missing values


#There are various methods for imputation
#- Imputation of the mean - Imputation of the median - Imputation using linear regression models

x <- c(1,2,3,NA,4,4,NA)   # define x before imputing
x[is.na(x)]<-mean(x,na.rm = T)
x
x[is.na(x)]<-median(x,na.rm = T)
#Package Hmisc implements many imputation methods; a few examples
library(Hmisc)
x = c(1,2,3,NA,4,4,NA)
# mean imputation - from package, mention name of function to be used
x <- impute(x, fun = mean)
x
#or

#median imputation
x <- impute(x, fun = median)
x
#Categorical Data: Factors are variables in R which take on a limited number of different values; such variables are
#often referred to as categorical variables.
#Convert Character into Factor(categorical data)
# Create gender vector
gender_vector <- c("Male", "Female", "Female", "Male", "Male")
class(gender_vector)
# Convert gender_vector to a factor
factor_gender_vector <-factor(gender_vector)
class(factor_gender_vector)
# Create Ordinal categorical vector
day_vector <- c('evening', 'morning', 'afternoon', 'midday', 'midnight', 'evening')
# Convert `day_vector` to a factor with ordered level
factor_day <- factor(day_vector, order = TRUE, levels =c('morning', 'midday', 'afternoon', 'evening', 'midnight'))
# Print the new variable
factor_day
# Convert Numeric to Factor
# Creating vectors
age <- c(40, 49, 48, 40, 67, 52, 53)
salary <- c(103200, 106200, 150200, 10606, 10390, 14070, 10220)
gender <- c("male", "male", "transgender", "female", "male", "female", "transgender")
# Creating data frame named employee
employee<- data.frame(age, salary, gender)

# Creating a factor corresponding to age with labels


wfact = cut(employee$age, 3, labels=c('Young', 'Medium', 'Aged'))
table(wfact)
#homework
#airquality
View(airquality)
df1<-airquality
df1$Ozone[is.na(df1$Ozone)]<-mean(df1$Ozone,na.rm = T)
df1
PRACTICAL NO-3
Linear Model

set.seed(2)
x <- 1:10
class(x)
typeof(x)

y <- x/5 + rnorm(10)


class(y)
typeof(y)

g <- lm(y ~ x)
class(g)
typeof(g)

class(g) <- "myLM"


print.myLM <- function(x){
print(x$coefficients)
}
g

**************** practical done in R *****************************


#Implementation and analysis of Linear regression through graphical methods including Plots
# steps in LR
# 1. create relationship model using lm()
# 2.find coefficients from model created and create mathematical equation
# 3. get summary to know average error(residual) in prediction
# 4. use predict() for prediction of new tuple
#simple Linear Regression
#Predictor variable
#employee experience
x<-c(3,8,9,13,3,6,11,21,1,16)
#age
z<-c(1,2,3,4,5,6,7,8,9,10)
#response variable
#salary
y<-c(30,57,64,72,36,43,59,90,20,83)
library(ggplot2)
plot(x,y,col='red', main ="scatter plot")

# linear regression model


model=lm(y~x+z)
model
attributes(model)
coef(model)
residuals(model)
summary(model)
abline(model)   # note: with a multiple-regression model, abline() uses only the first two coefficients
#predicting values manually y=a+bx+cz
x10<-model$coefficients[[1]]+model$coefficients[[2]]*10+model$coefficients[[3]]*5
x10

#using predict()
a<-data.frame(x=10, z=5)   # predict() needs a value for every predictor in the model
a
pred<-predict(model,a)
pred

plot(model)
PRACTICAL NO-4

Implementation and analysis of Apriori Algorithm using Market Basket Analysis.


# we are creating a data frame by importing csv file
mba_data<-read.csv("data_apriori.csv")
# you can check the top 6 observations using the head() function
trans <- split(mba_data$Products, mba_data$Customer_Id)   # one basket (vector of items) per customer
head(trans)

output

## $`1`
## [1] "bread" "butter" "eggs" "milk"
##
## $`2`
## [1] "beer" "bread" "cheese" "chips" "mayo" "soda"
##
## $`3`
## [1] "bread" "butter" "eggs" "milk" "oranges"

## $`4`
## [1] "bread" "butter" "eggs" "milk" "soda"

## $`5`
## [1] "buns" "chips" "beer" "mustard" "pickels" "soda"
##
## $`6`
## [1] "bread" "butter" "chocolate" "eggs" "milk"
# loading arules library
library(arules)

output
## Loading required package: Matrix ##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
rules = apriori(trans, parameter=list(support=0.5, confidence=0.9,maxlen=3,minlen=2))

output

## Warning in asMethod(object): removing duplicated items in transactions
## Apriori
##
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen maxlen target  ext
##         0.9    0.1    1 none FALSE            TRUE       5     0.5      2      3  rules TRUE
##
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
##
## Absolute minimum support count: 7
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[15 item(s), 15 transaction(s)] done [0.00s].
## sorting and recoding items ... [4 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3
## Warning in apriori(trans, parameter = list(support = 0.5, confidence = 0.9, :
##   Mining stopped (maxlen reached). Only patterns up to a length of 3 returned!
## done [0.00s].
## writing ... [11 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].

output

##      lhs             rhs        support    confidence coverage   lift     count
## [1] {eggs} => {milk} 0.6000000 1 0.6000000 1.666667 9
## [2] {milk} => {eggs} 0.6000000 1 0.6000000 1.666667 9
## [3] {butter} => {bread} 0.6000000 1 0.6000000 1.250000 9
## [4] {butter,eggs} => {milk} 0.5333333 1 0.5333333 1.666667 8
## [5] {butter,milk} => {eggs} 0.5333333 1 0.5333333 1.666667 8
## [6] {bread,eggs} => {milk} 0.5333333 1 0.5333333 1.666667 8
## [7] {bread,milk} => {eggs} 0.5333333 1 0.5333333 1.666667 8
## [8] {butter,eggs} => {bread} 0.5333333 1 0.5333333 1.250000 8
## [9] {bread,eggs} => {butter} 0.5333333 1 0.5333333 1.666667 8
## [10] {butter,milk} => {bread} 0.5333333 1 0.5333333 1.250000 8
## [11] {bread,milk} => {butter} 0.5333333 1 0.5333333 1.666667 8
PRACTICAL NO-5
///////////////////////////////////Practical done in lab//////////////////////

install.packages("stats")
library(stats)
library(dplyr)
library(ggplot2)
mydata<-select(iris,c(1,2,3,4))
model<-kmeans(mydata,3)
model
model$cluster
cluster(size)
model$size
table(model$cluster,iris$Species)
model$cluster <- as.factor(model$cluster)
ggplot(iris, aes(Petal.Length, Petal.Width, color = model$cluster))+ geom_point()

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

Implementation and analysis of clustering algorithms like
1. K-Means
2. Agglomerative
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
library(ggplot2)
ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) + geom_point()
[Figure: scatter plot of Petal.Length (x-axis, 2-6) against Petal.Width (y-axis, 0.0-2.5), coloured by Species (setosa, versicolor, virginica).]
## K-means clustering with 3 clusters of sizes 52, 48, 50
##
## Cluster means:
##   Petal.Length Petal.Width
## 1     4.269231    1.342308
## 2     5.595833    2.037500
## 3     1.462000    0.246000
##
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2
## [149] 2 2
##
## Within cluster sum of squares by cluster:
## [1] 13.05769 16.29167  2.02200
##  (between_SS / total_SS = 94.3 %)
##
## Available components:
table(irisCluster$cluster, iris$Species)
##
## setosa versicolor virginica
## 1 0 48 4
## 2 0 2 46
## 3 50 0 0

irisCluster$cluster <- as.factor(irisCluster$cluster)


ggplot(iris, aes(Petal.Length, Petal.Width, color = irisCluster$cluster)) + geom_point()

[Figure: scatter plot of Petal.Length (x-axis, 2-6) against Petal.Width (y-axis, 0.0-2.5), coloured by irisCluster$cluster.]

#Agglomerative Clustering
head(iris)
clusters <- hclust(dist(iris[, 3:4]))
plot(clusters)
clusterCut <- cutree(clusters, 3)
table(clusterCut, iris$Species)
clusters <- hclust(dist(iris[, 3:4]), method = 'average')
plot(clusters)
clusterCut <- cutree(clusters, 3)
table(clusterCut, iris$Species)
ggplot(iris, aes(Petal.Length, Petal.Width, color = iris$Species)) +
  geom_point(alpha = 0.4, size = 3.5) +
  geom_point(col = clusterCut) +
  scale_color_manual(values = c('black', 'red', 'green'))

PRACTICAL NO-6

Classification
Implementation and analysis of Classification algorithms like
1. Naive Bayesian,
2. K-Nearest Neighbor
Naive Bayes
• Based on Bayes' theorem
• Predicts based on probabilities derived from the training data:
  P(B|A) = P(A|B) P(B) / P(A)
  which gives the posterior probability of 'B' given 'A' using the prior probability of 'B', the prior probability of 'A', and the conditional probability of 'A' given 'B'.
• Takes a two-step approach:
  – calculates the posterior probability of the class given the input, for every class
  – assigns the class with the highest posterior probability
• Well suited when the dimensionality of the input is high; widely used for document classification
• Also good for multiclass classification
• Works well with small datasets too, but the assumption that the predictor variables are independent should hold
# loading libraries
library(e1071)
library("klaR")
## Loading required package: MASS
library("caret")
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
# iris dataset
data(iris)
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
unique(iris$Species)

## [1] setosa     versicolor virginica
## Levels: setosa versicolor virginica

#Plot graph
pairs(iris[1:4], main = "Iris Data (red=setosa, green=versicolor, blue=virginica)",
      pch = 21, bg = c("red","green3","blue")[unclass(iris$Species)])

[Figure: pairs plot titled "Iris Data (red=setosa, green=versicolor, blue=virginica)" showing the scatter-plot matrix of Sepal.Length, Sepal.Width, Petal.Length and Petal.Width.]
# training a naive Bayes model
index = sample(nrow(iris), floor(nrow(iris) * 0.7)) # 70/30 split.
train = iris[index,]
test  = iris[-index,]
xTrain = train[,-5] # removing y-outcome variable.
yTrain = train$Species # only y.

xTest = test[,-5]
yTest = test$Species

## Naive Bayes
##
## 105 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 94, 94, 96, 94, 95, 96, ...
## Resampling results across tuning parameters:
##
##   usekernel  Accuracy   Kappa
##   FALSE      0.9401515  0.9092949
##   TRUE       0.9401515  0.9092949
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE and adjust = 1.

## table() gives a frequency table, prop.table() gives a frequency-percentage table.
prop.table(table(predict(model$finalModel,xTest)$class,yTest))

##             yTest
##                  setosa versicolor  virginica
##   setosa     0.31111111 0.00000000 0.00000000
##   versicolor 0.00000000 0.31111111 0.00000000
##   virginica  0.00000000 0.04444444 0.33333333
K Nearest Neighbour
df <- data(iris) ##load data
head(iris) ## see the structure
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
##Generate a random number that is 90% of the total number of rows in dataset.
ran <- sample(1:nrow(iris), 0.9 * nrow(iris))
##the normalization function is created
nor <-function(x) { (x -min(x))/(max(x)-min(x)) }

##Run normalization on the first 4 columns of the dataset because they are the predictors
iris_norm <- as.data.frame(lapply(iris[,c(1,2,3,4)], nor))

summary(iris_norm)

##   Sepal.Length     Sepal.Width      Petal.Length     Petal.Width
##   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000
##   1st Qu.:0.2222   1st Qu.:0.3333   1st Qu.:0.1017   1st Qu.:0.08333
##   Median :0.4167   Median :0.4167   Median :0.5678   Median :0.50000
##   Mean   :0.4287   Mean   :0.4406   Mean   :0.4675   Mean   :0.45806
##   3rd Qu.:0.5833   3rd Qu.:0.5417   3rd Qu.:0.6949   3rd Qu.:0.70833
##   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000
##extract training set
iris_train <- iris_norm[ran,]
##extract testing set
iris_test <- iris_norm[-ran,]
##extract 5th column of the train dataset because it will be used as the 'cl' argument in the knn function
iris_target_category <- iris[ran,5]
##extract 5th column of the test dataset to measure the accuracy
iris_test_category <- iris[-ran,5]
##load the package class
library(class)
##run knn function
pr <- knn(iris_train,iris_test,cl=iris_target_category,k=13)
##create confusion matrix
tab <- table(pr,iris_test_category)
##this function divides the correct predictions by the total number of predictions, which tells us how accurate the model is

accuracy <- function(x){sum(diag(x)/(sum(rowSums(x)))) * 100}

accuracy(tab)
*********************Practical done in lab*********************************

# Naïve Bayes

install.packages("e1071")

library("e1071") # contain naive bayes classifier

#iris data set

View(iris)

ir=iris

train=ir[1:100,]

test=ir[101:150,]

model=naiveBayes(Species ~.,data = train)

test$Species

train$Species

pred=predict(model,test)

pred

table(pred)

table(test$Species)

table(train$Species)

##############shuffle iris file###########################

ir1=ir[sample(nrow(ir)),]

train=ir1[1:100,]

test=ir1[101:150,]

model=naiveBayes(Species ~. , data = train)

pred=predict(model,test)

table(pred)

table(train$Species)

table(test$Species)

#KNN

table(iris$Species)
str(iris$Species)
head(iris)
#shuffle data

ir1=ir[sample(nrow(ir)),]

#check shuffling

head(ir1)

#scale data using normalization

#create function

normalize<-function(x){
  return((x-min(x))/(max(x)-min(x)))
}

#normalize iris data

iris_n<-as.data.frame(lapply(ir1[,c(1,2,3,4)], normalize))

str(iris_n)

#create train dataset and test dataset

iris_train<-iris_n[1:129,]

iris_test<-iris_n[130:150,]

iris_train_target<-ir1[1:129,5]   # labels must come from the same shuffled data (ir1)

#df<-as.data.frame(iris_train_target)

iris_test_target<-ir1[130:150,5]

library(class)

#model<-knn(iris_train,iris_test,cl=df,k=13)

# check the dimension of the data set

dim(iris_train)

dim(iris_test)

#dim(df)   # df is only created if the commented-out line above is run

model<-knn(iris_train,iris_test,cl=iris_train_target,k=13)

table(iris_test_target,model)

PRACTICAL NO-7
Agglomerative clustering
#hierarchical clustering
#agglomerative
# hclust() is part of the base stats package, so no separate package needs to be installed
library(stats)
#usarrest
df<-USArrests
#preprocessing
#remove na values
df<-na.omit(df)
#scale
d<-scale(df)
head(d)

d<-dist(d,method="euclidean")
hc<-hclust(d,method="complete")
plot(hc)

plot(hc,cex=0.1,hang=-1)

hcd=as.dendrogram(hc)
plot(hcd,type="triangle")

plot(cut(hcd,h=75)$upper)

plot(cut(hcd,h=75)$lower[[2]])

