#PART 1a) : "vqv/ggbiplot"
require(ggplot2)
library(devtools)
install_github("vqv/ggbiplot")
## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
##   Use `force = TRUE` to force installation
require(ggbiplot)
dim(mpg)
## [1] 234 11
hwymedian=setNames(aggregate(mpg$hwy,list(mpg$manufacturer),median),c("manufacturer","hwy"))
ggplot(hwymedian, aes(x = reorder(manufacturer, -hwy), y = hwy)) + geom_point() +
  ylab("Miles Per Gallon(hwy)") + xlab("Manufacturers") + ggtitle("Fuel Efficiency vs Manufacturer")
[Figure: "Fuel Efficiency vs Manufacturer" -- median Miles Per Gallon(hwy) by manufacturer, ordered from most to least efficient: honda, volkswagen, hyundai, audi, nissan, pontiac, subaru, toyota, chevrolet, jeep, ford, mercury, dodge, lincoln, land rover]
# We can see from the plot that the most fuel-efficient manufacturer is Honda and the least is Land Rover, as it lies at the bottom of the ordering.
(b). Write code that displays a graph which plots in the order of decreasing medians of the vehicle’s miles-
per-gallon on highway (hwy) against the type of car (class). Plot the graph and list the classes of vehicle in
the order of their fuel efficiency.
#PART 1b)
classmedian=setNames(aggregate(mpg$hwy,list(mpg$class),median),c("class","hwy"))
ggplot(classmedian, aes(x = reorder(class, -hwy), y = hwy)) + geom_point() +
  ylab("Miles Per Gallon(hwy)") + xlab("Class") + ggtitle("Fuel Efficiency vs Class")
[Figure: "Fuel Efficiency vs Class" -- median Miles Per Gallon(hwy) by vehicle class, ordered from most to least efficient]
# The most efficient classes are compact and midsize, while the least efficient are SUV and pickup.
(c). Draw a bar chart of manufacturers in terms of numbers of different types of cars manufactured. Based
on this, comment on classes of vehicles manufactured by the companies producing the most and the least
fuel efficient vehicles and possible reason(s) for highest/lowest fuel efficiency.
# PART 1c)
# Stacked bars: hwy values summed within each manufacturer, split (filled) by vehicle class
ggplot(mpg, aes(x = manufacturer)) +
  geom_col(aes(y = hwy, fill = class))
[Figure: stacked bar chart of summed hwy by manufacturer, filled by class (2seater, compact, midsize, minivan, pickup, subcompact, suv)]
# From the bar chart we can see that the compact and midsize classes of car are fuel efficient, while the SUV and pickup classes are not.
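The chart above stacks summed hwy values rather than counts; if the aim is simply the number of cars of each class per manufacturer, as the question wording suggests, a count-based variant is a small change (an alternative sketch, not part of the original answer):
ggplot(mpg, aes(x = manufacturer, fill = class)) +
  geom_bar()   # bar height = number of cars per manufacturer, stacked by class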
Exercise 2. The diamonds dataset within R’s ggplot2 contains 10 columns (price, carat, cut, color, clarity,
length(x), width(y), depth(z), depth percentage, top width) for 53940 different diamonds. Using this dataset,
carry out the following tasks.
(a). Write code to plot histograms for carat and price. Plot these graphs and comment on their shapes.
#PART 2a)
library(ggplot2)
ggplot(data = diamonds, aes(x = price)) +
geom_histogram(binwidth = 500,colour="blue") + xlab('Price') +
ylab('Frequency')
[Figure: histogram of diamond Price (binwidth 500), frequency vs price]
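The carat histogram shown next appears without its producing code in the document; a minimal sketch of the analogous call (the binwidth of 0.1 is an assumption):
ggplot(data = diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.1, colour = "blue") +  # binwidth chosen for illustration
  xlab('Carat') + ylab('Frequency')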
[Figure: histogram of Carat (frequency vs carat, 0 to 5)]
(b). Write code to plot bar charts of cut proportioned in terms of color and again bar charts of cuts
proportioned in terms of clarity. Comment on how proportions of diamonds change in terms of clarity and
colour under different cut categories.
#PART 2b)
ggplot(diamonds,
aes(x = clarity,
fill =cut)) +
geom_bar(position = "fill")+ylab("Proportions")
[Figure: bar chart of cut proportions (Fair, Good, Very Good, Premium, Ideal) within each clarity category]
ggplot(diamonds,
aes(x = color,
fill = cut)) +
geom_bar(position = "fill")+ylab("Proportions")
[Figure: bar chart of cut proportions within each color category (D, E, F, G, H, I, J)]
# According to the charts, fair-cut diamonds form the highest proportion within the I1 clarity category and much smaller proportions within the clearer categories.
# On the basis of colour, we can see that all colour categories have roughly equal proportions of the different cut qualities.
(c). Write code to display an appropriate graph that facilitates the investigation of a three-way relationship
between cut, carat and price. Plot the graph. What inferences can you draw regarding the three way
relationship?
#PART 2c)
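The plotting call for this part does not survive in the document; judging from the figure below (price against carat, coloured by cut), a minimal sketch of a call that would produce such a plot (the alpha value is an assumption to reduce overplotting):
ggplot(diamonds, aes(x = carat, y = price, colour = cut)) +
  geom_point(alpha = 0.3)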
[Figure: scatter plot of price vs carat, points coloured by cut (Fair, Good, Very Good, Premium, Ideal)]
# From the plot we can see that most of the diamonds have an ideal cut. There is not much variation across cuts; the price increases with carat for all cut categories.
Exercise 3. Before deciding about selecting a particular machine learning technique for a data science prob-
lem, it is important to study the data distribution particularly through visualization. However, visualizing a
multivariate data with two or more variables is difficult in a two dimensional plot. In this exercise, you are
required to study the R’s iris dataset which is a multivariate data consisting of four features or properties
(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) characterizing three species of iris flower (setosa,
versicolor, and virginica). The principal component analysis (PCA) is a technique that can help facilitate
visualization of a multivariate data distribution. The first two principal components (PC1 and PC2) ob-
tained after applying PCA, can explain the majority of variations in the data. In order to study the data
variability in iris data-set, perform the following tasks.
(a). Write code to obtain PC scores.
#PART 3a)
library(ggplot2)
log.ir <- log(iris[, 1:4])
ir.species <- iris[, 5]
pcairis <- prcomp(log.ir,
center = TRUE,
scale. = TRUE)
print(pcairis)
## Standard deviations (1, .., p=4):
## [1] 1.7124583 0.9523797 0.3647029 0.1656840
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## Sepal.Length 0.5038236 -0.45499872 0.7088547 0.19147575
## Sepal.Width -0.3023682 -0.88914419 -0.3311628 -0.09125405
## Petal.Length 0.5767881 -0.03378802 -0.2192793 -0.78618732
## Petal.Width 0.5674952 -0.03545628 -0.5829003 0.58044745
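The proportion of variance explained by each component (the percentages quoted on the biplot axes in part (b)) can also be read off directly; a small addition for reference:
summary(pcairis)   # PC1 ~ 73% and PC2 ~ 23% of the variance, per the biplot labels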
(b). Write code to obtain a scatter plot representing PC1 vs. PC2, wherein data clusters corresponding to
three flower types are clearly marked using possibly an ellipsoid.
#PART 3b
ggbiplot(pcairis,choice=c(1,2), groups=iris$Species, ellipse=TRUE,
scale=0,var.scale=0.2, colour="blue", varname.size=3)+
ggtitle("PCA visualization",subtitle="ggbiplot()") +
theme(plot.title =element_text(size=15, face="bold", hjust=0.5,
colour = "red"),plot.subtitle =element_text(size=10,
face="bold.italic", hjus
[Figure: "PCA visualization" (ggbiplot) -- PC1 (73.3% explained var.) vs PC2 (22.7% explained var.), with setosa, versicolor and virginica groups marked by ellipses and loading arrows for the four features]
(c). Run the codes to make the scatter plot, mark flowers using ellipsoids and comment on the feature
distribution.
#PART 3c
ggplot(iris, aes(x=Petal.Length, y=Petal.Width, colour=Species)) +
geom_point() +
stat_ellipse()
[Figure: scatter plot of Petal.Width vs Petal.Length coloured by Species (setosa, versicolor, virginica), each group marked with an ellipse]
Exercise 4. In this task, you are required to analyze the Animals dataset from the MASS package. This dataset contains brain weight (in grams) and body weight (in kilograms) for 28 different animal species. The three largest animals are dinosaurs, whose measurements are obviously the result of scientific modeling rather than precise measurements.
A scatter plot given below fails to describe any obvious relationship between brain weight and body weight
variables. You are required to apply appropriate power transformations to the variables to obtain more
interpretable plot and describe the obtained relationship. To this end, undertake the following tasks.
library(ggplot2)
library(MASS)
data(Animals)
qplot(brain, body, data = Animals)
[Figure: scatter plot of body against brain for the Animals data; no obvious relationship is visible on the raw scale]
Task-1. Check whether each of the variables has normal distribution. Your response should be based on an
appropriate statistical test as well as smoothed histogram plots.
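The calls producing the test output below are not shown in the document; a minimal sketch:
shapiro.test(Animals$body)
shapiro.test(Animals$brain)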
##
## Shapiro-Wilk normality test
##
## data: Animals$body
## W = 0.27831, p-value = 1.115e-10
##
## Shapiro-Wilk normality test
##
## data: Animals$brain
## W = 0.45173, p-value = 3.763e-09
# Using density plots and checking the difference between median and mean
plot(density(Animals$body))
[Figure: density.default(x = Animals$body), N = 28, Bandwidth = 164.1]
plot(density(Animals$brain))
[Figure: density.default(x = Animals$brain), N = 28, Bandwidth = 137.2]
hist(Animals$brain,30)
[Figure: Histogram of Animals$brain, 30 bins]
hist(Animals$body,30)
[Figure: Histogram of Animals$body, 30 bins]
# Through the density plots we can see a big difference between mean and median, which shows that the data are heavily skewed and not normally distributed.
Task-2. A power transformation of a variable X consists of raising X to the power lambda. Using an
appropriate statistical test and/or plot, find best lambda values needed for transforming each of the variables
requiring power transformation.
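The transformed variables plotted below (brain_sqrt, brain_cub, brain_log, and so on) are never defined in the document; a minimal sketch of the definitions they appear to assume, where square-root, cube-root and log transforms are inferred from the names:
brain_sqrt <- sqrt(Animals$brain)     # lambda = 1/2
brain_cub  <- Animals$brain^(1/3)     # lambda = 1/3 (cube root)
brain_log  <- log(Animals$brain)      # lambda = 0 (log)
body_sqrt  <- sqrt(Animals$body)
body_cub   <- Animals$body^(1/3)
body_log   <- log(Animals$body)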
plot(density(brain_sqrt))
[Figure: density.default(x = brain_sqrt), N = 28, Bandwidth = 5.46]
plot(density(brain_cub))
[Figure: density.default(x = brain_cub), N = 28, Bandwidth = 2.196]
plot(density(brain_log))
[Figure: density.default(x = brain_log), N = 28, Bandwidth = 1.03]
plot(density(body_sqrt))
[Figure: density.default(x = body_sqrt), N = 28, Bandwidth = 6.94]
plot(density(body_cub))
[Figure: density.default(x = body_cub), N = 28, Bandwidth = 2.196]
plot(density(body_log))
[Figure: density.default(x = body_log), N = 28, Bandwidth = 1.74]
# Our results show that the best transformation is the log. We will confirm this with a Box-Cox transformation.
animal_body=lm(body~.,data=Animals)
animal_brain=lm(brain~.,data=Animals)
boxcox(animal_body,plotit = TRUE)
[Figure: Box-Cox profile log-likelihood for animal_body over lambda in (-2, 2), with the 95% interval marked]
boxcox(animal_brain,plotit = TRUE)
[Figure: Box-Cox profile log-likelihood for animal_brain over lambda in (-2, 2), with the 95% interval marked]
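As a numerical cross-check of the two Box-Cox plots, the lambda maximising the profile log-likelihood can be extracted directly; a small sketch (these plotit = FALSE calls are an addition, not part of the original answer):
bc_body  <- boxcox(animal_body,  plotit = FALSE)
bc_brain <- boxcox(animal_brain, plotit = FALSE)
bc_body$x[which.max(bc_body$y)]    # lambda with maximum log-likelihood for body
bc_brain$x[which.max(bc_brain$y)]  # lambda with maximum log-likelihood for brain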
Task-3. Apply power transformation and verify whether transformed variables have a normal distribution
through statistical test as well as smoothed histogram plots.
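The code applying the transformation is not included in the document; since the test output below still refers to Animals$brain and Animals$body, the columns were presumably overwritten with their logs, along the lines of this assumed sketch:
Animals$brain <- log(Animals$brain)   # assumed in-place log transform
Animals$body  <- log(Animals$body)
shapiro.test(Animals$brain)
shapiro.test(Animals$body)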
##
## Shapiro-Wilk normality test
##
## data: Animals$brain
## W = 0.95787, p-value = 0.31
##
## Shapiro-Wilk normality test
##
## data: Animals$body
## W = 0.98465, p-value = 0.9433
plot(density(Animals$brain))
[Figure: density.default(x = Animals$brain) after the log transform, N = 28, Bandwidth = 1.03]
plot(density(Animals$body))
[Figure: density.default(x = Animals$body) after the log transform, N = 28, Bandwidth = 1.74]
hist(Animals$brain,5)
[Figure: Histogram of Animals$brain (log scale), 5 bins]
hist(Animals$body,30)
[Figure: Histogram of Animals$body (log scale), 30 bins]
Task-4. Create a scatter plot of the transformed data. Based on the visual inspection of the plot, provide
your interpretation of the relationship between brain weight and body weight variables. You may like to add
an appropriate smoothed line curve to your plot to help in interpretation.
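The code for the plot below is not included in the document; a minimal sketch that reproduces a log-log scatter with a smoothed trend line from the untransformed MASS data (the linear smoother is an assumption):
library(ggplot2)
library(MASS)
data(Animals)                                  # reload the untransformed data
ggplot(Animals, aes(x = log(body), y = log(brain))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +     # smoothed linear trend line
  xlab("Log(Body Weight)") + ylab("Log(Brain Weight)") +
  ggtitle("Plot of Body Wt. to Brain Wt.")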
[Figure: "Plot of Body Wt. to Brain Wt." -- Log(Brain Weight) against Log(Body Weight)]
# We have used a logarithmic transformation, which compresses the large values and stretches the small ones.
# There is a strong positive relationship between these weights, on the grounds that a large body might well need a large brain to control it.