0% found this document useful (0 votes)
14 views9 pages

R Code

The document provides a comprehensive guide on data visualization techniques using R, including importing data, exploring quantitative and qualitative data, and creating various plots such as histograms, scatterplots, and bar charts. It also covers descriptive statistics, data subsetting, and Poisson distribution analysis. Additionally, it includes exercises and examples for practical application of the concepts discussed.

Uploaded by

harrypoter
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
14 views9 pages

R Code

The document provides a comprehensive guide on data visualization techniques using R, including importing data, exploring quantitative and qualitative data, and creating various plots such as histograms, scatterplots, and bar charts. It also covers descriptive statistics, data subsetting, and Poisson distribution analysis. Additionally, it includes exercises and examples for practical application of the concepts discussed.

Uploaded by

harrypoter
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 9

########################################################################

# Data Visualization - 01
########################################################################

# NOTE: the original rm(list = ls()) call was dropped on purpose. It only
# removes objects from the global environment (attached packages and options
# stay), so restart the R session instead when a clean slate is needed.

# Importing data from a csv file
# Dataset source:
# https://www.kaggle.com/datasets/spscientist/students-performance-in-exams?resource=download

help("read.csv")

# Insert the address of the "Data.csv" file here.
# The path must be ONE unbroken string: a line break inside the quotes becomes
# part of the file name and read.csv() then cannot find the file. paste0() is
# used only to keep the source lines short.
# Example of another location: "C:/Users/DIM/Downloads/Data.csv"
data_path <- paste0(
  "F:/Msc(Kealniya)/1st Semi/Statistics_for_Data_Science _MDAN 51163/",
  "Lecturer 01/Data.csv"
)

# Read the file once and keep the result. Calling read.csv() without an
# assignment only prints the data frame; `<-` is the idiomatic assignment
# operator (`=` also works at top level but is discouraged).
data <- read.csv(data_path)

####### Data exploration

# First rows of the data frame
head(data)

# Data structure: column names, types and a preview of the values
str(data)

########################################################################
# Quantitative data
########################################################################

# Print the raw math scores
data$math_score

# Plot the histogram of math_score
help("hist")
hist(data$math_score)
hist(data$reading_score)

# Same histogram with a title and axis labels
hist(data$math_score,
     main = "Histogram of Math Score",
     xlab = "Math Score", ylab = "Frequency")

# Frequency polygon of the math_score
# install.packages("ggplot2")
library(ggplot2)

help(ggplot)

# Refer to the column by its bare name inside aes(): ggplot2 looks the name up
# in `data`. The original also drew this plot with aes(data$math_score), which
# bypasses that lookup and is discouraged by ggplot2, so only the correct form
# is kept.
ggplot(data, aes(math_score)) +
  geom_freqpoly(bins = 10)

# Density plot of math_score
density(data$math_score)

plot(density(data$math_score), main = "Density of Math Score",
     xlab = "Math Score", ylab = "Density")

# Scatterplot of math_score vs writing scores
plot(data$math_score, data$writing_score,
     main = "Scatterplot",
     xlab = "Math score", ylab = "Writing score", pch = 19)

# Line plot of the same two variables (type = "o" overplots points and lines).
# The axis labels now name the plotted columns; the original said
# "Height"/"weight", which belong to the `women` example further down.
plot(data$math_score, data$writing_score, type = "o", main = "Line plot",
     xlab = "Math score", ylab = "Writing score", pch = 19)

help(grid)

########################################################################
# Qualitative data
########################################################################

# Extract group data
Race_Group <- data$race
# Frequency table of group data
Group <- table(Race_Group)

# Pie chart
help(pie)
pie(Group, main = "Pie Chart")

# Basic barplot straight from the frequency table
help("ggplot")
# ggplot() expects a data frame and draws nothing without a geom layer, so the
# table is converted first (as.data.frame() on a table yields one column named
# after the tabulated variable plus a `Freq` column) and a bar layer is added.
group_freq <- as.data.frame(Group)
ggplot(data = group_freq, aes(x = Race_Group, y = Freq)) +
  geom_bar(stat = "identity")

# Constructing a data frame
help("data.frame")

table(Race_Group)

# Short labels and counts typed in by hand.
# NOTE(review): the counts presumably come from the table(Race_Group) output
# above -- verify them against your own copy of the data.
group_label <- c("A", "B", "C", "D", "E") # Group names
count <- c(89, 190, 319, 262, 140)        # count of each group

group_count <- data.frame(group_label, count)

# Basic barplot
ggplot(data = group_count, aes(x = group_label, y = count)) +
  geom_bar(stat = "identity")

help("geom_bar")

# Pareto Chart
# install.packages("qcc")
library(qcc)

# Name the counts explicitly so every bar carries its group label instead of
# relying on whatever default labels an unnamed vector receives.
pareto.chart(setNames(count, group_label))

# Keep only the gender and race columns, then cross-tabulate them
x <- data[c("gender", "race")]

table(x)

# Long-format summary typed in by hand: one row per (group, gender) pair.
# The first five frequencies belong to "F", the last five to "M".
grp <- rep(c("A", "B", "C", "D", "E"), times = 2)
gen <- rep(c("F", "M"), each = 5)
fre <- c(36, 104, 180, 129, 69, 53, 86, 139, 133, 71)

df <- data.frame(grp, gen, fre)

# Bar charts with multiple groups: both share the same data and aesthetics,
# so that part is built once and reused.
base_plot <- ggplot(data = df, aes(x = grp, y = fre, fill = gen))

# Default position: the two genders are stacked within each group
base_plot +
  geom_bar(stat = "identity")

# position_dodge(): the two genders are drawn side by side
base_plot +
  geom_bar(stat = "identity", position = position_dodge())

# Sorting data based on the ascending order of math_score
# (order() returns the row indices that put math_score in increasing order)
data_sorted <- data[order(data$math_score), ]

# Sorting data based on the descending order of math_score
data_sorted_descending <- data[order(data$math_score, decreasing = TRUE), ]

# Backward-compatible alias: the original script stored the DESCENDING sort
# under this misleading name. Prefer data_sorted_descending in new code.
data_sorted_ascending <- data_sorted_descending

#########################################################################
######

# visualizing multivariate data:

# "women" dataset is available in R:
# https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/women.html
help(women)

# Keep the built-in data set under its own name. Assigning it to `data` (as
# the original did) overwrites the students' data that the later subsetting
# examples filter by gender and math_score.
women_data <- women

h <- women_data$height
w <- women_data$weight

# Scatterplot (plotting a two-column data frame puts column 1 on the x-axis)
plot(women_data, xlab = "Height (in)", ylab = "Weight (lb)",
     main = "women data: American women aged 30-39")
grid(nx = 10, ny = 10)

# Line plot
plot(h, w, type = "o", main = "Line plot",
     xlab = "Height", ylab = "weight", pch = 19)
help(grid)
#########################################################################
######

## Group Activity: Perform a descriptive analysis for your dataset
## and interpret your results.

# Group/Room  Dataset     Data Description
# 1           airquality  https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/airquality.html
# 2           attenu      https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/attenu.html
# 3           freeny      https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/freeny.html
# 4           iris        https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/iris.html
# 5           quakes      https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/quakes.html
# 6           rock        https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/rock.html
# 7           stackloss   https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/stackloss.html
# 8           swiss       https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/swiss.html

# Check the datasets available in R:
# https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html

#########################################################################
######

## Subsetting data
# These examples filter on the `gender` and `math_score` columns, so `data`
# must be the students' data frame here.

# Extract data between entry 30 and entry 55 for the first 3 variables
data_subset1 <- data[30:55, 1:3]
# Extract data between entry 30 and entry 55 for the first, second, and
# fifth variables
data_subset1_1 <- data[30:55, c(1, 2, 5)]

# Extract data of the male students:
help("subset")
data_subset2 <- subset(data, gender == "male")

# Extract data of the male students who scored more than 75 for
# mathematics:
data_subset3 <- subset(data, gender == "male" & math_score > 75)

# Extract data of the male students or students who scored more than 75
# for mathematics:
data_subset3_1 <- subset(data, gender == "male" | math_score > 75)
# A row is kept when it matches any of these scenarios:
# scenario 1: gender == "male"
# scenario 2: math_score > 75
# scenario 3: gender == "male" and math_score > 75

# Extract scores of the male students who scored more than 75 for
# mathematics:
data_subset4 <- subset(data, gender == "male" & math_score > 75,
                       select = c(math_score, reading_score, writing_score))

# Alternative method: filter the rows first, then pick the score columns
data_subset3 <- subset(data, gender == "male" & math_score > 75)
# data_subset4_1 <- subset(data_subset3,
#                          select = c(math_score, reading_score, writing_score))
# Columns are selected by name rather than by position (the original used
# 2:4), so the result is the three score columns whatever the column order
# of the csv file is.
score_cols <- c("math_score", "reading_score", "writing_score")
data_subset4_2 <- data_subset3[, score_cols]

# To get a random sample from the whole data set
set.seed(123) # To generate the same random sample
# Each row is kept independently with probability 0.4, so the sample holds
# roughly (not exactly) 40% of the rows.
# `data` replaces the original `DATA`, which is not defined at this point in
# the script (R names are case-sensitive).
sample_40 <- sample(c(TRUE, FALSE), nrow(data), replace = TRUE,
                    prob = c(0.4, 0.6))
SAMPLE <- data[sample_40, ]

########################################################################
# Descriptive Statistics 02
########################################################################

# Create a univariate dataset
data <- c(4, 10, 5, 8, 7.5, 8, 5, 16.5, 1, 7.8, 8, 10, 11, 18, 15, 9, 14,
          23, 21, 28)

# Mean
help(mean)
mean(data)
Mean_data <- mean(data)

# Median
help(median)
median(data)
med <- median(data)
MED <- med # same value under a second name, kept from the original script

# Mode
# Base R has no function for the statistical mode, so read it off the
# frequency table: the mode is every value with the highest count.
freq_table <- table(data)
freq_table
Mode_data <- as.numeric(
  names(freq_table)[as.vector(freq_table) == max(freq_table)]
)
Mode_data

# Range
max(data) # Maximum
min(data) # Minimum
# Calculating range
Range <- max(data) - min(data)

# Standard deviation
help(sd)
sd(data)
# Variance
var(data)

# Coefficient of Variation
# Coefficient of Variation = std dev / mean * 100
cv <- sd(data) / mean(data) * 100

# Inter Quartile Range
help("quantile")
Q1 <- quantile(data, c(.25))

quantile(data, c(.25, .75))

IQR(data)

# Summary statistics: summary() prints the minimum, quartiles, median and
# maximum together with the mean (six values, not strictly five).
summary(data)

# Boxplot
help("boxplot")
boxplot(data)

# Histogram (uses the same `data` vector defined at the top of this section)
hist(data)
help(hist)

# Density plot
density(data)
plot(density(data))

# Skewness & kurtosis
# Both functions are called after attaching the "moments" package below.
# install.packages("moments")
library("moments")

help("skewness")
skewness(x = data)
kurtosis(x = data)

## Data set with missing value

# Build a numeric vector whose last entry is missing (NA).
x <- c(12, 7, 3, 4.2, 18, 2, 54, -21, 8, -5, NA)

# Mean computed with the NA still in place: the result is NA.
# Wrapping the assignment in parentheses stores the value and shows it.
help("mean")
(mean_WithMissing <- mean(x))

# Mean computed after dropping the missing entries via na.rm = TRUE
(mean_WithOutMissing <- mean(x, na.rm = TRUE))

#########################################################################
######
## Perform a descriptive analysis for the "iris" dataset and interpret
## your results.

# iris:
# https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/iris.html

help(iris)

# Work on a copy of the built-in data set
DATA <- iris

########################################################################
# Poisson Distribution 04

# Content: Density, distribution function, and random generation
# for the Poisson distribution with parameter lambda (= mu).

# Density function: dpois(x, lambda, log = FALSE)
# Distribution function (cumulative probability):
#   ppois(q, lambda, lower.tail = TRUE, log.p = FALSE)
# Random generation: rpois(n, lambda)

########################################################################

# NOTE: the original rm(list = ls()) call was dropped on purpose; restart the
# R session instead when a clean environment is needed.

# Example:

# RDA investigated that there are twelve cars crossing a bridge per
# minute on average.

# (a) Find the probability of having
#     (i)   no cars
#     (ii)  three or more cars
#     (iii) less than 17 cars
#     crossing the bridge in a particular minute.

# (b) Plot the probability distribution of No of cars crossing the bridge.

# X = The number of cars crossing the bridge in a particular minute
# x = 0, 1, 2, 3, ...

# Poisson distribution with parameter lambda (= mu).
# X follows a Poisson(lambda = 12) distribution

help(dpois)

# (i) no cars: P(X = 0)
x <- 0
P_X_0 <- dpois(x, lambda = 12)
P_X_0

sprintf("P(X = 0) = %s", round(P_X_0, digits = 6))
# The message is kept on one line inside the quotes: a line break inside a
# string literal would be printed as part of the message.
sprintf("The probability of no cars crossing the bridge in a minute is %s",
        round(P_X_0, digits = 6))

# (ii) three or more cars: P(X >= 3) = 1 - P(X <= 2)
P_X_geq_3 <- 1 - ppois(2, lambda = 12) # lower tail
P_X_geq_3

# Same probability from the upper tail. With lower.tail = FALSE, ppois(q, ...)
# returns P(X > q), so P(X >= 3) = P(X > 2) needs q = 2 (the original used
# q = 3, which gives P(X >= 4)).
P_X_GEQ_3 <- ppois(2, lambda = 12, lower.tail = FALSE) # upper tail
P_X_GEQ_3
sprintf("P(X >= 3) = %s", round(P_X_GEQ_3, digits = 4))

# (iii) less than 17 cars: P(X < 17) = P(X <= 16)
P_X_lt_17 <- ppois(16, lambda = 12) # lower tail
P_X_lt_17
sprintf("P(X < 17) = %s", round(P_X_lt_17, digits = 4))

# For comparison only: P(X <= 17) also counts X = 17, so it is NOT the answer
# to "less than 17". The variable is kept because the original script
# defined it.
P_X_leq_17 <- ppois(17, lambda = 12)

# (b) The Poisson probability distribution plot

x <- 0:20

# Stack the two charts in one window. par() returns the previous settings,
# which are restored at the end so later plots are not affected.
old_par <- par(mfrow = c(2, 1))

# Probability function P(X = x) for each x (labelled "pdf" on the plot).
# Axis labels are single-line strings: a line break inside the quotes would
# be drawn as part of the label.
barplot(dpois(x, lambda = 12), col = "red", names.arg = x,
        xlab = "X = No of cars crossing the bridge",
        ylab = "pdf: P(X = x)",
        main = "Poisson (mu = 12) pdf")

# Cumulative distribution function (cdf): P(X <= x) for each x
barplot(ppois(x, lambda = 12), col = "red", names.arg = x,
        xlab = "X = No of cars crossing the bridge",
        ylab = "cdf:P(X <= x)",
        main = "Poisson (mu = 12) cdf")

# Back to the layout that was active before this section
par(old_par)

########################################################################
# Extra: Random generation for a Poisson distribution with parameter
# lambda (= mu).

# rpois(n, lambda)

# Create a data set of 30 samples from a Poisson distribution with
# lambda = 6.23
set.seed(2) # fix the seed so the same sample is drawn on every run
rpois(n = 30, lambda = 6.23)

########################################################################
# Exercise:

# The number of accidents that occur at a busy intersection is Poisson
# distributed with a mean of 3.5 per week. Find the probability of the
# following events:
# (a) Less than three accidents in a week
# (b) Five or more accidents in a week
# (c) No accidents today

You might also like