
WEEK – 5

a) Find the correlation matrix and plot the correlations for the iris data set.

install.packages("corrplot") # If not already installed

library(corrplot)

# Calculate and plot the correlation matrix

cor_matrix <- cor(iris[, 1:4]) # Correlation for numeric columns

corrplot(cor_matrix, method = "circle", main = "Correlation Matrix of Iris Dataset")

b) Plot the correlation matrix of the iris data set and customize the plot to give an overview of the data.

corrplot(cor_matrix, method = "circle",

main = "Correlation Matrix of Iris Dataset",

tl.col = "black", tl.srt = 45)


c) Analysis of variance (ANOVA) when the data contain a categorical variable, illustrated on the iris data.

SOURCE CODE:

library(ggplot2)

# Scatter plot

ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) + geom_point()

# Box plot

ggplot(iris, aes(x = Species, y = Sepal.Length, fill = Species)) + geom_boxplot()


# Correlation heatmap

cor_matrix <- cor(iris[, 1:4])

cor_data <- as.data.frame(as.table(cor_matrix))

ggplot(cor_data, aes(Var1, Var2, fill = Freq)) + geom_tile() + scale_fill_gradient2()
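
The code above only visualizes the data; to actually run the analysis of variance named in part (c), a minimal sketch is given below, assuming Sepal.Length as the response and Species as the categorical factor.

# One-way ANOVA: does mean Sepal.Length differ across Species?
iris_aov <- aov(Sepal.Length ~ Species, data = iris)
summary(iris_aov)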


WEEK – 6

Create Relationship Model & get the Coefficients

# Reduced data points (3 points each)

x <- c(151, 174, 138)

y <- c(63, 81, 56)

# Apply the lm() function

relation <- lm(y ~ x)

# Print the model

print(relation)
Call:

lm(formula = y ~ x)

Coefficients:

(Intercept) x

-42.0787 0.7046

Get the Summary of the Relationship


# Reduced data points (3 points each)

x <- c(151, 174, 138)

y <- c(63, 81, 56)

# Apply the lm() function

relation <- lm(y ~ x)

# Print the summary of the model

print(summary(relation))
Call:

lm(formula = y ~ x)

Residuals:

1 2 3

-1.3180 0.4759 0.8420

Coefficients:

Estimate Std. Error t value Pr(>|t|)


(Intercept) -42.07874 9.83170 -4.28 0.1461

x 0.70461 0.06341 11.11 0.0571 .

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.635 on 1 degrees of freedom

Multiple R-squared: 0.992, Adjusted R-squared: 0.9839

F-statistic: 123.5 on 1 and 1 DF, p-value: 0.05714
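
The quantities printed in the summary can also be pulled out of the fitted object directly; a short sketch, assuming the same relation model from above, is:

# Extract coefficients and fit statistics from the fitted model object
coef(relation)                    # intercept and slope
summary(relation)$r.squared       # multiple R-squared
summary(relation)$coefficients    # full coefficient table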

Predict the weight of new persons


# Reduced data points (3 points each)

x <- c(151, 174, 138)

y <- c(63, 81, 56)

# Apply the lm() function

relation <- lm(y ~ x)

# Print the summary of the model

print(summary(relation))
Call:

lm(formula = y ~ x)

Residuals:

1 2 3

-1.3180 0.4759 0.8420

Coefficients:

Estimate Std. Error t value Pr(>|t|)

(Intercept) -42.07874 9.83170 -4.28 0.1461

x 0.70461 0.06341 11.11 0.0571 .

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.635 on 1 degrees of freedom

Multiple R-squared: 0.992, Adjusted R-squared: 0.9839

F-statistic: 123.5 on 1 and 1 DF, p-value: 0.05714


# Predict y values based on the model

predicted_y <- predict(relation, newdata = data.frame(x = c(160, 145, 170)))

# Print the predicted values

print(predicted_y)
1 2 3

70.65948 60.09027 77.70562
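
predict() can also return interval estimates around the fitted values; a hedged sketch using the same relation model and the same new heights is shown below (the 95% level is the default, not part of the original code).

# Predict with 95% confidence intervals around the fitted mean
predict(relation, newdata = data.frame(x = c(160, 145, 170)),
        interval = "confidence")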

Visualize the Regression Graphically


# Reduced data points (3 points each)

x <- c(151, 174, 138)

y <- c(63, 81, 56)

# Apply the lm() function

relation <- lm(y ~ x)

# Plot the data points

plot(x, y, main = "Height vs Weight Regression",
     xlab = "Height (cm)", ylab = "Weight (kg)",
     pch = 16, col = "blue")

# Add the regression line

abline(relation, col = "red")


WEEK – 7

# Example dataset

data(mtcars)

# Convert 'cyl' (cylinder) into a factor (categorical variable)

mtcars$cyl <- factor(mtcars$cyl)

# Apply logistic regression

mylogit <- glm(am ~ mpg + wt + cyl, data = mtcars, family = "binomial")

# Print the summary of the logistic regression model

summary(mylogit)
Call:

glm(formula = am ~ mpg + wt + cyl, family = "binomial", data = mtcars)

Coefficients:

Estimate Std. Error z value Pr(>|z|)

(Intercept) 23.92836 14.17738 1.688 0.0915 .

mpg -0.09851 0.35135 -0.280 0.7792

wt -8.17801 3.34965 -2.441 0.0146 *

cyl6 3.00979 2.51067 1.199 0.2306

cyl8 4.98194 3.50934 1.420 0.1557

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

Null deviance: 43.230 on 31 degrees of freedom

Residual deviance: 14.588 on 27 degrees of freedom

AIC: 24.588

Number of Fisher Scoring iterations: 7
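
The summary above only reports coefficients; to obtain class predictions from the fitted logistic model, one possible sketch is given below (the 0.5 cut-off is an assumption, not part of the original code).

# Predicted probabilities of am = 1 (manual transmission)
probs <- predict(mylogit, type = "response")
# Convert probabilities to class labels with a 0.5 threshold
pred_class <- ifelse(probs > 0.5, 1, 0)
# Compare predictions against the observed transmission type
table(Predicted = pred_class, Actual = mtcars$am)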


WEEK – 8

# Load the mtcars dataset

data(mtcars)

# Convert 'cyl' to a factor

mtcars$cyl <- factor(mtcars$cyl)

# Fit a linear regression model with 'cyl' as the predictor

model <- lm(mpg ~ cyl, data = mtcars)

# Display regression coefficients

coef(summary(model))
Estimate Std. Error t value Pr(>|t|)

(Intercept) 26.663636 0.9718008 27.437347 2.688358e-22

cyl6 -6.920779 1.5583482 -4.441099 1.194696e-04

cyl8 -11.563636 1.2986235 -8.904534 8.568209e-10

# Perform ANOVA on the model

anova(model)
Analysis of Variance Table

Response: mpg

Df Sum Sq Mean Sq F value Pr(>F)

cyl 2 824.78 412.39 39.697 4.979e-09 ***

Residuals 29 301.26 10.39

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
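
If pairwise comparisons between the cylinder groups are wanted after the ANOVA, Tukey's HSD test is one option; a minimal sketch that refits the model with aov() is:

# Pairwise comparisons of mean mpg between the cylinder groups
tukey_fit <- aov(mpg ~ cyl, data = mtcars)
TukeyHSD(tukey_fit)
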
WEEK – 9

Install the relevant packages

install.packages("rpart.plot")

install.packages("tree")

install.packages("ISLR")

install.packages("rattle")

library(tree)

library(ISLR)

library(rpart.plot)

library(rattle)

# Load data and clean NA values

Hitters <- na.omit(Hitters)

# Log transform Salary

Hitters$Salary <- log(Hitters$Salary)

# Plot transformed Salary

hist(Hitters$Salary, main = "Log-Transformed Salary", xlab = "Log(Salary)")


# Fit regression tree

tree.fit <- tree(Salary ~ Hits + Years, data = Hitters)

summary(tree.fit)
Regression tree:

tree(formula = Salary ~ Hits + Years, data = Hitters)

Number of terminal nodes: 7

Residual mean deviance: 0.002708 = 0.6933 / 256

Distribution of residuals:

Min. 1st Qu. Median Mean 3rd Qu. Max.

-0.2355000 -0.0258400 -0.0005869 0.0000000 0.0332000 0.2069000

# Plot the tree

plot(tree.fit, type = "uniform")

text(tree.fit, pretty = 0, cex = 0.8)


# Split data into training and testing sets
# createDataPartition() comes from the caret package
library(caret)

set.seed(123)

split <- createDataPartition(Hitters$Salary, p = 0.5, list = FALSE)

train <- Hitters[split, ]

test <- Hitters[-split, ]

# Train a tree model on the training data

trees <- tree(Salary ~ ., data = train)

plot(trees)

text(trees, pretty = 0)

# Cross-validation to prune the tree

cv.trees <- cv.tree(trees)


plot(cv.trees)

# Prune tree to best model

prune.trees <- prune.tree(trees, best = 4)

plot(prune.trees)

text(prune.trees, pretty = 0)

# Predict on test data using the pruned tree

yhat <- predict(prune.trees, test)

# Plot predicted vs actual salary

plot(yhat, test$Salary)
abline(0, 1)

# Calculate mean squared error

mean_squared_error <- mean((yhat - test$Salary)^2)

cat("Mean Squared Error:", mean_squared_error, "\n")


Mean Squared Error: 0.002714588
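
The rpart.plot and rattle packages installed at the start of this week are not used above; a sketch of an alternative tree drawing with rpart and fancyRpartPlot is shown below (an addition, assuming the same log-transformed Hitters data).

# Fit the same tree with rpart and draw it with fancyRpartPlot
library(rpart)
rpart.fit <- rpart(Salary ~ Hits + Years, data = Hitters)
fancyRpartPlot(rpart.fit, main = "Regression Tree for Log(Salary)")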
WEEK – 10

Clustering algorithms for unsupervised classification.

# Load the cluster library

library(cluster)

# Set random seed for reproducibility

set.seed(20)

# Apply K-means clustering on Petal.Length and Petal.Width (columns 3 and 4)

irisCluster <- kmeans(iris[, 3:4], centers = 3, nstart = 20)

# Output the clustering result

print(irisCluster)
K-means clustering with 3 clusters of sizes 52, 48, 50

Cluster means:

Petal.Length Petal.Width

1 4.269231 1.342308

2 5.595833 2.037500

3 1.462000 0.246000

Clustering vector:

[1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3

[46] 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1

[91] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2

[136] 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2

Within cluster sum of squares by cluster:

[1] 13.05769 16.29167 2.02200

(between_SS / total_SS = 94.3 %)

Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss"

[7] "size" "iter" "ifault"

# Convert cluster assignments to factor

irisCluster$cluster <- as.factor(irisCluster$cluster)

# Plot the clustering results

library(ggplot2)

ggplot(iris, aes(Petal.Length, Petal.Width, color = irisCluster$cluster)) +

geom_point() +

labs(title = "K-means Clustering on Iris Dataset",

x = "Petal Length", y = "Petal Width")

# Create a distance matrix from the mtcars dataset

d <- dist(as.matrix(mtcars))

# Apply hierarchical clustering

hc <- hclust(d)

# Plot the dendrogram

plot(hc, main = "Hierarchical Clustering of mtcars", xlab = "", sub = "")
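
Cluster memberships can be recovered from the dendrogram by cutting it at a chosen number of groups; a minimal sketch assuming 3 clusters is:

# Cut the dendrogram into 3 clusters and inspect the group sizes
groups <- cutree(hc, k = 3)
table(groups)
# Draw rectangles around the 3 clusters on the existing dendrogram
rect.hclust(hc, k = 3, border = "red")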


# Generate a synthetic dataset with two clusters

x <- rbind(cbind(rnorm(10, 0, 0.5), rnorm(10, 0, 0.5)),

cbind(rnorm(15, 5, 0.5), rnorm(15, 5, 0.5)))

# Apply PAM clustering with 2 clusters and plot the result

library(cluster)

clusplot(pam(x, 2))

# Add two columns of random noise (25 values each) to the dataset

x4 <- cbind(x, rnorm(25), rnorm(25))

# Apply PAM clustering with 2 clusters and plot the result

library(cluster)

clusplot(pam(x4, 2))
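
The quality of the PAM solution on the noisy data can be inspected with silhouette widths; a minimal sketch (an addition to the original exercise) is:

# Silhouette plot for the PAM clustering of the noisy data
pam_fit <- pam(x4, 2)
plot(silhouette(pam_fit), main = "Silhouette Plot for PAM (k = 2)")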
