Data Science Practical File
Data Science Practical File
ON
Data Science Using R
LAB
PMC 401
1
1. Write a program to implement Linear Regression on california_housing.csv
# 1. Linear regression on the California housing data.
install.packages("caret")

# Load necessary package
# (the original ran `library(caret)housing <- ...` and `set.seed(123)trainIndex <- ...`
#  as single lines — two statements fused together, which is a syntax error in R)
library(caret)

# Read the training data (Colab path as used in the original)
housing <- read.csv("/content/california_housing_train.csv")
head(housing)

# Reproducible 70/30 split, stratified on the target
set.seed(123)
trainIndex <- createDataPartition(housing$median_house_value, p = 0.7, list = FALSE)
trainData <- housing[trainIndex, ]
testData <- housing[-trainIndex, ]

# Linear model with every remaining column as a predictor
model <- lm(median_house_value ~ ., data = trainData)
summary(model)

# Held-out RMSE
predictions <- predict(model, newdata = testData)
actual <- testData$median_house_value
rmse <- sqrt(mean((actual - predictions)^2))
print(rmse)
Output :
A data.frame: 6 × 9
Call:
lm(formula = median_house_value ~ ., data = trainData)
Residuals:
Min 1Q Median 3Q Max
-562309 -43874 -11983 30091 762778
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.712e+06 8.363e+04 -44.386 < 2e-16 ***
longitude -4.409e+04 9.541e+02 -46.216 < 2e-16 ***
latitude -4.354e+04 9.011e+02 -48.321 < 2e-16 ***
housing_median_age 1.151e+03 5.760e+01 19.976 < 2e-16 ***
total_rooms -8.212e+00 1.033e+00 -7.948 2.06e-15 ***
2
total_bedrooms 1.298e+02 9.571e+00 13.557 < 2e-16 ***
population -3.685e+01 1.375e+00 -26.794 < 2e-16 ***
households 2.829e+01 1.054e+01 2.683 0.00732 **
median_income 4.018e+04 4.415e+02 91.019 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
A data.frame: 6 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
3
A data.frame: 6 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Call:
lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
data = iris)
Residuals:
Min 1Q Median 3Q Max
-0.82816 -0.21989 0.01875 0.19709 0.84570
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.85600 0.25078 7.401 9.85e-12 ***
Sepal.Width 0.65084 0.06665 9.765 < 2e-16 ***
Petal.Length 0.70913 0.05672 12.502 < 2e-16 ***
Petal.Width -0.55648 0.12755 -4.363 2.41e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
A data.frame: 6 × 6
Sepal.Length Sepal.Width Petal.Length Petal.Width Species predicted
[1] 0.3103268
4
# Load required libraries
library(caret)
library(rpart)

# Accuracy = fraction of test rows whose predicted species matches the truth.
# (`predictions` and `testData` come from the model-fitting steps of this
#  exercise, which are not part of this fragment.)
accuracy <- mean(predictions == testData$Species)
print(accuracy)
Output :
[1] 0.9333333
3. Write a program to implement a Decision Tree (regression, ANOVA method) on the iris dataset using the Type 2 visualization technique.
# 3. Regression decision tree (method = "anova") on the builtin iris data.
data(iris)
head(iris)

# Reproducible 70/30 split, stratified on the response
set.seed(123)
trainIndex <- createDataPartition(iris$Petal.Length, p = 0.7, list = FALSE)
trainData <- iris[trainIndex, ]
testData <- iris[-trainIndex, ]

# Fit an ANOVA (regression) tree predicting Petal.Length from all other columns
tree_model <- rpart(Petal.Length ~ ., data = trainData, method = "anova")
print(tree_model)

# Held-out RMSE.
# BUG FIX: the original computed sqrt(mean(actual - predictions)^2), which is
# |mean residual| — each residual must be squared BEFORE averaging.
predictions <- predict(tree_model, testData)
actual <- testData$Petal.Length
rmse <- sqrt(mean((actual - predictions)^2))
print(rmse)
5
rpart.plot(tree_model,type=2, fallen.leaves = TRUE)
Output :
A data.frame: 6 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Output :
[1] "Original Data:"
Age Marks
1 20 80
2 NA NA
3 56 45
4 NA 16
5 78 NA
[1] "Count of Missing Values:"
Age Marks
2 2
[1] "Data after handling missing values:"
Age Marks
1 20.00000 80
2 51.33333 47
3 56.00000 45
4 51.33333 16
5 78.00000 47
[1] "Most frequent Marks value (Mode): 47"
6. Write an R program to perform descriptive statistics (Mean, Median, Mode) on a given dataframe.
# 6. Descriptive statistics (mean, median, mode) on a small data frame.
# Sample Data
data <- data.frame(
  Age = c(25, 30, 30, 35, 40),
  Marks = c(80, 85, 85, 90, 85)
)

# Mean and median
# BUG FIX: mean_marks and median_marks were printed below but never computed.
mean_marks <- mean(data$Marks)
median_marks <- median(data$Marks)

# Mode (one-liner): most frequent value = first name of the frequency table
# sorted in descending count order (returned as a character string)
mode_marks <- names(sort(table(data$Marks), decreasing = TRUE))[1]

# Print results
cat("Mean (Marks):", mean_marks, "\n")
cat("Median (Marks):", median_marks, "\n")
cat("Mode (Marks):", mode_marks, "\n")
Output :
Mean (Marks): 85
Median (Marks): 85
Mode (Marks): 85
# Descriptive statistics (mean, median, sd, variance, range, mode) on a
# sample dataset of ages and marks.
# Sample dataset
data <- data.frame(
  Age = c(22, 25, 30, 35, 40, 45, 50),
  Marks = c(70, 85, 90, 75, 60, 95, 80)
)

# View data
print("Original Data:")
print(data)

# Mean
mean_age <- mean(data$Age)
mean_marks <- mean(data$Marks)

# Median
median_age <- median(data$Age)
median_marks <- median(data$Marks)

# Standard deviation
sd_age <- sd(data$Age)
sd_marks <- sd(data$Marks)

# Variance
var_age <- var(data$Age)
var_marks <- var(data$Marks)

# Range
# BUG FIX: min_age and max_age were printed below but never computed.
min_age <- min(data$Age)
max_age <- max(data$Age)

# Mode: most frequent value. All values here are distinct, so the stable sort
# of the (ascending-named) frequency table yields the smallest value — this
# matches the printed output (Mode Age: 22, Mode Marks: 60).
# BUG FIX: mode_age and mode_marks were printed below but never computed.
mode_age <- as.numeric(names(sort(table(data$Age), decreasing = TRUE))[1])
mode_marks <- as.numeric(names(sort(table(data$Marks), decreasing = TRUE))[1])

# Output
cat("Mean Age:", mean_age, "\n")
cat("Mean Marks:", mean_marks, "\n")
cat("Median Age:", median_age, "\n")
cat("Median Marks:", median_marks, "\n")
cat("Standard Deviation (Age):", sd_age, "\n")
cat("Standard Deviation (Marks):", sd_marks, "\n")
cat("Variance (Age):", var_age, "\n")
cat("Variance (Marks):", var_marks, "\n")
cat("Min Age:", min_age, " | Max Age:", max_age, "\n")
cat("Mode Age:", mode_age, "\n")
cat("Mode Marks:", mode_marks, "\n")
Output :
[1] "Original Data:"
Age Marks
1 22 70
2 25 85
3 30 90
4 35 75
5 40 60
6 45 95
7 50 80
[1] "Summary:"
Age Marks
Min. :22.00 Min. :60.00
1st Qu.:27.50 1st Qu.:72.50
Median :35.00 Median :80.00
Mean :35.29 Mean :79.29
3rd Qu.:42.50 3rd Qu.:87.50
Max. :50.00 Max. :95.00
Mean Age: 35.28571
Mean Marks: 79.28571
9
Median Age: 35
Median Marks: 80
Standard Deviation (Age): 10.35558
Standard Deviation (Marks): 12.05148
Variance (Age): 107.2381
Variance (Marks): 145.2381
Min Age: 22 | Max Age: 50
Mode Age: 22
Mode Marks: 60
# Draw a histogram of the Marks column with lavender bars.
# (`data` is the data frame built earlier in this exercise.)
hist(
  x = data$Marks,
  main = "Histogram of Marks",
  xlab = "Marks",
  col = "lavender"
)
Output :
10
9. Write a program to implement Naïve Bayes on iris.csv
11
# 9. Naive Bayes classifier on the Kaggle iris CSV (column names end in *Cm).
# BUG FIX: naiveBayes() comes from e1071, which was never loaded here.
library(e1071)

# Train Naive Bayes model on the four numeric features
nb_model <- naiveBayes(Species ~ SepalLengthCm + SepalWidthCm + PetalLengthCm + PetalWidthCm,
                       data = train_data)

# Predict on the held-out rows
# BUG FIX: this step was missing, so nb_predictions below was never defined.
nb_predictions <- predict(nb_model, test_data)

# Confusion matrix
conf_matrix_nb <- table(Predicted = nb_predictions, Actual = test_data$Species)
print("Confusion Matrix:")
print(conf_matrix_nb)

# Accuracy = trace of the confusion matrix / total number of predictions
accuracy <- sum(diag(conf_matrix_nb)) / sum(conf_matrix_nb)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
Output :
Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
12
# Run DBSCAN (radius eps = 0.5, at least minPts = 3 neighbours per core point)
# on the scaled numeric columns prepared earlier in this exercise.
set.seed(123)
dbscan_result <- dbscan(numeric_data_scaled, eps = 0.5, minPts = 3)

# Show the cluster assignment summary (cluster 0 = noise points)
print(dbscan_result)
Output :
Loading required package: dbscan
as.dendrogram
0 1 2 3 4 5 6 7
18 44 73 3 3 3 3 3
11. Write a program to implement k-Nearest Neighbors (KNN), a supervised algorithm using Euclidean distance.
# 11. k-nearest neighbours (k = 3, Euclidean distance) on the Kaggle iris CSV.
# BUG FIXES: the original assigned a column to iris_data before the object
# existed, referenced the builtin `iris` instead of the CSV data, and never
# loaded the `class` package that provides knn().
library(class)

# Read the Kaggle iris CSV — column names are Id, SepalLengthCm, ... , Species
# (see the colnames() output below). TODO confirm the exact file path.
iris_data <- read.csv("/content/Iris.csv")
print(colnames(iris_data))

iris_data$Species <- as.factor(iris_data$Species)

# Reproducible 70/30 split by random row sampling
set.seed(123)
train_index <- sample(seq_len(nrow(iris_data)), size = 0.7 * nrow(iris_data))
train_data <- iris_data[train_index, ]
test_data <- iris_data[-train_index, ]

# knn() takes feature matrices plus a vector of training labels
predictor_cols <- c("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm")
train_features <- train_data[, predictor_cols]
test_features <- test_data[, predictor_cols]
train_labels <- train_data$Species
test_labels <- test_data$Species

knn_pred <- knn(train = train_features, test = test_features, cl = train_labels, k = 3)

# Confusion matrix and overall accuracy
conf_matrix <- table(Predicted = knn_pred, Actual = test_labels)
print(conf_matrix)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(accuracy)
Output :
[1] "Id" "SepalLengthCm" "SepalWidthCm" "PetalLengthCm"
[5] "PetalWidthCm" "Species"
Actual
Predicted setosa versicolor virginica
setosa 14 0 0
versicolor 0 17 0
virginica 0 1 13
[1] 0.9777778
14
# Attach each row's k-means cluster id (1..k) to the original data frame.
# (kmeans_result is produced by the kmeans() fit earlier in this exercise.)
california_data$Cluster <- kmeans_result$cluster
1 2 3
8943 1394 6663
# Load the California housing training CSV (Google Colab path)
data <- read.csv("/content/california_housing_train.csv")
15
# Select features (numeric predictors)
# NOTE(review): the target median_house_value is excluded — presumably these
# feed an unsupervised step (k-means); confirm against the call that consumes
# `features`.
features <- c("longitude", "latitude", "housing_median_age", "total_rooms",
"total_bedrooms", "population", "households", "median_income")
16
kmeans_result <- kmeans(iris_features, centers = 3, nstart = 25)
17
# Split rows using a previously drawn index vector (sample_index)
train_data <- data[sample_index, ]
test_data <- data[-sample_index, ]
# Evaluate model: RMSE of predictions against the actual Total_Purchase, and
# R^2 as the squared Pearson correlation (predictions come from the fit above)
rmse <- sqrt(mean((test_data$Total_Purchase - predictions)^2))
r2 <- cor(test_data$Total_Purchase, predictions)^2
Call:
lm(formula = Total_Purchase ~ Years, data = train_data)
Residuals:
Min 1Q Median 3Q Max
-9954.8 -1576.0 14.4 1689.0 6824.3
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9826.35 390.69 25.151 <2e-16 ***
Years 43.35 71.85 0.603 0.546
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
18
# Confusion matrix of predicted species against the true iris labels
conf <- table(Predicted = predicted, Actual = iris$Species)
print(conf)

# Overall accuracy: correct predictions divided by total predictions.
# (The original wrote sum(diag(conf)/sum(conf)), which is algebraically the
#  same because the divisor is a scalar.)
accuracy <- sum(diag(conf)) / sum(conf)
cat("Accuracy", accuracy)
OUTPUT:
Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
A data.frame: 6 × 5
Coefficients:
(Intercept) Sepal.Length Sepal.Width Petal.Length Petal.Width
versicolor 18.69037 -5.458424 -8.707401 14.24477 -3.097684
virginica -23.83628 -7.923634 -15.370769 23.65978 15.135301
Std. Errors:
(Intercept) Sepal.Length Sepal.Width Petal.Length Petal.Width
versicolor 34.97116 89.89215 157.0415 60.19170 45.48852
virginica 35.76649 89.91153 157.1196 60.46753 45.93406
18. Write an R program to perform tokenization on a given text using the following packages: tokenizers, tidytext, quanteda, and text2vec.
19
# 18. Tokenize one sentence using four different packages.
# BUG FIX: the original loaded only tibble; the packages actually used below
# (tokenizers, tidytext plus dplyr for %>%, quanteda, text2vec) were missing.
library(tokenizers)
library(tidytext)
library(dplyr)
library(quanteda)
library(text2vec)
library(tibble)

text <- "Tokenization is the process of breaking text into tokens."

# Using tokenizers
tokenizer <- tokenize_words(text)
print("Tokens using tokenizers:")
print(tokenizer)

# Using tidytext: one token per row via unnest_tokens()
text_df <- tibble(line = 1, text = text)
tokens_tidytext <- text_df %>% unnest_tokens(word, text)
print("Tokens using tidytext:")
print(tokens_tidytext$word)

# Using quanteda
tokens_quanteda <- tokens(text, what = "word")
print("Tokens using quanteda:")
print(as.list(tokens_quanteda)[[1]])

# Using text2vec: word_tokenizer() returns a list of token vectors, one per
# input document.
# BUG FIX: the original called iterators::iter_next(), which does not exist in
# the iterators package (its function is nextElem()); tokenizing directly with
# word_tokenizer() avoids the iterator machinery entirely.
tokens_text2vec <- word_tokenizer(text)[[1]]
print("Tokens using text2vec:")
print(tokens_text2vec)
19. Write an R program to perform stemming on a list of words using the following libraries: Snowballc, Quanteda, and Tidytext.
# Load libraries
library(SnowballC)
library(quanteda)
library(tidytext)
library(dplyr)
library(tibble)
# SVM classification of iris species with a linear kernel, evaluated on a
# stratified 20% hold-out set.
install.packages(c("e1071", "caret"))
library(e1071)
library(caret)

# Builtin iris data; Species is already a factor — the conversion is a
# harmless no-op kept for safety.
data(iris)
iris_data <- iris
iris_data$Species <- as.factor(iris_data$Species)

# Reproducible, class-stratified 80/20 split
set.seed(123)
split_idx <- createDataPartition(iris_data$Species, p = 0.8, list = FALSE)
train_data <- iris_data[split_idx, ]
test_data <- iris_data[-split_idx, ]

# Fit a linear-kernel SVM and score the hold-out rows
svm_model <- svm(Species ~ ., data = train_data, kernel = "linear")
predictions <- predict(svm_model, test_data)

# Confusion matrix and accuracy (trace / total)
conf_matrix <- table(Predicted = predictions, Actual = test_data$Species)
print(conf_matrix)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(accuracy)
OUTPUT:
Actual
Predicted setosa versicolor virginica
setosa 10 0 0
versicolor 0 10 1
virginica 0 0 9
[1] 0.9666667
# SVM regression (eps-regression, radial kernel by default) on the California
# housing data.
install.packages(c("e1071","caret"))
library(caret)
library(e1071)

housing <- read.csv("/content/sample_data/california_housing_train.csv")
head(housing)

# Reproducible 80/20 split, stratified on the target
set.seed(123)
trainIndex <- createDataPartition(housing$median_house_value, p = 0.8, list = FALSE)
trainData <- housing[trainIndex, ]
testData <- housing[-trainIndex, ]

model <- svm(median_house_value ~ ., data = trainData)
summary(model)

predicted <- predict(model, testData)
actual <- testData$median_house_value

# BUG FIX: the original computed mse <- sqrt(mean(predicted - actual)^2),
# which is |mean error|, then rmse <- sqrt(mse) — neither is MSE/RMSE.
# MSE averages the SQUARED residuals; RMSE is its square root.
mse <- mean((predicted - actual)^2)
rmse <- sqrt(mse)
print(mse)
print(rmse)
OUTPUT:
A data.frame: 6 × 9
Call:
svm(formula = median_house_value ~ ., data = trainData)
Parameters:
SVM-Type: eps-regression
SVM-Kernel: radial
cost: 1
gamma: 0.125
epsilon: 0.1
[1] 7433.255
[1] 86.21633
# Lemmatize a list of words with textstem's dictionary-based lemmatizer
# (e.g. "better" -> "good", "studies" -> "study").
install.packages("textstem")
library(textstem)

input_words <- c("playing", "played", "happily", "better", "studies", "running", "flies")
lemmas <- lemmatize_words(input_words)

# Show the originals next to their lemmas
cat("Original Words:\n")
print(input_words)
cat("\nLemmatized Words:\n")
print(lemmas)
OUTPUT:
also installing the dependencies ‘NLP’, ‘zoo’, ‘dtt’, ‘sylly.en’, ‘sylly’, ‘syuzhet’, ‘english’, ‘mgsub’,
‘qdapRegex’, ‘slam’, ‘koRpus.lang.en’, ‘hunspell’, ‘koRpus’, ‘lexicon’, ‘textclean’, ‘textshape’
available.koRpus.lang()
tokens, types
Original Words:
[1] "playing" "played" "happily" "better" "studies" "running" "flies"
22
Lemmatized Words:
[1] "play" "play" "happily" "good" "study" "run" "fly"
24. Write an R program to implement different types of stemmers on a set of words using the Porter Stemmer and the SnowballC Stemmer.
# Porter-style stemming via SnowballC: wordStem() with language = "en" uses
# the English Snowball stemmer (derived from the Porter algorithm), producing
# stems like "easili" and "fli" for "easily" and "flies".
install.packages("SnowballC")
library(SnowballC)

word_list <- c("running", "runs", "easily", "fairly", "happily", "flying", "flies")
porter_stems <- wordStem(word_list, language = "en")

# Display original words alongside their stems
cat("Porter Stemmer (SnowballC default):\n")
print(data.frame(Original = word_list, Stemmed = porter_stems))
OUTPUT:
Porter Stemmer (SnowballC default):
Original Stemmed
1 running run
2 runs run
3 easily easili
4 fairly fair
5 happily happili
6 flying fli
7 flies fli
23