BDA Lab Manual (12 Weeks)
BDA Lab Manual (12 Weeks)
III-B.Tech II-Semester L T P C
Course Code: A1DS604PC 0 0 3 1.5
COURSE OBJECTIVES:
COURSE OUTCOMES:
LIST OF EXPERIMENTS:
1. Study of R Programming.
2. Hypothesis Test using R
3. K-means Clustering using R
4. Naïve Bayesian Classifier
5. Implementation of Linear Regression
6. Implement Logistic Regression
7. Time-series Analysis
8. Association Rules using R.
9. Data Analysis-Visualization using R.
10. Map Reduce using Hadoop
11. In-database Analytics
12. Implementation of Queries using Mongo DB
WEEK - 1 STUDY OF R PROGRAMMING
1. Study of R Programming
#program:
# R Programming Basics
# Plotting
# Scatter plot of y against x, drawn as blue points with a title and axis labels.
# `data` must already exist with columns x and y (it is defined outside this excerpt).
ggplot(data, aes(x = x, y = y)) +
  geom_point(color = "blue") +
  labs(
    title = "Simple Scatter Plot",
    x = "X-axis",
    y = "Y-axis"
  )
#program:
# Run t.test() on the two samples with its default settings and keep the result.
# group1 and group2 are created outside this excerpt.
t_test_result <- t.test(x = group1, y = group2)
#program:
# Print a heading, then the `cluster` element of the fitted k-means result
# (kmeans_result is created outside this excerpt).
cat("\nCluster Assignments:\n", sep = "")
print(kmeans_result[["cluster"]])
Cluster Assignments:
> print(kmeans_result$cluster)
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[37] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 3 2 3 2 3 2 2 2 2 2 2 3 2 2 2 2 2 2
[73] 2 2 3 3 3 3 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3
[109] 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3
[145] 3 3 2 3 3 2
#program:
# Load the e1071 package; run the install line once if it is not yet installed
# install.packages("e1071")
library("e1071")
# Accuracy = diagonal total of conf_matrix divided by its grand total
# (conf_matrix is built outside this excerpt), reported to two decimals
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy, digits = 2)))
#Output:
#program:
#Output:
Call:
lm(formula = y ~ x, data = data)
Residuals:
Min 1Q Median 3Q Max
-10.0560 -3.1111 -0.4097 3.3295 10.7983
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.34508 1.34274 0.257 0.798
x 1.99321 0.04583 43.494 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#program:
# Accuracy = diagonal total of conf_matrix divided by its grand total
# (conf_matrix is built outside this excerpt), reported to two decimals
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy, digits = 2)))
#Output:
Call:
glm(formula = IsVersicolor ~ Sepal.Length + Sepal.Width + Petal.Length +
Petal.Width, family = "binomial", data = train_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 6.2596 2.8631 2.186 0.0288 *
Sepal.Length -0.5910 0.7564 -0.781 0.4346
Sepal.Width -2.0960 0.8222 -2.549 0.0108 *
Petal.Length 1.6183 0.8229 1.967 0.0492 *
Petal.Width -3.0218 1.4200 -2.128 0.0333 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
binary_predictions 0 1
0 24 10
1 3 8
7. Time-series Analysis
#program
#output
> library(forecast)
> library(tseries)
> library(xts)
>
> # Load the AirPassengers dataset
> data("AirPassengers")
>
> # Convert to time series object
> ts_data <- ts(AirPassengers, start = c(1949, 1), frequency = 12)
>
> # Display the first few rows of the dataset
> print(head(ts_data))
Jan Feb Mar Apr May Jun
1949 112 118 132 129 121 135
>
> # Plot the time series data
> plot(ts_data, main="AirPassengers Data", ylab="Number of Passengers",
xlab="Year")
>
> # Decompose the time series
> decomposed <- decompose(ts_data)
> plot(decomposed)
>
> # Perform the Augmented Dickey-Fuller test
> adf_test <- adf.test(ts_data)
> print(adf_test)
data: ts_data
Dickey-Fuller = -7.3186, Lag order = 5, p-value = 0.01
alternative hypothesis: stationary
>
> # Difference the series if necessary
> diff_ts_data <- diff(ts_data)
> plot(diff_ts_data, main="Differenced AirPassengers Data", ylab="Differenced
Number of Passengers", xlab="Year")
>
> # Fit an ARIMA model
> fit <- auto.arima(ts_data)
> print(fit)
Series: ts_data
ARIMA(2,1,1)(0,1,0)[12]
Coefficients:
ar1 ar2 ma1
0.5960 0.2143 -0.9819
s.e. 0.0888 0.0880 0.0292
Ljung-Box test
>
> # Calculate accuracy metrics
> accuracy(forecasted)
ME RMSE MAE MPE MAPE MASE ACF1
Training set 1.3423 10.84619 7.86754 0.420698 2.800458 0.245628 -0.00124847
#program:
# Load required packages.
# Install a package only when it is missing, so re-running the script does not
# reinstall it (and does not need network access) every time.
if (!requireNamespace("arules", quietly = TRUE)) {
  install.packages("arules")
}
library(arules)
# Visualize rules
# `rules` is the mined rule set created outside this excerpt.
if (!requireNamespace("arulesViz", quietly = TRUE)) {
  install.packages("arulesViz")
}
library(arulesViz)
plot(rules)
#Output:
> library(arulesViz)
> plot(rules)
WEEK - 9 Data Analysis-Visualization using R.
#program:
# Load required packages.
# Install only the packages that are missing, so re-running the script does not
# reinstall them (and does not need network access) every time.
for (pkg in c("ggplot2", "dplyr", "tidyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(ggplot2)
library(dplyr)
library(tidyr)
# Histogram of mpg
# Uses the built-in mtcars data set; bins are 2 mpg wide.
ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(binwidth = 2, fill = "blue", color = "black") +
  labs(
    title = "Histogram of MPG",
    x = "Miles Per Gallon",
    y = "Frequency"
  )
#output
#program:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Emits one (word, one) pair for every whitespace-separated token of the input line.
 *
 * Relies on the {@code word} and {@code one} fields of the enclosing class,
 * which are declared outside this excerpt.
 *
 * @param key     input key supplied by the framework; not used here
 * @param value   the input text to tokenize
 * @param context receives each (word, one) pair via {@code context.write}
 * @throws IOException          propagated from {@code context.write}
 * @throws InterruptedException propagated from {@code context.write}
 */
public void map(LongWritable key, Text value, Context context) throws IOException,
        InterruptedException {
    String line = value.toString();
    String[] words = line.split("\\s+");
    for (String w : words) {
        // split() yields an empty first token when the line is blank or starts
        // with whitespace; skip it so "" is never counted as a word.
        if (w.isEmpty()) {
            continue;
        }
        word.set(w);
        context.write(word, one);
    }
}
}
#Output
11. In-database Analytics
#Output
#program:
// MongoDB shell script: creates a product lookup collection, then runs basic,
// aggregation and join queries against the salesData collection.
// NOTE(review): salesData is not created anywhere in this excerpt; presumably it
// is populated by an earlier part of the lab. Confirm before running.

// Create productDetails and load three documents (product, category, price).
db.createCollection("productDetails")
db.productDetails.insertMany([
{ product: "Product A", category: "Electronics", price: 300 },
{ product: "Product B", category: "Furniture", price: 450 },
{ product: "Product C", category: "Appliances", price: 200 }
])
// Basic Queries
// Print every document in salesData.
print("Find All Documents")
printjson(db.salesData.find().toArray())
// Print all documents ordered by salesAmount, largest first (-1 = descending).
print("Sorting")
printjson(db.salesData.find().sort({ salesAmount: -1 }).toArray())
// Aggregation Framework
// Group by the product field and sum salesAmount within each group.
print("Simple Aggregation (Total Sales by Product)")
printjson(db.salesData.aggregate([
{ $group: { _id: "$product", totalSales: { $sum: "$salesAmount" } } }
]).toArray())
// Advanced Queries
// Join salesData to productDetails on the shared product field; the matching
// productDetails documents are attached as the productInfo array.
print("Using $lookup for Joins")
printjson(db.salesData.aggregate([
{
$lookup: {
from: "productDetails",
localField: "product",
foreignField: "product",
as: "productInfo"
}
},
// Turn the productInfo array into one output document per array element.
{ $unwind: "$productInfo" },
// Keep the sales fields and copy category and price out of productInfo.
{ $project: { product: 1, salesAmount: 1, salesDate: 1, category: "$productInfo.category",
price: "$productInfo.price" } }
]).toArray())