0% found this document useful (0 votes)
4 views · 12 pages

Appendix

The document contains R code for data analysis on PM2.5 levels in Worli, Mumbai, focusing on handling missing data through various imputation methods. It includes functions for generating missing values, performing mean, median, and kNN imputations, and calculating metrics like Mean Absolute Error (MAE), Root Mean Square Error (RMSE), and R-squared (R2) for evaluating the imputation accuracy. The code also simulates different missing data scenarios and assesses the performance of the imputation techniques.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
4 views · 12 pages

Appendix

The document contains R code for data analysis on PM2.5 levels in Worli, Mumbai, focusing on handling missing data through various imputation methods. It includes functions for generating missing values, performing mean, median, and kNN imputations, and calculating metrics like Mean Absolute Error (MAE), Root Mean Square Error (RMSE), and R-squared (R2) for evaluating the imputation accuracy. The code also simulates different missing data scenarios and assesses the performance of the imputation techniques.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/12

Appendix

R Codes:

# ---- Load the raw data and keep only the complete cases ----
# `Worli_Mumbai_PM2_5` must already exist in the workspace; this script does
# not create it.
data <- Worli_Mumbai_PM2_5

head(data)
summary(data)
colSums(is.na(data))

# Percentage of missing values per column
missing_percent <- colSums(is.na(data)) / nrow(data) * 100
missing_percent
# missing percent in original data = 7.68

# Rows that have no missing value in any column
complete_data <- data[complete.cases(data), ]

head(complete_data)
cat("Number of complete cases:", nrow(complete_data))
# complete case data: 7422 observations

# Simulation using complete case data
if (nrow(complete_data) == 7422) {
  cat("The complete dataset contains 7422 observations. Ready for simulation!\n")
} else {
  cat("The complete dataset has", nrow(complete_data), "observations. Check for missing values.\n")
}

library(writexl)
library(WriteXLS)
write_xlsx(complete_data, "Complete_Cases_PM25.xlsx")

# `Complete_Cases_PM25` is never assigned in this script (only the .xlsx file
# of that name is written), so a bare reference fails with "object not found"
# unless the object was created some other way. Show it only when it exists.
if (exists("Complete_Cases_PM25")) {
  print(Complete_Cases_PM25)
}
cat("Number of complete cases:", nrow(complete_data))

# Set a random subset of the PM2.5 column to NA.
#
# Arguments:
#   data            data frame with a `PM2.5` column.
#   missing_percent share of rows to blank out, on the 0-100 scale
#                   (e.g. 10 means 10%).
#   seed            RNG seed applied before sampling. The default (123) keeps
#                   the original behaviour: the same rows are chosen on every
#                   call. Pass NULL to leave the RNG state alone, so repeated
#                   calls produce different patterns.
#
# Returns a copy of `data` with round(nrow(data) * missing_percent / 100)
# values of `PM2.5` replaced by NA; all other columns are unchanged.
generate_missing_pm25 <- function(data, missing_percent, seed = 123) {
  stopifnot(
    is.data.frame(data),
    "PM2.5" %in% names(data),
    is.numeric(missing_percent),
    length(missing_percent) == 1L,
    !is.na(missing_percent),
    missing_percent >= 0,
    missing_percent <= 100
  )

  if (!is.null(seed)) {
    set.seed(seed) # Ensure reproducibility
  }

  # Total number of rows in the PM2.5 column
  total_entries <- nrow(data)

  # Calculate the number of missing values needed
  num_missing <- round(total_entries * missing_percent / 100)

  # Randomly select row indices for missing values (without replacement)
  missing_rows <- sample(seq_len(total_entries), num_missing)

  # Introduce missing values only in the PM2.5 column
  data$`PM2.5`[missing_rows] <- NA

  data
}

# One data set per missingness level (5, 10, 15 and 20 percent of PM2.5 set
# to NA); each one is opened in the data viewer right after it is created.
data_missing_5 <- generate_missing_pm25(complete_data, missing_percent = 5)
View(data_missing_5)

data_missing_10 <- generate_missing_pm25(complete_data, missing_percent = 10)
View(data_missing_10)

data_missing_15 <- generate_missing_pm25(complete_data, missing_percent = 15)
View(data_missing_15)

data_missing_20 <- generate_missing_pm25(complete_data, missing_percent = 20)
View(data_missing_20)

# 500 Simulated patterns


# Generate `num_simulations` independent missingness patterns in PM2.5.
#
# Arguments:
#   data            data frame with a `PM2.5` column.
#   missing_percent share of rows to blank out, on the 0-100 scale.
#   num_simulations number of simulated data sets to return (0 is allowed).
#   seed            RNG seed applied ONCE before the first draw (default 123);
#                   pass NULL to leave the RNG state alone.
#
# Returns a list of length `num_simulations`; each element is a copy of
# `data` with round(nrow(data) * missing_percent / 100) PM2.5 values set
# to NA.
#
# The rows are sampled here rather than through generate_missing_pm25(),
# because that helper reseeds the RNG on every call, which made every
# simulated data set identical. With the seed set once, the first pattern is
# the one the helper would produce and the following ones differ.
simulate_missing_patterns <- function(data, missing_percent, num_simulations,
                                      seed = 123) {
  stopifnot(
    is.data.frame(data),
    "PM2.5" %in% names(data),
    is.numeric(missing_percent),
    length(missing_percent) == 1L,
    !is.na(missing_percent),
    missing_percent >= 0,
    missing_percent <= 100,
    is.numeric(num_simulations),
    length(num_simulations) == 1L,
    !is.na(num_simulations),
    num_simulations >= 0
  )

  if (!is.null(seed)) {
    set.seed(seed)
  }

  total_entries <- nrow(data)
  num_missing <- round(total_entries * missing_percent / 100)

  # seq_len() yields an empty sequence for 0 simulations (1:0 would not)
  lapply(seq_len(num_simulations), function(i) {
    simulated_data <- data
    missing_rows <- sample(seq_len(total_entries), num_missing)
    simulated_data$`PM2.5`[missing_rows] <- NA
    simulated_data
  })
}

# Number of missingness patterns to generate for each missingness level
num_simulations <- 500

# Lists of simulated data sets for 5, 10, 15 and 20 percent missing PM2.5
simulations_5 <- simulate_missing_patterns(
  complete_data,
  missing_percent = 5,
  num_simulations = num_simulations
)
simulations_10 <- simulate_missing_patterns(
  complete_data,
  missing_percent = 10,
  num_simulations = num_simulations
)
simulations_15 <- simulate_missing_patterns(
  complete_data,
  missing_percent = 15,
  num_simulations = num_simulations
)
simulations_20 <- simulate_missing_patterns(
  complete_data,
  missing_percent = 20,
  num_simulations = num_simulations
)

# 16/12/24

# ---- Single-imputation walkthrough on `complete_data` ----
# NOTE(review): `complete_data` is built from complete cases only, so the
# is.na() masks below select nothing and the kNN / mice calls have no missing
# PM2.5 values to fill. Presumably a data set that still contains NAs was
# intended here -- confirm before relying on these objects.

# Mean Imputation
mean_imputed <- complete_data
mean_value <- mean(mean_imputed$`PM2.5`, na.rm = TRUE)
mean_imputed$`PM2.5`[is.na(mean_imputed$`PM2.5`)] <- mean_value
mean_value

# Median Imputation
median_imputed <- complete_data
median_value <- median(median_imputed$`PM2.5`, na.rm = TRUE)
median_imputed$`PM2.5`[is.na(median_imputed$`PM2.5`)] <- median_value
median_value

# Install VIM only when it is missing, instead of reinstalling on every run
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}
library(VIM)
# kNN Imputation with k=5
knn_imputed <- kNN(complete_data, variable = "PM2.5", k = 5)

# View results
summary(knn_imputed$`PM2.5`)

# Install mice only when it is missing
if (!requireNamespace("mice", quietly = TRUE)) {
  install.packages("mice")
}
library(mice)
# Perform Multiple Imputation (5 imputed data sets, predictive mean matching)
mice_imputed <- mice(complete_data, m = 5, method = "pmm", maxit = 10, seed = 123)

# Extract the first of the completed datasets
completed_data <- complete(mice_imputed, 1)

# View results
summary(completed_data$`PM2.5`)

# Mean-imputation error with 10% of PM2.5 removed at random.
# Runs only when `complete_data` exists and has at least one row; otherwise an
# error message is printed.
if (!exists("complete_data") || nrow(complete_data) <= 0) {
  cat("Error: Dataset 'complete_data' is empty or not loaded properly.\n")
} else {
  # Work on a copy so the complete cases stay untouched
  simulated_data <- complete_data

  # Pick the rows to blank out (fixed seed: same rows on every run)
  set.seed(123)
  missing_indices <- sample(
    seq_len(nrow(simulated_data)),
    size = 0.1 * nrow(simulated_data)
  )
  simulated_data$`PM2.5`[missing_indices] <- NA

  # Fill every NA with the mean of the values that are still observed
  mean_value <- mean(simulated_data$`PM2.5`, na.rm = TRUE)
  mean_imputed <- simulated_data
  mean_imputed$`PM2.5` <- replace(
    mean_imputed$`PM2.5`,
    is.na(mean_imputed$`PM2.5`),
    mean_value
  )

  # MAE is averaged over ALL rows; rows that were never blanked contribute 0
  MAE <- mean(abs(complete_data$`PM2.5` - mean_imputed$`PM2.5`), na.rm = TRUE)
  cat("Mean Absolute Error (MAE):", MAE, "\n")
}

# Mean-imputation error with 5% of PM2.5 removed at random.
# Guarded so it only runs when `complete_data` exists and is non-empty.
if (!exists("complete_data") || nrow(complete_data) <= 0) {
  cat("Error: Dataset 'complete_data' is empty or not loaded properly.\n")
} else {
  # Copy of the complete cases that will receive the artificial NAs
  simulated_data <- complete_data

  # Fixed seed, then draw the row positions to delete
  set.seed(123)
  missing_indices <- sample(
    seq_len(nrow(simulated_data)),
    size = 0.05 * nrow(simulated_data)
  )
  simulated_data$`PM2.5`[missing_indices] <- NA

  # Replace the NAs by the mean of the remaining PM2.5 values
  mean_value <- mean(simulated_data$`PM2.5`, na.rm = TRUE)
  mean_imputed <- simulated_data
  mean_imputed$`PM2.5` <- replace(
    mean_imputed$`PM2.5`,
    is.na(mean_imputed$`PM2.5`),
    mean_value
  )

  # Mean absolute difference between true and imputed column (all rows)
  MAE <- mean(abs(complete_data$`PM2.5` - mean_imputed$`PM2.5`), na.rm = TRUE)
  cat("Mean Absolute Error (MAE):", MAE, "\n")
}
# Mean-imputation error with 15% of PM2.5 removed at random.
# Nothing is simulated unless `complete_data` exists and has rows.
data_ready <- exists("complete_data") && nrow(complete_data) > 0
if (!data_ready) {
  cat("Error: Dataset 'complete_data' is empty or not loaded properly.\n")
} else {
  # Start from an untouched copy of the complete cases
  simulated_data <- complete_data

  # Reproducible choice of the rows whose PM2.5 value is deleted
  set.seed(123)
  missing_indices <- sample(
    seq_len(nrow(simulated_data)),
    size = 0.15 * nrow(simulated_data)
  )
  simulated_data$`PM2.5`[missing_indices] <- NA

  # Mean of the observed values is used for every deleted entry
  mean_value <- mean(simulated_data$`PM2.5`, na.rm = TRUE)
  mean_imputed <- simulated_data
  mean_imputed$`PM2.5` <- replace(
    mean_imputed$`PM2.5`,
    is.na(mean_imputed$`PM2.5`),
    mean_value
  )

  # Average absolute deviation from the true values, over all rows
  MAE <- mean(abs(complete_data$`PM2.5` - mean_imputed$`PM2.5`), na.rm = TRUE)
  cat("Mean Absolute Error (MAE):", MAE, "\n")
}
# Mean-imputation error with 20% of PM2.5 removed at random.
# The simulation is skipped (with a message) when `complete_data` is missing
# or empty.
data_ready <- exists("complete_data") && nrow(complete_data) > 0
if (!data_ready) {
  cat("Error: Dataset 'complete_data' is empty or not loaded properly.\n")
} else {
  # Copy that will carry the simulated missing values
  simulated_data <- complete_data

  # Same seed as the other scenarios, then sample the rows to delete
  set.seed(123)
  missing_indices <- sample(
    seq_len(nrow(simulated_data)),
    size = 0.2 * nrow(simulated_data)
  )
  simulated_data$`PM2.5`[missing_indices] <- NA

  # Impute with the mean of the PM2.5 values that remain
  mean_value <- mean(simulated_data$`PM2.5`, na.rm = TRUE)
  mean_imputed <- simulated_data
  mean_imputed$`PM2.5` <- replace(
    mean_imputed$`PM2.5`,
    is.na(mean_imputed$`PM2.5`),
    mean_value
  )

  # MAE between the true and the imputed column, averaged over all rows
  MAE <- mean(abs(complete_data$`PM2.5` - mean_imputed$`PM2.5`), na.rm = TRUE)
  cat("Mean Absolute Error (MAE):", MAE, "\n")
}

# 17/12/24

# Start again from the raw data object
original_data <- Worli_Mumbai_PM2_5

# Share of missing values per column, in percent
missing_percent <- colSums(is.na(original_data)) / nrow(original_data) * 100

# Keep only the rows without any missing value
complete_data <- na.omit(original_data)

missing_percent
cat("Number of complete cases:", nrow(complete_data))

# Verify the number of observations against the expected 7422 complete cases
if (nrow(complete_data) != 7422) {
  cat("The complete dataset has", nrow(complete_data), "observations. Check for missing values.\n")
} else {
  cat("The complete dataset contains 7422 observations. Ready for simulation!\n")
}

# Blank out a random fraction of the PM2.5 column.
#
# Arguments:
#   complete_data data frame with a `PM2.5` column.
#   percent       FRACTION of rows to set to NA, between 0 and 1
#                 (0.10 means 10%). Note this differs from
#                 generate_missing_pm25(), which takes a 0-100 percentage;
#                 values outside [0, 1] are rejected here instead of failing
#                 later inside sample().
#   seed          RNG seed applied before sampling. The default (123) keeps
#                 the original behaviour (same rows on every call); pass NULL
#                 to leave the RNG state untouched.
#
# Returns a copy of `complete_data` in which floor(percent * nrow) values of
# `PM2.5` are NA. The input object itself is not modified.
simulate_missing <- function(complete_data, percent, seed = 123) {
  stopifnot(
    is.data.frame(complete_data),
    "PM2.5" %in% names(complete_data),
    is.numeric(percent),
    length(percent) == 1L,
    !is.na(percent),
    percent >= 0,
    percent <= 1
  )

  if (!is.null(seed)) {
    set.seed(seed) # For reproducibility
  }

  # Calculate Number of Missing Values
  n_missing <- floor(percent * nrow(complete_data))

  # Generate Random Missing Indices (without replacement)
  missing_indices <- sample(seq_len(nrow(complete_data)), n_missing)

  # Create Simulated Data
  simulated_data <- complete_data
  simulated_data$`PM2.5`[missing_indices] <- NA

  simulated_data
}
# Median imputation for the PM2.5 column.
# Takes a data frame with a `PM2.5` column and returns a copy in which every
# NA in that column is replaced by the median of its non-missing values.
# Other columns are returned unchanged.
median_impute <- function(simulated_data) {
  pm25 <- simulated_data$`PM2.5`
  fill_value <- median(pm25, na.rm = TRUE)
  simulated_data$`PM2.5` <- replace(pm25, is.na(pm25), fill_value)
  simulated_data
}
# Mean absolute error between the PM2.5 columns of two data frames.
# The difference is taken element-wise; pairs where either value is NA are
# dropped before averaging. The average runs over all remaining rows, not
# only over rows that were imputed.
calculate_MAE <- function(original_data, imputed_data) {
  abs_error <- abs(original_data$`PM2.5` - imputed_data$`PM2.5`)
  mean(abs_error, na.rm = TRUE)
}

# Missing fractions to evaluate (5%, 10%, 15%, 20%)
missing_percents <- c(0.05, 0.10, 0.15, 0.20)

# MAE of median imputation for each missing fraction
mae_results <- vapply(
  missing_percents,
  function(p) {
    with_gaps <- simulate_missing(complete_data, p)
    calculate_MAE(complete_data, median_impute(with_gaps))
  },
  numeric(1)
)

# Print Results, one line per missing fraction
for (i in seq_along(missing_percents)) {
  cat("MAE for", missing_percents[i] * 100, "% Missing Data:", mae_results[i], "\n")
}

# Root mean squared error between the PM2.5 columns of two data frames.
# Squared differences are computed element-wise; NA results are dropped
# before the mean is taken.
calculate_RMSE <- function(original_data, imputed_data) {
  squared_error <- (original_data$`PM2.5` - imputed_data$`PM2.5`)^2
  sqrt(mean(squared_error, na.rm = TRUE))
}

# Missing fractions to evaluate
missing_percents <- c(0.05, 0.10, 0.15, 0.20)

# RMSE of median imputation, one value per missing fraction:
# simulate the gaps, impute with the median, compare against the truth
rmse_results_median <- vapply(
  missing_percents,
  function(p) {
    with_gaps <- simulate_missing(complete_data, p)
    filled <- median_impute(with_gaps)
    calculate_RMSE(complete_data, filled)
  },
  numeric(1)
)

# Print RMSE Results
for (i in seq_along(missing_percents)) {
  cat(
    "RMSE for", missing_percents[i] * 100, "% Missing Data (Median Imputation):",
    rmse_results_median[i], "\n"
  )
}
# Coefficient of determination (R^2) between the PM2.5 columns of two data
# frames: 1 - SS_residual / SS_total, with SS_total taken around the mean of
# the original values.
#
# Arguments:
#   original_data data frame holding the true `PM2.5` values.
#   imputed_data  data frame holding the imputed `PM2.5` values (same rows).
#
# Only pairs where BOTH values are non-missing are used, for the mean as well
# as for both sums of squares. Previously the mean was taken without
# na.rm = TRUE, so a single NA in the original column made SS_total collapse
# to 0 and the result -Inf/NaN.
#
# Returns a single number. If the original values are all identical
# (SS_total == 0) the result is -Inf or NaN.
calculate_R2 <- function(original_data, imputed_data) {
  y <- original_data$`PM2.5`
  y_hat <- imputed_data$`PM2.5`

  # Restrict both vectors to the pairwise-complete observations
  keep <- !is.na(y) & !is.na(y_hat)
  y <- y[keep]
  y_hat <- y_hat[keep]

  ss_total <- sum((y - mean(y))^2)
  ss_residual <- sum((y - y_hat)^2)
  1 - (ss_residual / ss_total)
}

# Missing fractions to evaluate
missing_percents <- c(0.05, 0.10, 0.15, 0.20)

# R^2 of median imputation, one value per missing fraction
R2_results_median <- vapply(
  missing_percents,
  function(p) {
    with_gaps <- simulate_missing(complete_data, p)
    filled <- median_impute(with_gaps)
    calculate_R2(complete_data, filled)
  },
  numeric(1)
)

# Print R^2 Results
for (i in seq_along(missing_percents)) {
  cat(
    "R^2 for", missing_percents[i] * 100, "% Missing Data (Median Imputation):",
    R2_results_median[i], "\n"
  )
}

# Agreement index d between the PM2.5 columns of two data frames:
#   d = 1 - sum((y - y_hat)^2) / sum((|y - mean(y)| + |y_hat - mean(y)|)^2)
# where y is the original column, y_hat the imputed column and mean(y) is
# taken over the non-missing original values. NA terms are dropped from both
# sums.
calculate_agreement_index <- function(original_data, imputed_data) {
  observed <- original_data$`PM2.5`
  predicted <- imputed_data$`PM2.5`
  centre <- mean(observed, na.rm = TRUE)

  squared_error <- (observed - predicted)^2
  potential_error <- (abs(observed - centre) + abs(predicted - centre))^2

  1 - (sum(squared_error, na.rm = TRUE) / sum(potential_error, na.rm = TRUE))
}

# Missing fractions to evaluate
missing_percents <- c(0.05, 0.10, 0.15, 0.20)

# Agreement index of median imputation, one value per missing fraction
agreement_results_median <- vapply(
  missing_percents,
  function(p) {
    with_gaps <- simulate_missing(complete_data, p)
    filled <- median_impute(with_gaps)
    calculate_agreement_index(complete_data, filled)
  },
  numeric(1)
)

# Print Agreement Index Results
for (i in seq_along(missing_percents)) {
  cat(
    "Agreement Index for", missing_percents[i] * 100, "% Missing Data (Median Imputation):",
    agreement_results_median[i], "\n"
  )
}

# Mean imputation for the PM2.5 column.
# Takes a data frame with a `PM2.5` column and returns a copy in which every
# NA in that column is replaced by the mean of its non-missing values.
# Other columns are returned unchanged.
mean_impute <- function(simulated_data) {
  pm25 <- simulated_data$`PM2.5`
  fill_value <- mean(pm25, na.rm = TRUE)
  simulated_data$`PM2.5` <- replace(pm25, is.na(pm25), fill_value)
  simulated_data
}
# Mean absolute error of the imputed PM2.5 column against the original one.
# (Same definition as earlier in the script, repeated for the mean-imputation
# section.) Element-wise absolute differences are averaged after dropping NA
# results.
calculate_MAE <- function(original_data, imputed_data) {
  truth <- original_data$`PM2.5`
  estimate <- imputed_data$`PM2.5`
  mean(abs(truth - estimate), na.rm = TRUE)
}

# Missing fractions to evaluate (5%, 10%, 15%, 20%)
missing_percents <- c(0.05, 0.10, 0.15, 0.20)

# MAE of mean imputation for each missing fraction
mae_results <- vapply(
  missing_percents,
  function(p) {
    with_gaps <- simulate_missing(complete_data, p)
    calculate_MAE(complete_data, mean_impute(with_gaps))
  },
  numeric(1)
)

# Print Results, one line per missing fraction
for (i in seq_along(missing_percents)) {
  cat("MAE for", missing_percents[i] * 100, "% Missing Data:", mae_results[i], "\n")
}

# Root mean squared error of the imputed PM2.5 column against the original.
# (Same definition as earlier in the script, repeated for the mean-imputation
# section.) NA squared differences are dropped before averaging.
calculate_RMSE <- function(original_data, imputed_data) {
  truth <- original_data$`PM2.5`
  estimate <- imputed_data$`PM2.5`
  mean_squared_error <- mean((truth - estimate)^2, na.rm = TRUE)
  sqrt(mean_squared_error)
}

# Missing fractions to evaluate
missing_percents <- c(0.05, 0.10, 0.15, 0.20)

# RMSE of mean imputation, one value per missing fraction:
# simulate the gaps, impute with the mean, compare against the truth
rmse_results <- vapply(
  missing_percents,
  function(p) {
    with_gaps <- simulate_missing(complete_data, p)
    filled <- mean_impute(with_gaps)
    calculate_RMSE(complete_data, filled)
  },
  numeric(1)
)

# Print RMSE Results
for (i in seq_along(missing_percents)) {
  cat("RMSE for", missing_percents[i] * 100, "% Missing Data:", rmse_results[i], "\n")
}

# Coefficient of determination (R^2) of the imputed PM2.5 column against the
# original one: 1 - SS_residual / SS_total, SS_total being taken around the
# mean of the original values.
#
# Arguments:
#   original_data data frame holding the true `PM2.5` values.
#   imputed_data  data frame holding the imputed `PM2.5` values (same rows).
#
# The mean and both sums of squares are computed on the pairwise-complete
# observations. Before, mean() was called without na.rm = TRUE while the sums
# used it, so any NA in the original column turned SS_total into 0 and the
# result into -Inf/NaN.
#
# Returns a single number; -Inf or NaN when the original values have zero
# variance (SS_total == 0).
calculate_R2 <- function(original_data, imputed_data) {
  truth <- original_data$`PM2.5`
  estimate <- imputed_data$`PM2.5`

  # Use only the rows where both the true and the imputed value are present
  complete_pair <- !is.na(truth) & !is.na(estimate)
  truth <- truth[complete_pair]
  estimate <- estimate[complete_pair]

  ss_total <- sum((truth - mean(truth))^2)
  ss_residual <- sum((truth - estimate)^2)
  1 - (ss_residual / ss_total)
}

# Missing fractions to evaluate
missing_percents <- c(0.05, 0.10, 0.15, 0.20)

# R^2 of mean imputation, one value per missing fraction
R2_results_mean <- vapply(
  missing_percents,
  function(p) {
    with_gaps <- simulate_missing(complete_data, p)
    filled <- mean_impute(with_gaps)
    calculate_R2(complete_data, filled)
  },
  numeric(1)
)

# Print R^2 Results
for (i in seq_along(missing_percents)) {
  cat(
    "R^2 for", missing_percents[i] * 100, "% Missing Data (Mean Imputation):",
    R2_results_mean[i], "\n"
  )
}

# Agreement index d of the imputed PM2.5 column against the original one.
# (Same definition as earlier in the script, repeated for the mean-imputation
# section.)
#   d = 1 - sum((y - y_hat)^2) / sum((|y - mean(y)| + |y_hat - mean(y)|)^2)
# mean(y) ignores NAs in the original column; NA terms are dropped from both
# sums.
calculate_agreement_index <- function(original_data, imputed_data) {
  truth <- original_data$`PM2.5`
  estimate <- imputed_data$`PM2.5`
  truth_mean <- mean(truth, na.rm = TRUE)

  error_sum <- sum((truth - estimate)^2, na.rm = TRUE)
  spread_sum <- sum(
    (abs(truth - truth_mean) + abs(estimate - truth_mean))^2,
    na.rm = TRUE
  )

  1 - (error_sum / spread_sum)
}

# Missing fractions to evaluate
missing_percents <- c(0.05, 0.10, 0.15, 0.20)

# Agreement index of mean imputation, one value per missing fraction
agreement_results <- vapply(
  missing_percents,
  function(p) {
    with_gaps <- simulate_missing(complete_data, p)
    filled <- mean_impute(with_gaps)
    calculate_agreement_index(complete_data, filled)
  },
  numeric(1)
)

# Print Agreement Index Results
for (i in seq_along(missing_percents)) {
  cat(
    "Agreement Index for", missing_percents[i] * 100, "% Missing Data (Mean Imputation):",
    agreement_results[i], "\n"
  )
}

# NOT SURE (author's note on the kNN imputation section that follows)

# kNN imputation wrapper.
# Passes `simulated_data` to knnImputation() with `k` neighbours (default 5)
# and returns whatever that call returns.
# NOTE(review): knnImputation() is not defined in this script; presumably it
# comes from the DMwR2 package loaded further down -- confirm.
knn_impute <- function(simulated_data, k = 5) {
  knnImputation(simulated_data, k = k)
}
# Mean absolute error of the imputed PM2.5 column against the original one.
# (Same definition as earlier in the script, repeated for the kNN section.)
# NA differences are dropped before the mean is taken.
calculate_MAE <- function(original_data, imputed_data) {
  deviation <- original_data$`PM2.5` - imputed_data$`PM2.5`
  mean(abs(deviation), na.rm = TRUE)
}

# Install DMwR2 only when it is not already available, then attach it
# (an unconditional install.packages() would reinstall on every run).
if (!requireNamespace("DMwR2", quietly = TRUE)) {
  install.packages("DMwR2")
}
library(DMwR2)

# Perform kNN Imputation
#
# Arguments:
#   data data frame containing missing values.
#   k    number of neighbours passed to knnImputation(). Defaults to 5, the
#        same default as the earlier definition of knn_impute() in this
#        script, so knn_impute(data) keeps working after this redefinition.
#
# Returns the result of knnImputation(data, k = k).
knn_impute <- function(data, k = 5) {
  knnImputation(data, k = k)
}

# Install (only when missing) and load VIM
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}
library(VIM)

# Load required package
library(DMwR2)
set.seed(123) # For reproducibility

# Convert to a dataframe if not already
complete_data <- as.data.frame(complete_data)

# Obtain only complete cases
complete_data <- na.omit(complete_data)

# Introduce 20% missing values randomly, over ALL cells (rows x columns)
missing_data <- complete_data
n <- nrow(missing_data) * ncol(missing_data)
num_missing <- round(0.20 * n)

# Randomly select linear cell indices to introduce NAs
indices <- sample(seq_len(n), num_missing, replace = FALSE)

# Set the selected cells to NA.
# A plain `missing_data[indices] <- NA` would treat the numbers as COLUMN
# indices of the data frame and fail, so the linear indices are first turned
# into a logical mask with the same dimensions as the data.
na_mask <- matrix(FALSE, nrow = nrow(missing_data), ncol = ncol(missing_data))
na_mask[indices] <- TRUE
missing_data[na_mask] <- NA

# Check for missing values
sum(is.na(missing_data)) # Should be 20% of total elements

# Perform kNN Imputation on the data set built above.
# NOTE(review): the original call used `simulated_data`, a leftover global
# from an earlier section, rather than `missing_data` -- confirm that
# `missing_data` is the intended input.
imputed_data <- kNN(missing_data, variable = "PM2.5", k = 5)

# Define Missing Percentages (as fractions: 5%, 10%, 15%)
missing_percents <- c(0.05, 0.10, 0.15)
k_value <- 5 # You can change this to try different values

# MAE of kNN imputation for each missing fraction.
# `k_value` is passed through to knn_impute() so the k reported in the output
# below is the k that was actually used (the call used to hard-code k = 5).
mae_results <- vapply(
  missing_percents,
  function(p) {
    simulated_data <- simulate_missing(complete_data, p)
    imputed_data <- knn_impute(simulated_data, k = k_value)
    calculate_MAE(complete_data, imputed_data)
  },
  numeric(1)
)

# Print Results
for (i in seq_along(missing_percents)) {
  cat("MAE for", missing_percents[i] * 100, "% Missing Data with k =", k_value, ":", mae_results[i], "\n")
}

You might also like