0% found this document useful (0 votes)
7 views

Week2 R Program

Week2 R Program

Uploaded by

pz253
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
7 views

Week2 R Program

Week2 R Program

Uploaded by

pz253
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 4

R program:

library(ggplot2)

# Step 1: Read and Examine Data

# Read the HMEQ loss data from CSV. Empty fields are treated as missing
# (na.strings = ""). stringsAsFactors is pinned to FALSE so that text columns
# are character vectors on every R version (the default was TRUE before
# R 4.0); the one-hot encoding step below selects columns with is.character().
data <- read.csv("HMEQ_Loss.csv", na.strings = "", stringsAsFactors = FALSE)

# Display structure of the data (column names, types, first values)
str(data)

# Display summary statistics for every column
summary(data)

# Display first six records
head(data)

# Step 2: Box-Whisker Plots for numeric variables


# Draw one box-and-whisker plot per numeric column, split by TARGET_BAD_FLAG.
#
# Args:
#   data: data frame containing a TARGET_BAD_FLAG column and the numeric
#         columns to plot. TARGET_BAD_FLAG itself is not plotted.
#
# Returns:
#   NULL, invisibly. Called for its plotting side effect.
#
# The plots are laid out on a 3 x 3 grid; when there are more than nine
# numeric columns the device advances to a new page. The caller's graphical
# layout is restored on exit, including when a plot raises an error.
create_boxplots <- function(data) {
  # vapply() guarantees a logical vector even for a zero-column data frame.
  is_num <- vapply(data, is.numeric, logical(1))
  numeric_cols <- names(data)[is_num]
  numeric_cols <- numeric_cols[numeric_cols != "TARGET_BAD_FLAG"]

  # par() returns the previous settings; restore them rather than forcing
  # a 1 x 1 layout, so the caller's own layout survives this call.
  old_par <- par(mfrow = c(3, 3))
  on.exit(par(old_par), add = TRUE)

  for (col in numeric_cols) {
    boxplot(
      data[[col]] ~ data$TARGET_BAD_FLAG,
      main = paste("Distribution of", col),
      xlab = "Loan Status (0 = Good, 1 = Bad)",
      ylab = col,
      col = c("lightblue", "lightgreen")
    )
  }

  invisible(NULL)
}

# Step 3: Create Histogram with Density Line


# Plot a density-scaled histogram of one column and overlay its kernel
# density estimate.
#
# Args:
#   data:     data frame holding the column to plot.
#   variable: name of the column, as a string.
#
# Returns:
#   The value of the final lines() call (NULL, invisibly).
create_histogram <- function(data, variable) {
  values <- data[[variable]]
  plot_title <- paste("Distribution of", variable)

  # freq = FALSE puts the bars on the density scale so the curve below
  # shares the same y axis.
  hist(
    values,
    breaks = 30,
    freq = FALSE,
    xlab = variable,
    main = plot_title,
    border = "white",
    col = "lightblue"
  )

  # Missing values are dropped before estimating the density.
  density_curve <- density(values, na.rm = TRUE)
  lines(density_curve, lwd = 2, col = "red")
}

# Step 4: Handle Missing Values


# Impute missing values in the target and numeric predictor columns.
#
# Args:
#   data: data frame with a TARGET_BAD_FLAG column and, optionally, a
#         TARGET_LOSS_AMT column.
#
# Returns:
#   A copy of `data` in which
#     * missing TARGET_BAD_FLAG / TARGET_LOSS_AMT values are set to 0;
#     * every other numeric column X is replaced by two appended columns:
#       M_X   (1 where X was missing, 0 otherwise) and
#       IMP_X (X with missing values filled by the median of X within the
#              row's TARGET_BAD_FLAG group).
#   Non-numeric columns are left untouched.
#
# If a group has no observed values for X, its median is undefined; the
# overall median of X is used for that group instead so the gap is still
# filled. IMP_X stays NA only when X has no observed values at all.
impute_data <- function(data) {
  stopifnot(is.data.frame(data), "TARGET_BAD_FLAG" %in% names(data))

  imputed_data <- data

  # Target variables: a missing flag / loss amount is treated as 0.
  imputed_data$TARGET_BAD_FLAG[is.na(imputed_data$TARGET_BAD_FLAG)] <- 0
  if ("TARGET_LOSS_AMT" %in% names(imputed_data)) {
    imputed_data$TARGET_LOSS_AMT[is.na(imputed_data$TARGET_LOSS_AMT)] <- 0
  }

  # Numeric predictors only; the target columns are handled above.
  is_num <- vapply(data, is.numeric, logical(1))
  numeric_cols <- names(data)[is_num]
  numeric_cols <- numeric_cols[
    !numeric_cols %in% c("TARGET_BAD_FLAG", "TARGET_LOSS_AMT")
  ]

  # Use the flag with NAs already replaced for BOTH the group medians and
  # the fill mask, so a row is always filled from the group it belongs to.
  flag <- imputed_data$TARGET_BAD_FLAG
  flag_label <- as.character(flag)

  for (col in numeric_cols) {
    values <- data[[col]]
    is_missing <- is.na(values)

    # Missing-value indicator (numeric 1/0).
    imputed_data[[paste0("M_", col)]] <- as.numeric(is_missing)

    overall_median <- median(values, na.rm = TRUE)
    group_medians <- tapply(values, flag, median, na.rm = TRUE)

    filled <- values
    for (level in names(group_medians)) {
      fill <- group_medians[[level]]
      if (is.na(fill)) {
        # Every value in this group is missing: fall back to overall median.
        fill <- overall_median
      }
      rows <- is_missing & (flag_label %in% level)
      if (any(rows)) {
        filled[rows] <- fill
      }
    }

    imputed_data[[paste0("IMP_", col)]] <- filled

    # Drop the original column; M_/IMP_ columns replace it.
    imputed_data[[col]] <- NULL
  }

  imputed_data
}

# Step 5: One Hot Encoding


# Replace every character column with 0/1 indicator columns.
#
# Args:
#   data: data frame; only columns for which is.character() is TRUE are
#         encoded, all other columns are returned unchanged.
#
# Returns:
#   `data` with each character column COL removed and one numeric column
#   COL_<value> appended per distinct non-missing value, where <value> is
#   passed through make.names() to give a syntactically valid name.
#
# A row whose original value is missing gets 0 in every indicator column
# of that variable (never NA).
one_hot_encode <- function(data) {
  is_chr <- vapply(data, is.character, logical(1))
  char_cols <- names(data)[is_chr]

  for (col in char_cols) {
    values <- data[[col]]
    categories <- unique(values[!is.na(values)])

    # unique = TRUE keeps two categories that sanitise to the same name
    # (e.g. "a b" and "a.b") from overwriting each other's column.
    suffixes <- make.names(categories, unique = TRUE)

    for (i in seq_along(categories)) {
      new_col_name <- paste0(col, "_", suffixes[i])
      # %in% never returns NA, so missing values encode as 0.
      data[[new_col_name]] <- as.numeric(values %in% categories[i])
    }

    # Remove the original column now that it is fully encoded.
    data[[col]] <- NULL
  }

  data
}

# Main execution
# Run the full workflow: read the data, print an overview, draw the plots,
# impute missing values and one-hot encode the character columns.
#
# Args:
#   path: path of the CSV file to read. Defaults to "HMEQ_Loss.csv" in the
#         working directory.
#
# Returns:
#   The imputed, one-hot-encoded data frame.
main <- function(path = "HMEQ_Loss.csv") {
  # Read data. Empty fields are missing values; stringsAsFactors = FALSE
  # keeps text columns as character on every R version, which
  # one_hot_encode() relies on.
  cat("Reading data...\n")
  data <- read.csv(path, na.strings = "", stringsAsFactors = FALSE)

  # Step 1: Examine Data
  cat("\nData Structure:\n")
  str(data)

  cat("\nData Summary:\n")
  print(summary(data))

  cat("\nFirst Six Records:\n")
  print(head(data))

  # Step 2: Create Box Plots
  cat("\nCreating box plots...\n")
  create_boxplots(data)

  # Step 3: Create Histogram for LOAN amount
  cat("\nCreating histogram for LOAN amount...\n")
  create_histogram(data, "LOAN")

  # Step 4: Handle Missing Values
  cat("\nHandling missing values...\n")
  imputed_data <- impute_data(data)

  cat("\nSummary after imputation:\n")
  print(summary(imputed_data))

  # The M_* indicator columns are 1 where a value was imputed, so their
  # column sums are the number of imputed values per variable.
  m_cols <- names(imputed_data)[startsWith(names(imputed_data), "M_")]
  cat("\nNumber of imputed values per variable:\n")
  print(colSums(imputed_data[m_cols]))

  # Step 5: One Hot Encoding
  cat("\nPerforming one-hot encoding...\n")
  final_data <- one_hot_encode(imputed_data)

  # str() prints its output itself and returns NULL; wrapping it in
  # print() would emit a stray "NULL" line.
  cat("\nFinal Data Structure:\n")
  str(final_data)

  final_data
}

# Run the analysis by calling main(); the call itself does not appear in this excerpt.

You might also like