0% found this document useful (0 votes)
2 views

R programming and ipr

The document outlines methods for handling outliers in datasets using box plots, histograms, and bar charts. It demonstrates how to compute quartiles and how to identify and remove outliers in the 'mtcars' and 'airquality' datasets, with visualizations before and after cleaning the data. It also includes steps for imputing missing values and comparing the original and cleaned data distributions.

Uploaded by

Shenbaga Kumar
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
2 views

R programming and ipr

The document outlines methods for handling outliers in datasets using box plots, histograms, and bar charts. It demonstrates how to compute quartiles and how to identify and remove outliers in the 'mtcars' and 'airquality' datasets, with visualizations before and after cleaning the data. It also includes steps for imputing missing values and comparing the original and cleaned data distributions.

Uploaded by

Shenbaga Kumar
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 8

HANDLING OUTLIERS

Using Box Plot


# Load dataset: horsepower column of the built-in mtcars data
data <- mtcars$hp

# Box plot of the raw values, before any outlier handling
boxplot(
  data,
  horizontal = TRUE,
  col = "lightblue",
  main = "Boxplot of Horsepower (hp) - Before Removing Outliers"
)

# Quartiles and interquartile range.
# quantile() labels its results "25%" / "75%" and those labels propagate
# through the arithmetic below, so IQR and upper_bound print under "75%"
# and lower_bound prints under "25%".
quartiles <- quantile(data, c(0.25, 0.75))
Q1 <- quartiles[1]  # 25th percentile
Q3 <- quartiles[2]  # 75th percentile
IQR <- Q3 - Q1      # interquartile range (a plain value named IQR)

# Outlier fences: 1.5 * IQR beyond each quartile
fence_width <- 1.5 * IQR
lower_bound <- Q1 - fence_width
upper_bound <- Q3 + fence_width

# Report the summary values in the order Q1, Q3, IQR, lower, upper
for (stat in list(Q1, Q3, IQR, lower_bound, upper_bound)) {
  print(stat)
}

# Flag every value that falls outside the fences, then show those values
is_outlier <- data < lower_bound | data > upper_bound
outliers <- data[is_outlier]
print(outliers)

# Keep only the values inside (or exactly on) the fences
data_clean <- data[!is_outlier]

# Box plot of the values that remain after outlier removal
boxplot(
  data_clean,
  horizontal = TRUE,
  col = "lightgreen",
  main = "Boxplot of Horsepower (hp) - After Removing Outliers"
)
Output
> print(Q1)
 25% 
96.5 
> print(Q3)
75% 
180 
> print(IQR)
 75% 
83.5 
> print(lower_bound)
   25% 
-28.75 
> print(upper_bound)
   75% 
305.25 
> print(outliers)
[1] 335
Using histogram
# Load dataset: horsepower column of the built-in mtcars data
data <- mtcars$hp

# Histogram of the raw values, before any outlier handling
hist(
  data,
  breaks = 10,
  col = "lightblue",
  border = "black",
  xlab = "Horsepower (hp)",
  main = "Histogram of Horsepower (hp) - Before Handling Outliers"
)

# Quartiles and interquartile range.
# The "25%" / "75%" labels from quantile() carry through the arithmetic,
# which is why IQR and upper_bound print under "75%" and lower_bound
# prints under "25%".
Q1 <- quantile(data, probs = 0.25)  # 25th percentile
Q3 <- quantile(data, probs = 0.75)  # 75th percentile
IQR <- Q3 - Q1                      # interquartile range

# Outlier fences sit 1.5 * IQR below Q1 and above Q3
whisker <- 1.5 * IQR
lower_bound <- Q1 - whisker
upper_bound <- Q3 + whisker

# Print the summary values: Q1, Q3, IQR, lower bound, upper bound
invisible(lapply(list(Q1, Q3, IQR, lower_bound, upper_bound), print))

# Values strictly outside the fences are the outliers
outside_fences <- data < lower_bound | data > upper_bound
outliers <- data[outside_fences]
print(outliers)

# Everything else (values within or on the fences) is kept
data_clean <- data[!outside_fences]

# Histogram of the values that remain after outlier removal
hist(
  data_clean,
  breaks = 10,
  col = "lightgreen",
  border = "black",
  xlab = "Horsepower (hp)",
  main = "Histogram of Horsepower (hp) - After Removing Outliers"
)
Output
> print(Q1)
 25% 
96.5 
> print(Q3)
75% 
180 
> print(IQR)
 75% 
83.5 
> print(lower_bound)
   25% 
-28.75 
> print(upper_bound)
   75% 
305.25 
> print(outliers)
[1] 335
Using barplot
# Load the built-in airquality dataset
data("airquality")

# 1. Check for Missing Values in Each Column
cat("Missing Values by Column:\n")
print(colSums(is.na(airquality)))

# 2. Visualize Gaps and Distribution Using Bar Chart

# Recode Ozone as a factor whose levels are the observed readings in numeric
# order, followed by a trailing "Missing" level for the NAs.
# Writing "Missing" straight into the integer vector would coerce every
# reading to character, and table() would then order the bars as text
# ("1", "10", "108", "11", ...) instead of by ozone level.
ozone_values <- sort(unique(airquality$Ozone))  # sort() drops the NA
ozone_labels <- as.character(airquality$Ozone)
ozone_labels[is.na(ozone_labels)] <- "Missing"
ozone_data <- factor(ozone_labels, levels = c(ozone_values, "Missing"))

# Bar chart of how often each Ozone level occurs; the final bar counts the
# missing readings.
barplot(
  table(ozone_data),
  main = "Ozone Levels with Gaps",
  xlab = "Ozone Levels",
  ylab = "Frequency",
  col = "lightblue",
  border = "black",
  las = 2,          # Make x-axis labels vertical
  cex.names = 0.7   # Reduce label size for readability
)

# 3. Detect Outliers Using Frequency Analysis

# Work on the observed readings only; na.omit() drops the missing entries.
ozone_data_clean <- na.omit(airquality$Ozone)

# Tabulate how many times each distinct reading occurs and report it.
ozone_freq <- table(ozone_data_clean)
cat("\nFrequency of Ozone Levels:\n")
print(ozone_freq)

# Readings that occur exactly once are reported as "outliers" (rare values).
# The result is taken from the table labels, so it is a character vector.
seen_once <- as.vector(ozone_freq) == 1
ozone_outliers <- names(ozone_freq)[seen_once]
cat("\nDetected Outliers (Rare Values):", ozone_outliers, "\n")

# Plot the frequency table so the rare values can be spotted visually.
barplot(
  ozone_freq,
  col = "salmon",
  border = "black",
  main = "Frequency Distribution of Ozone Levels",
  xlab = "Ozone Levels",
  ylab = "Frequency",
  cex.names = 0.7,
  las = 2
)

# 4. Clean Data: Impute Missing Values with Median

# Median of the observed Ozone readings. It is computed straight from the
# dataset (ignoring NAs) so this step does not depend on objects created
# earlier in the script.
median_value <- median(airquality$Ozone, na.rm = TRUE)

# Copy the column and overwrite only the missing entries with the median.
imputed_ozone <- airquality$Ozone
imputed_ozone[is.na(imputed_ozone)] <- median_value

# 5. Compare Before and After Imputation Using Bar Chart

# Frequency of the original data (table() leaves the NAs out by default)
ozone_freq_original <- table(airquality$Ozone)

# Frequency of the imputed data. Only the gaps are filled here: no values are
# removed, and every imputed row lands in a single bar at the median.
ozone_freq_cleaned <- table(imputed_ozone)

# Side-by-side layout. par() returns the settings it replaces; keep them so
# the caller's layout can be restored afterwards instead of being forced
# back to a single panel.
old_par <- par(mfrow = c(1, 2))

# Left panel: original data (missing readings not shown)
barplot(
  ozone_freq_original,
  main = "Original Ozone Levels",
  xlab = "Ozone Levels",
  ylab = "Frequency",
  col = "coral",
  border = "black",
  las = 2,
  cex.names = 0.7
)

# Right panel: data after median imputation
barplot(
  ozone_freq_cleaned,
  main = "Cleaned & Imputed Ozone Levels",
  xlab = "Ozone Levels",
  ylab = "Frequency",
  col = "lightgreen",
  border = "black",
  las = 2,
  cex.names = 0.7
)

# Restore the layout that was active before the comparison
par(old_par)
Output

You might also like