0% found this document useful (0 votes)
4 views11 pages

Hierar Varam

The document outlines a hierarchical clustering analysis performed on a dataset of fulfillment centers, focusing on operational area and city code. It includes data loading, inspection, missing value handling, outlier detection, and visualization through histograms, boxplots, and a dendrogram. The final output summarizes the clusters formed, detailing their mean and median operational areas along with the count of centers in each cluster.

Uploaded by

akanaguhari
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
4 views11 pages

Hierar Varam

The document outlines a hierarchical clustering analysis performed on a dataset of fulfillment centers, focusing on operational area and city code. It includes data loading, inspection, missing value handling, outlier detection, and visualization through histograms, boxplots, and a dendrogram. The final output summarizes the clusters formed, detailing their mean and median operational areas along with the count of centers in each cluster.

Uploaded by

akanaguhari
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 11

HIERARCHICAL CLUSTERING

24CSEG034

2025-04-04
# Load Required Libraries
library(MASS)
library(dplyr)

##
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':


##
## select

## The following objects are masked from 'package:stats':


##
## filter, lag

## The following objects are masked from 'package:base':


##
## intersect, setdiff, setequal, union

library(cluster)
library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at


## https://goo.gl/ve3WBa

library(ggplot2)

# Load and inspect data: read a CSV chosen through an interactive file
# dialog (first line is the header), then show the first rows.
df <- read.csv(file = file.choose(), header = TRUE)
head(x = df)

## center_id city_code region_code center_type op_area


## 1 11 679 56 TYPE_A 3.7
## 2 13 590 56 TYPE_B 6.7
## 3 124 590 56 TYPE_C 4.0
## 4 66 648 34 TYPE_A 4.1
## 5 94 632 34 TYPE_C 3.6
## 6 64 553 77 TYPE_A 4.4

# Preview the last rows of the data frame
tail(df)

## center_id city_code region_code center_type op_area


## 72 42 561 77 TYPE_B 3.9
## 73 53 590 56 TYPE_A 3.8
## 74 30 604 56 TYPE_A 3.5
## 75 76 614 85 TYPE_A 3.0
## 76 68 676 34 TYPE_B 4.1
## 77 51 638 56 TYPE_A 7.0

# Per-column summary statistics
summary(df)

## center_id city_code region_code center_type

## Min. : 10.00 Min. :456.0 Min. :23.00 Length:77

## 1st Qu.: 50.00 1st Qu.:553.0 1st Qu.:34.00 Class :character

## Median : 77.00 Median :596.0 Median :56.00 Mode :character

## Mean : 83.14 Mean :600.7 Mean :56.49

## 3rd Qu.:110.00 3rd Qu.:651.0 3rd Qu.:77.00

## Max. :186.00 Max. :713.0 Max. :93.00

## op_area
## Min. :0.900
## 1st Qu.:3.500
## Median :3.900
## Mean :3.986
## 3rd Qu.:4.400
## Max. :7.000

# Compact structure: column types and the first few values of each column
str(df)

## 'data.frame': 77 obs. of 5 variables:


## $ center_id : int 11 13 124 66 94 64 129 139 88 143 ...
## $ city_code : int 679 590 590 648 632 553 593 693 526 562 ...
## $ region_code: int 56 56 56 34 34 77 77 34 34 77 ...
## $ center_type: chr "TYPE_A" "TYPE_B" "TYPE_C" "TYPE_A" ...
## $ op_area : num 3.7 6.7 4 4.1 3.6 4.4 3.9 2.8 4.1 3.8 ...

# 1. Missing values: report how many NA entries each column holds
cat("Missing values in each column:\n")

## Missing values in each column:

print(x = colSums(x = is.na(df)))

## center_id city_code region_code center_type op_area
## 0 0 0 0 0

# Drop every row that contains at least one NA
df <- na.omit(object = df)
# 2. Outlier Detection and Handling (Using IQR Method for 'op_area')
# A value counts as an outlier when it lies more than 1.5 interquartile
# ranges below the first quartile or above the third quartile.
# names = FALSE keeps the "25%" / "75%" labels off the derived bounds.
Q1 <- quantile(df$op_area, 0.25, names = FALSE)
Q3 <- quantile(df$op_area, 0.75, names = FALSE)
# Stored as iqr_op_area rather than IQR so the script does not create a
# global object that shadows stats::IQR().
iqr_op_area <- Q3 - Q1
lower_bound <- Q1 - 1.5 * iqr_op_area
upper_bound <- Q3 + 1.5 * iqr_op_area

# Check outliers: rows whose op_area falls outside the two bounds
outliers <- df %>%
  filter(op_area < lower_bound | op_area > upper_bound)
cat("Number of outliers in 'op_area':", nrow(outliers), "\n")

## Number of outliers in 'op_area': 9

# Keep only the rows whose op_area lies within [lower_bound, upper_bound];
# conditions separated by a comma must all hold.
df <- filter(df, op_area >= lower_bound, op_area <= upper_bound)

# Histograms
# One histogram per numeric column, 30 suggested breaks each.
# Every title is a single-line string literal: a literal split across source
# lines would embed the line breaks in the plot title.
hist(df$op_area,
  breaks = 30, col = "blue",
  main = "Distribution of Operational Area", xlab = "Operational Area"
)

hist(df$region_code,
  breaks = 30, col = "orange",
  main = "Distribution of Region Code", xlab = "Region Code"
)
hist(df$center_id,
  breaks = 30, col = "black",
  main = "Distribution of Center ID", xlab = "Center ID"
)
hist(df$city_code,
  breaks = 30, col = "skyblue",
  main = "Distribution of City Code", xlab = "City Code"
)

# Boxplots
# One boxplot per numeric column.
# Every title is a single-line string literal: a literal split across source
# lines would embed the line breaks in the plot title.
boxplot(df$op_area,
  col = "green",
  main = "Boxplot of Operational Area", xlab = "Operational Area"
)
boxplot(df$center_id,
  col = "blue",
  main = "Boxplot of Center ID", xlab = "Center ID"
)
boxplot(df$city_code,
  col = "yellow",
  main = "Boxplot of City Code", xlab = "City Code"
)

boxplot(df$region_code,
  col = "orange",
  main = "Boxplot of Region Code", xlab = "Region Code"
)
# Scatter plot: op_area on the y axis against city_code on the x axis,
# drawn with solid purple points.
with(df, plot(
  x = city_code, y = op_area,
  pch = 19, col = "purple",
  xlab = "City Code", ylab = "Operational Area",
  main = "Operational Area vs City Code"
))
# Data Filtering & Summary
# Mean and median of op_area on the outlier-filtered data, ignoring NA
mean_op_area <- mean(df[["op_area"]], na.rm = TRUE)
median_op_area <- median(df[["op_area"]], na.rm = TRUE)

cat("Mean Operational Area:", mean_op_area, "\n")

## Mean Operational Area: 3.913235

cat("Median Operational Area:", median_op_area, "\n")

## Median Operational Area: 3.9

# Split the centers into an upper and a lower group by op_area.
# NOTE(review): the upper group is cut at the mean and the lower group at the
# median, so rows with op_area from the median up to the mean land in neither
# group -- confirm this is intended.
high_demand_centers <- filter(df, op_area > mean_op_area)
low_demand_centers <- filter(df, op_area < median_op_area)

# Preview the first rows of the above-mean group
head(high_demand_centers)

## center_id city_code region_code center_type op_area


## 1 124 590 56 TYPE_C 4.0
## 2 66 648 34 TYPE_A 4.1
## 3 64 553 77 TYPE_A 4.4
## 4 88 526 34 TYPE_A 4.1
## 5 86 699 85 TYPE_C 4.0
## 6 152 576 34 TYPE_B 4.0

# Preview the first rows of the below-median group
head(low_demand_centers)
## center_id city_code region_code center_type op_area
## 1 11 679 56 TYPE_A 3.7
## 2 94 632 34 TYPE_C 3.6
## 3 139 693 34 TYPE_C 2.8
## 4 143 562 77 TYPE_B 3.8
## 5 101 699 85 TYPE_C 2.8
## 6 32 526 34 TYPE_A 3.8

library(MASS)
library(dplyr)
# Hierarchical Clustering
# Features are op_area and city_code, centred and scaled column-wise before
# Euclidean distances are computed and merged with Ward's method (ward.D2).
# NOTE(review): city_code looks like an identifier rather than a measurement --
# confirm that clustering on its numeric value is intended.
cluster_data <- df %>% dplyr::select(op_area, city_code)
scaled_data <- scale(cluster_data, center = TRUE, scale = TRUE)
dist_matrix <- dist(x = scaled_data, method = "euclidean")
hc <- hclust(d = dist_matrix, method = "ward.D2")

# Dendrogram of the fitted tree; the x-axis label and subtitle are blanked
plot(
  x = hc,
  sub = "",
  xlab = "",
  main = "Dendrogram of Fulfillment Centers"
)

# Cut the tree into k groups and store each row's group label on df
k <- 4
clusters <- cutree(tree = hc, k = k)
df[["Cluster"]] <- as.factor(clusters)

# Cluster plot built from the scaled features and the cutree() labels
fviz_cluster(
  object = list(data = scaled_data, cluster = clusters),
  palette = "jco",
  ggtheme = theme_minimal(),
  main = "Hierarchical Clustering of Fulfillment Centers"
)

# Per-cluster mean and median of op_area plus the number of rows in each
cluster_summary <- summarise(
  group_by(df, Cluster),
  mean_op_area = mean(op_area, na.rm = TRUE),
  median_op_area = median(op_area, na.rm = TRUE),
  count = n()
)

print(cluster_summary)

## # A tibble: 4 × 4
## Cluster mean_op_area median_op_area count
## <fct> <dbl> <dbl> <int>
## 1 1 3.67 3.8 30
## 2 2 4.69 4.6 19
## 3 3 4 4 11
## 4 4 2.85 2.85 8

You might also like