0% found this document useful (0 votes)

4 views11 pages

Hierar Varam

The document outlines a hierarchical clustering analysis performed on a dataset of fulfillment centers, focusing on operational area and city code. It includes data loading, inspection, missing value handling, outlier detection, and visualization through histograms, boxplots, and a dendrogram. The final output summarizes the clusters formed, detailing their mean and median operational areas along with the count of centers in each cluster.

Uploaded by

akanaguhari

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as DOCX, PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

4 views11 pages

Hierar Varam

Uploaded by

akanaguhari

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as DOCX, PDF, TXT or read online on Scribd

You are on page 1/ 11

HIERARCHICAL CLUSTERING

24CSEG034

2025-04-04
# Load Required Libraries
library(MASS)
library(dplyr)

##
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':

##
## select

## The following objects are masked from 'package:stats':

##
## filter, lag

## The following objects are masked from 'package:base':

##
## intersect, setdiff, setequal, union

library(cluster)
library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at

https://goo.gl/ve3WBa

library(ggplot2)

# Load and inspect data ----
# file.choose() opens an interactive file picker, so this step needs a user
# at the console; the chosen CSV is parsed with its first row as the header.
df <- read.csv(file = file.choose(), header = TRUE)
head(df)

## center_id city_code region_code center_type op_area

## 1 11 679 56 TYPE_A 3.7
## 2 13 590 56 TYPE_B 6.7
## 3 124 590 56 TYPE_C 4.0
## 4 66 648 34 TYPE_A 4.1
## 5 94 632 34 TYPE_C 3.6
## 6 64 553 77 TYPE_A 4.4

tail(df)

## center_id city_code region_code center_type op_area

## 72 42 561 77 TYPE_B 3.9
## 73 53 590 56 TYPE_A 3.8
## 74 30 604 56 TYPE_A 3.5
## 75 76 614 85 TYPE_A 3.0
## 76 68 676 34 TYPE_B 4.1
## 77 51 638 56 TYPE_A 7.0

summary(df)

## center_id city_code region_code center_type

## Min. : 10.00 Min. :456.0 Min. :23.00 Length:77

## 1st Qu.: 50.00 1st Qu.:553.0 1st Qu.:34.00 Class :character

## Median : 77.00 Median :596.0 Median :56.00 Mode :character

## Mean : 83.14 Mean :600.7 Mean :56.49

## 3rd Qu.:110.00 3rd Qu.:651.0 3rd Qu.:77.00

## Max. :186.00 Max. :713.0 Max. :93.00

## op_area
## Min. :0.900
## 1st Qu.:3.500
## Median :3.900
## Mean :3.986
## 3rd Qu.:4.400
## Max. :7.000

str(df)

## 'data.frame': 77 obs. of 5 variables:

## $ center_id : int 11 13 124 66 94 64 129 139 88 143 ...
## $ city_code : int 679 590 590 648 632 553 593 693 526 562 ...
## $ region_code: int 56 56 56 34 34 77 77 34 34 77 ...
## $ center_type: chr "TYPE_A" "TYPE_B" "TYPE_C" "TYPE_A" ...
## $ op_area : num 3.7 6.7 4 4.1 3.6 4.4 3.9 2.8 4.1 3.8 ...

# 1. Missing values ----
# Count the NA entries column by column and print the tally.

cat("Missing values in each column:\n")

## Missing values in each column:

print(vapply(df, function(column) sum(is.na(column)), numeric(1)))

## center_id city_code region_code center_type op_area

## 0 0 0 0 0

# Drop every row that still contains an NA in any column.

df <- stats::na.omit(df)
# 2. Outlier Detection and Handling (Using IQR Method for 'op_area')
# Rows whose op_area lies more than 1.5 * IQR below Q1 or above Q3 are
# treated as outliers.
# names = FALSE returns the quartiles as plain numbers; otherwise they carry
# the labels "25%" / "75%", which the derived bounds would inherit.
Q1 <- quantile(df$op_area, 0.25, names = FALSE)
Q3 <- quantile(df$op_area, 0.75, names = FALSE)
# Deliberately not called `IQR`, so the variable does not shadow stats::IQR().
iqr_op_area <- Q3 - Q1
lower_bound <- Q1 - 1.5 * iqr_op_area
upper_bound <- Q3 + 1.5 * iqr_op_area

# Check outliers
# dplyr::filter is spelled out (as dplyr::select is further down) so the call
# does not depend on which attached package currently masks `filter`.
outliers <- df %>%
  dplyr::filter(op_area < lower_bound | op_area > upper_bound)
cat("Number of outliers in 'op_area':", nrow(outliers), "\n")

## Number of outliers in 'op_area': 9

# Optionally remove outliers
# Keeps rows inside the bounds, the bounds themselves included.

df <- df %>%
  dplyr::filter(op_area >= lower_bound & op_area <= upper_bound)

# Histograms
# One histogram per numeric column, 30 requested breaks each.
# Every title is a single-line string: a line break inside the quotes is part
# of the string and would be drawn as a newline in the plot title.
hist(df$op_area, breaks = 30, col = "blue",
     main = "Distribution of Operational Area", xlab = "Operational Area")
hist(df$region_code, breaks = 30, col = "orange",
     main = "Distribution of Region Code", xlab = "Region Code")
hist(df$center_id, breaks = 30, col = "black",
     main = "Distribution of Center ID", xlab = "Center ID")
hist(df$city_code, breaks = 30, col = "skyblue",
     main = "Distribution of City Code", xlab = "City Code")

# Boxplots
# One boxplot per numeric column.
# Titles are kept on a single line inside the quotes so that no literal
# newline ends up in the rendered plot title.
boxplot(df$op_area, col = "green",
        main = "Boxplot of Operational Area", xlab = "Operational Area")
boxplot(df$center_id, col = "blue",
        main = "Boxplot of Center ID", xlab = "Center ID")
boxplot(df$city_code, col = "yellow",
        main = "Boxplot of City Code", xlab = "City Code")
boxplot(df$region_code, col = "orange",
        main = "Boxplot of Region Code", xlab = "Region Code")
# Scatter plot ----
# Operational area (y) against city code (x), drawn as filled purple points.
with(df, plot(
  x = city_code,
  y = op_area,
  col = "purple",
  pch = 19,
  main = "Operational Area vs City Code",
  xlab = "City Code",
  ylab = "Operational Area"
))
# Data filtering and summary ----
# Mean and median of op_area on the current (outlier-filtered) data frame.
mean_op_area <- mean(df[["op_area"]], na.rm = TRUE)
median_op_area <- median(df[["op_area"]], na.rm = TRUE)

cat("Mean Operational Area:", mean_op_area, "\n")

## Mean Operational Area: 3.913235

cat("Median Operational Area:", median_op_area, "\n")

## Median Operational Area: 3.9

# "High" is measured against the mean and "low" against the median, so rows
# whose op_area falls between the two thresholds land in neither subset.
# NOTE(review): confirm the mixed thresholds are intentional.
high_demand_centers <- dplyr::filter(df, op_area > mean_op_area)

low_demand_centers <- dplyr::filter(df, op_area < median_op_area)

head(high_demand_centers)

## center_id city_code region_code center_type op_area

## 1 124 590 56 TYPE_C 4.0
## 2 66 648 34 TYPE_A 4.1
## 3 64 553 77 TYPE_A 4.4
## 4 88 526 34 TYPE_A 4.1
## 5 86 699 85 TYPE_C 4.0
## 6 152 576 34 TYPE_B 4.0

head(low_demand_centers)
## center_id city_code region_code center_type op_area
## 1 11 679 56 TYPE_A 3.7
## 2 94 632 34 TYPE_C 3.6
## 3 139 693 34 TYPE_C 2.8
## 4 143 562 77 TYPE_B 3.8
## 5 101 699 85 TYPE_C 2.8
## 6 32 526 34 TYPE_A 3.8

library(MASS)
library(dplyr)
# Hierarchical clustering ----
# Features: op_area and city_code, standardised with scale() before Euclidean
# distances are computed and clustered with Ward's method (ward.D2).
# NOTE(review): city_code looks like an identifier rather than a measurement;
# confirm that treating it as a numeric distance feature is intended.
cluster_data <- df %>% dplyr::select(op_area, city_code)
scaled_data <- scale(x = cluster_data)
dist_matrix <- dist(x = scaled_data, method = "euclidean")
hc <- hclust(d = dist_matrix, method = "ward.D2")

# Dendrogram, with the x-axis label and subtitle blanked out
plot(
  hc,
  main = "Dendrogram of Fulfillment Centers",
  xlab = "",
  sub = ""
)

# Assign clusters ----
# Cut the tree into k groups and attach the group label to each row as a
# factor column named Cluster.
k <- 4
clusters <- cutree(tree = hc, k = k)
df[["Cluster"]] <- as.factor(clusters)

# Visualize clusters ----
# fviz_cluster() is given the scaled feature matrix together with the cluster
# labels obtained from cutree().
fviz_cluster(
  object = list(data = scaled_data, cluster = clusters),
  main = "Hierarchical Clustering of Fulfillment Centers",
  palette = "jco",
  ggtheme = theme_minimal()
)

# Cluster summary ----
# One row per cluster: mean and median op_area plus the number of centers.
cluster_summary <- dplyr::summarise(
  dplyr::group_by(df, Cluster),
  mean_op_area = mean(op_area, na.rm = TRUE),
  median_op_area = median(op_area, na.rm = TRUE),
  count = dplyr::n()
)

print(cluster_summary)

## # A tibble: 4 × 4
## Cluster mean_op_area median_op_area count
## <fct> <dbl> <dbl> <int>
## 1 1 3.67 3.8 30
## 2 2 4.69 4.6 19
## 3 3 4 4 11
## 4 4 2.85 2.85 8

Modern Statistics With R
100% (3)
Modern Statistics With R
580 pages
EDAV
No ratings yet
EDAV
218 pages
Manual of Applied Spatial Ecology
No ratings yet
Manual of Applied Spatial Ecology
190 pages
R Practicals (2007 Version)
No ratings yet
R Practicals (2007 Version)
15 pages
R Programming Language Notes
No ratings yet
R Programming Language Notes
8 pages
Dplyr Manual
No ratings yet
Dplyr Manual
71 pages
Package Openair': April 12, 2017
No ratings yet
Package Openair': April 12, 2017
171 pages
Package Dismo': R Topics Documented
No ratings yet
Package Dismo': R Topics Documented
68 pages
Psych R Package
No ratings yet
Psych R Package
412 pages
R Project Master Health Data Science
No ratings yet
R Project Master Health Data Science
32 pages
ArunSrinivasanSatRdaysBudapest2016 PDF
No ratings yet
ArunSrinivasanSatRdaysBudapest2016 PDF
51 pages
R - Analysis
No ratings yet
R - Analysis
26 pages
R Note
No ratings yet
R Note
56 pages
Exploratory Data Analysis With R
No ratings yet
Exploratory Data Analysis With R
218 pages
Notes-US Census Data
No ratings yet
Notes-US Census Data
12 pages
WS3 Geographic
100% (1)
WS3 Geographic
18 pages
Summarizing Data
No ratings yet
Summarizing Data
20 pages
Important R Codes and Notes
No ratings yet
Important R Codes and Notes
13 pages
Exp - 2-EDA - CaliforniaData Set - HeatMap - PairPlot-checkpoint - Jupyter Notebook
No ratings yet
Exp - 2-EDA - CaliforniaData Set - HeatMap - PairPlot-checkpoint - Jupyter Notebook
12 pages
R Record-1
No ratings yet
R Record-1
53 pages
Reshape2 - R - Flexibly Reshape Data - A Reboot of The Reshape Package
No ratings yet
Reshape2 - R - Flexibly Reshape Data - A Reboot of The Reshape Package
14 pages
Book - Roger D Peng-Exploratory Data Analysis With R-Leanpub (2015) PDF
No ratings yet
Book - Roger D Peng-Exploratory Data Analysis With R-Leanpub (2015) PDF
125 pages
Srvyr R Package Documentation
No ratings yet
Srvyr R Package Documentation
45 pages
DALab Part-B BCU&BU
No ratings yet
DALab Part-B BCU&BU
12 pages
HW 1 Math 380 R Code
No ratings yet
HW 1 Math 380 R Code
4 pages
Book - Roger D Peng-Exploratory Data Analysis With R-Leanpub (2015) PDF
0% (1)
Book - Roger D Peng-Exploratory Data Analysis With R-Leanpub (2015) PDF
125 pages
Clusterig
No ratings yet
Clusterig
6 pages
R Functions
No ratings yet
R Functions
8 pages
SPPUML6
No ratings yet
SPPUML6
9 pages
ML Practical 4D
No ratings yet
ML Practical 4D
11 pages
Exdata
No ratings yet
Exdata
184 pages
Normialization Dataset
No ratings yet
Normialization Dataset
7 pages
Exploratory Data Analysis With R-Leanpub PDF
No ratings yet
Exploratory Data Analysis With R-Leanpub PDF
125 pages
Matrix, Dataframes, List
No ratings yet
Matrix, Dataframes, List
8 pages
R Programs 2024-2025
No ratings yet
R Programs 2024-2025
13 pages
DATAMINING
No ratings yet
DATAMINING
24 pages
As 2
No ratings yet
As 2
10 pages
ML Observation
No ratings yet
ML Observation
29 pages
Clustering Documentation Python Code
No ratings yet
Clustering Documentation Python Code
8 pages
Home Credit Data
No ratings yet
Home Credit Data
6 pages
R File Code
No ratings yet
R File Code
16 pages
Cluster R
No ratings yet
Cluster R
1 page
Ancova: R Markdown
No ratings yet
Ancova: R Markdown
6 pages
Exploratory Data Analysis With R PDF
No ratings yet
Exploratory Data Analysis With R PDF
125 pages
Project Template Notebook Ipynb 1
No ratings yet
Project Template Notebook Ipynb 1
23 pages
Air Quality Randomforest
No ratings yet
Air Quality Randomforest
5 pages
RSTUDIO
No ratings yet
RSTUDIO
44 pages
Sales Data Clustering
No ratings yet
Sales Data Clustering
15 pages
BDA Practical01
No ratings yet
BDA Practical01
5 pages
Schematic Model Manager User Guide
50% (2)
Schematic Model Manager User Guide
244 pages
Fds QB
No ratings yet
Fds QB
6 pages
Hard Disk Basic
100% (1)
Hard Disk Basic
27 pages
Welcome To The Learning Unit On: T24 Application Structure and Files
100% (2)
Welcome To The Learning Unit On: T24 Application Structure and Files
25 pages
ICT Paper 1
No ratings yet
ICT Paper 1
304 pages
(Ebook PDF) Business Statistics STAT150 A Custom Edition Instant Download
100% (2)
(Ebook PDF) Business Statistics STAT150 A Custom Edition Instant Download
55 pages
Jameel Ur Rehman 8604 1
No ratings yet
Jameel Ur Rehman 8604 1
18 pages
Ale HDP Organizational Placement2019
No ratings yet
Ale HDP Organizational Placement2019
18 pages
Change Management Guiding Principles: User Focused Minimize Impact On Productivity Build Credibility & Trust
No ratings yet
Change Management Guiding Principles: User Focused Minimize Impact On Productivity Build Credibility & Trust
9 pages
Palanca PR2 1J2J3
No ratings yet
Palanca PR2 1J2J3
21 pages
Sap Hana:: OLTP: Simple Queries Like INSERT, UPDATE, DELETE Etc
No ratings yet
Sap Hana:: OLTP: Simple Queries Like INSERT, UPDATE, DELETE Etc
6 pages
Learn Enough Command Line To Be Dangerous
No ratings yet
Learn Enough Command Line To Be Dangerous
3 pages
CS3492 Syllabus
No ratings yet
CS3492 Syllabus
2 pages
Hamming Code
0% (2)
Hamming Code
6 pages
Research 8 Grade 8 Melc 4 6 q4
No ratings yet
Research 8 Grade 8 Melc 4 6 q4
17 pages
Module 1 - Data Representation, and Data Structures-1
No ratings yet
Module 1 - Data Representation, and Data Structures-1
20 pages
Question Bank
No ratings yet
Question Bank
18 pages
TNM-Malawi-Wireless - CSFB Failures in LTE Network
No ratings yet
TNM-Malawi-Wireless - CSFB Failures in LTE Network
25 pages
Chainway Communication Protocol For CW-801 & 701
No ratings yet
Chainway Communication Protocol For CW-801 & 701
34 pages
3-PPT - Chapter 2 Concept of Quantative and Qualititive Research
No ratings yet
3-PPT - Chapter 2 Concept of Quantative and Qualititive Research
27 pages
Data Structure and Algorithms - Stack
No ratings yet
Data Structure and Algorithms - Stack
3 pages
CGMB 234: Multimedia Systems Design
No ratings yet
CGMB 234: Multimedia Systems Design
33 pages
Lab Report: Project - Water Pressure Rocket
No ratings yet
Lab Report: Project - Water Pressure Rocket
3 pages
Ms Access Lab
No ratings yet
Ms Access Lab
5 pages
Kuis 11
No ratings yet
Kuis 11
5 pages
PW 1 IMRAN DTK3A (New)
No ratings yet
PW 1 IMRAN DTK3A (New)
12 pages
Vss Cheat Sheet
No ratings yet
Vss Cheat Sheet
1 page
Replication of Views With SAP LT Replication Server: Projection View
No ratings yet
Replication of Views With SAP LT Replication Server: Projection View
10 pages
Internet and Web Programming - CSE3002 Digital Assignment - 1
No ratings yet
Internet and Web Programming - CSE3002 Digital Assignment - 1
11 pages
1Z0-047 Oracle Database SQL Certified Expert
No ratings yet
1Z0-047 Oracle Database SQL Certified Expert
8 pages
GS1 GLN Executive Summary
No ratings yet
GS1 GLN Executive Summary
1 page
Advanced C Concepts and Programming: First Edition
From Everand
Advanced C Concepts and Programming: First Edition
Gayatri
3/5 (1)
Computer Science, Career and Job
From Everand
Computer Science, Career and Job
Ramkrishna Ghosh
No ratings yet
Data Science Programming In Python
From Everand
Data Science Programming In Python
Anita Raichand
No ratings yet
The Essential R Reference
From Everand
The Essential R Reference
Mark Gardener
No ratings yet
"C Programming for Beginners: A Step-by-Step Guide"
From Everand
"C Programming for Beginners: A Step-by-Step Guide"
Lov kush
No ratings yet
Basic Information About C language PDF
From Everand
Basic Information About C language PDF
Suraj Das
No ratings yet
Profound Python Data Science
From Everand
Profound Python Data Science
Onder Teker
No ratings yet
Computer Engineering Laboratory Solution Primer
From Everand
Computer Engineering Laboratory Solution Primer
Karan Bhandari
No ratings yet
TensorFlow深度学习项目实战: Chinese Edition
From Everand
TensorFlow深度学习项目实战: Chinese Edition
Posts & Telecom Press
No ratings yet
Microsoft Visual Basic Interview Questions: Microsoft VB Certification Review
From Everand
Microsoft Visual Basic Interview Questions: Microsoft VB Certification Review
Equity Press
No ratings yet

Hierar Varam

Uploaded by

Hierar Varam

Uploaded by

HIERARCHICAL CLUSTERING

## The following object is masked from 'package:MASS':

## The following objects are masked from 'package:stats':

## The following objects are masked from 'package:base':

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at

# Load and Inspect Data

## center_id city_code region_code center_type op_area

## center_id city_code region_code center_type op_area

## center_id city_code region_code center_type

## Min. : 10.00 Min. :456.0 Min. :23.00 Length:77

## 1st Qu.: 50.00 1st Qu.:553.0 1st Qu.:34.00 Class :character

## Median : 77.00 Median :596.0 Median :56.00 Mode :character

## Mean : 83.14 Mean :600.7 Mean :56.49

## 3rd Qu.:110.00 3rd Qu.:651.0 3rd Qu.:77.00

## Max. :186.00 Max. :713.0 Max. :93.00

## 'data.frame': 77 obs. of 5 variables:

# 1. Identify Missing Values

## Missing values in each column:

## center_id city_code region_code center_type op_area

# Handle missing values (option: remove rows with NA)

## Number of outliers in 'op_area': 9

# Optionally remove outliers

hist(df$region_code, breaks = 30, col = "orange", main = "Distribution

boxplot(df$region_code, col = "orange", main = "Boxplot of Region

cat("Mean Operational Area:", mean_op_area, "\n")

## Mean Operational Area: 3.913235

cat("Median Operational Area:", median_op_area, "\n")

## Median Operational Area: 3.9

high_demand_centers <- df %>% filter(op_area > mean_op_area)

## center_id city_code region_code center_type op_area

You might also like