0% found this document useful (0 votes)
5 views · 6 pages

Task 1

The document is an R script that processes a CSV file containing purchase behavior data. It summarizes the data, performs outlier detection and filtering on spending, and calculates metrics for customer segments based on 'LIFESTAGE' and 'PREMIUM_CUSTOMER'. Finally, it outputs insights and saves the metrics to a new CSV file.

Uploaded by

ria s
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
5 views · 6 pages

Task 1

The document is an R script that processes a CSV file containing purchase behavior data. It summarizes the data, performs outlier detection and filtering on spending, and calculates metrics for customer segments based on 'LIFESTAGE' and 'PREMIUM_CUSTOMER'. Finally, it outputs insights and saves the metrics to a new CSV file.

Uploaded by

ria s
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 6

library(dplyr)

library(ggplot2)
library(stringr)

# Load the purchase-behaviour extract into `data`, keeping text columns as
# character vectors. The path must be a single-line string: a line break
# inside the literal becomes part of the file name and the read fails.
data <- read.csv(
  "C:/Users/Ria S/Downloads/QVI_purchase_behaviour.csv",
  stringsAsFactors = FALSE
)

# Print a quick overview of the loaded data: per-column summary statistics
# followed by the structure (column types and leading values).
cat("Data summary:\n")
print(summary(data))
cat("\nData structure:\n")
# str() prints the structure itself and returns NULL invisibly, so wrapping it
# in print() would emit a stray "NULL" line after the structure.
str(data)

# Convert the purchase date column to Date class, but only when the loaded
# data actually has such a column.
if (is.element("purchase_date", colnames(data))) {
  data[["purchase_date"]] <- as.Date(data[["purchase_date"]])
}

# Draw one boxplot per measure that is present in the data (spending first,
# then quantity). local() keeps the helper variables out of the script's
# workspace.
local({
  boxplot_titles <- c(
    spending = "Boxplot of Spending",
    quantity = "Boxplot of Quantity Purchased"
  )
  for (measure in names(boxplot_titles)) {
    if (measure %in% names(data)) {
      boxplot(data[[measure]], main = boxplot_titles[[measure]])
    }
  }
})

# Remove spending outliers: keep rows whose spending lies within 1.5 times the
# interquartile range below the first quartile and above the third quartile.
# Skipped entirely when the data has no `spending` column.
if ("spending" %in% names(data)) {
  Q1 <- quantile(data$spending, 0.25, na.rm = TRUE)
  Q3 <- quantile(data$spending, 0.75, na.rm = TRUE)
  # Named spending_iqr rather than IQR so the script does not mask stats::IQR().
  spending_iqr <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * spending_iqr
  upper_bound <- Q3 + 1.5 * spending_iqr

  # filter() keeps only rows where the condition is TRUE, so rows with a
  # missing spending value are dropped together with the outliers.
  data <- data %>%
    filter(spending >= lower_bound & spending <= upper_bound)

  # The format string is kept on one line; line breaks inside the literal
  # would be printed as part of the message.
  cat(sprintf(
    "Data filtered to remove spending outliers outside [%.2f, %.2f]\n",
    lower_bound,
    upper_bound
  ))
}

# Derive two helper columns from the product name, when that column exists.
# The column reference must be written as one token (`data$product_name`);
# splitting the name across lines is a syntax error.
if ("product_name" %in% names(data)) {
  # pack_size: the first run of digits found in the product name, as a number
  # (NA when the name contains no digits).
  data$pack_size <- as.numeric(str_extract(data$product_name, "\\d+"))
  # brand_name: the leading characters up to the first space or underscore.
  data$brand_name <- str_extract(data$product_name, "^[^_ ]+")
}

# Summarise purchasing behaviour per customer segment, where a segment is one
# combination of LIFESTAGE and PREMIUM_CUSTOMER. The result is stored in
# `metrics`, sorted by average spending (highest first).
required_segments <- c("LIFESTAGE", "PREMIUM_CUSTOMER")

# Columns the summary below reads. Every earlier step treats them as
# optional, so their presence has to be checked here as well; otherwise
# summarise() stops with an "object not found" error.
metric_inputs <- c("spending", "quantity", "pack_size")
missing_inputs <- setdiff(metric_inputs, names(data))

# Defined up front so later steps can test it even when no summary is built.
metrics <- NULL

if (!all(required_segments %in% names(data))) {
  cat("Warning: Required segmentation columns not found in data\n")
} else if (length(missing_inputs) > 0) {
  cat(sprintf(
    "Warning: Columns needed for segment metrics not found in data: %s\n",
    paste(missing_inputs, collapse = ", ")
  ))
} else {
  metrics <- data %>%
    group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>%
    summarise(
      avg_spending = mean(spending, na.rm = TRUE),
      purchase_frequency = n(),
      total_quantity = sum(quantity, na.rm = TRUE),
      avg_pack_size = mean(pack_size, na.rm = TRUE),
      # Return an ungrouped table instead of one still grouped by LIFESTAGE.
      .groups = "drop"
    ) %>%
    arrange(desc(avg_spending))

  print(metrics)
}

# Print the list of follow-up questions for the analyst. Each bullet is built
# from pieces joined with sep = "" so the source stays within the line limit
# while the printed sentence contains no line breaks in the middle.
cat("\nInsights:\n")
cat(
  "- Identify customer segments with highest spending and purchase ",
  "frequency.\n",
  sep = ""
)
cat(
  "- Observe if premium customers spend more and if families with children ",
  "show different patterns.\n",
  sep = ""
)
cat(
  "- Look at preferred pack sizes and brand preferences across ",
  "segments.\n",
  sep = ""
)

# Save the segment metrics to a CSV file in the working directory. The write
# is skipped when no metrics table was produced, because write.csv() would
# otherwise fail on a missing object. The file name is a single-line string so
# that no line break ends up inside the name.
if (exists("metrics", inherits = FALSE) && is.data.frame(metrics)) {
  write.csv(metrics, "chip_customer_segment_metrics.csv", row.names = FALSE)
} else {
  cat("Warning: No segment metrics available; CSV file not written\n")
}

You might also like