
Code Feature

The document outlines a PySpark script that processes invoice data to analyze sales metrics, including adjustments for returns and aggregating monthly sales data. It employs various transformations and window functions to calculate features such as growth rates, rolling averages, and zero-sales metrics, ultimately preparing the data for clustering analysis. The script concludes with K-means clustering to group products based on sales performance, evaluating the clustering effectiveness using silhouette scores.

from pyspark.sql import SparkSession


from pyspark.sql.functions import (
    col, date_format, concat_ws, sum as spark_sum, avg, stddev, count,
    lag, lead, when, lit, lower, trim,
    array, to_date, isnan, min, max, row_number, greatest,
    array_contains, exists, percent_rank
)
from pyspark.sql.window import Window
from pyspark.sql.types import DoubleType
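
# The spark.sql(...) reads below assume an already-active `spark` session (as in a
# Databricks or notebook environment). For a standalone run, a minimal sketch for
# creating one could look like this; the app name is illustrative, and
# enableHiveSupport() is only needed because the script reads metastore tables.
spark = (
    SparkSession.builder
    .appName("invoice-sales-clustering")
    .enableHiveSupport()
    .getOrCreate()
)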

df_inv = spark.sql("select * from sharepoint_db.inv_new")

df_mas = spark.sql("select * from sharepoint_db.product_master_data")

# STEP 1: Select and rename relevant columns from raw invoice table
inv_selected = df_inv.select(
col("Invoice No").alias("inv_no"),
col("Order Line No").alias("order_line_no"),
lower(trim(col("Customer Code"))).alias("cust_code"),
col("Material Group").alias("product"),
col("Quantity").alias("quantity"),
col("Unit Price").alias("unit_price"),
col("Net Sales").alias("net_sales"),
col("Original Document").alias("return_inv"),
col("Invoice Date").alias("invoice_date"),
col("Payment Days").alias("payment_days")
)

# STEP 2: Separate original and return invoices


original_inv = inv_selected.filter(
    trim(col("return_inv")).isNull() | (trim(col("return_inv")) == "")
)

return_inv = inv_selected.filter(
    trim(col("return_inv")).isNotNull() & (trim(col("return_inv")) != "")
)\
.withColumnRenamed("inv_no", "ret_inv_no")\
.withColumnRenamed("order_line_no", "ret_order_line_no")\
.withColumnRenamed("cust_code", "ret_cust_code")\
.withColumnRenamed("product", "ret_product")\
.withColumnRenamed("quantity", "ret_quantity")\
.withColumnRenamed("unit_price", "ret_unit_price")\
.withColumnRenamed("net_sales", "ret_net_sales")\
.withColumnRenamed("return_inv", "ret_return_inv")\
.withColumnRenamed("invoice_date", "ret_invoice_date")\
.withColumnRenamed("payment_days", "ret_payment_days")

# STEP 3: Filter valid return invoices (those whose return reference exists in original)
valid_return_inv = return_inv.join(
original_inv.select("inv_no").distinct(),
return_inv.ret_return_inv == original_inv.inv_no,
how="inner"
).drop(original_inv.inv_no)

# STEP 4: Join original and return invoices


adjusted_inv = original_inv.alias("orig").join(
valid_return_inv.alias("ret"),
(col("orig.inv_no") == col("ret.ret_return_inv")) &
(col("orig.order_line_no") == col("ret.ret_order_line_no")) &
(col("orig.cust_code") == col("ret.ret_cust_code")),
how="left"
)

# STEP 5: Adjust quantity and net sales


adjusted_inv = adjusted_inv.withColumn(
"adjusted_quantity",
when(
col("ret.ret_unit_price").isNotNull() &
(col("orig.unit_price") != -col("ret.ret_unit_price")) &
(col("orig.quantity") == -col("ret.ret_quantity")),
col("orig.quantity")
).when(
col("ret.ret_quantity").isNotNull(),
col("orig.quantity") + col("ret.ret_quantity")
).otherwise(col("orig.quantity"))
)

adjusted_inv = adjusted_inv.withColumn(
"adjusted_net_sales",
when(
col("ret.ret_net_sales").isNotNull(),
col("orig.net_sales") + col("ret.ret_net_sales")
).otherwise(col("orig.net_sales"))
)
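
# Illustration only (a minimal sketch on hypothetical rows, not part of the pipeline):
# an original line of quantity 10 / net sales 1000 matched by a return of -4 / -400
# nets out to adjusted_quantity 6 and adjusted_net_sales 600, while an unmatched line
# passes through unchanged. The unit-price special case above is omitted here.
_demo = spark.createDataFrame(
    [(10.0, 1000.0, -4.0, -400.0),   # partially returned line
     (5.0, 500.0, None, None)],      # no matching return
    ["quantity", "net_sales", "ret_quantity", "ret_net_sales"]
)
_demo.withColumn(
    "adjusted_quantity",
    when(col("ret_quantity").isNotNull(), col("quantity") + col("ret_quantity"))
    .otherwise(col("quantity"))
).withColumn(
    "adjusted_net_sales",
    when(col("ret_net_sales").isNotNull(), col("net_sales") + col("ret_net_sales"))
    .otherwise(col("net_sales"))
).show()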

# STEP 6: Remove fully returned items


final_inv = adjusted_inv.filter(col("adjusted_net_sales") != 0)

# STEP 7: Final clean DataFrame with proper naming


inv_df = final_inv.select(
col("orig.inv_no").alias("Invoice No"),
col("orig.order_line_no").alias("Order Line No"),
col("orig.cust_code").alias("Customer Code"),
col("orig.product").alias("Material Group"),
col("adjusted_quantity").alias("Quantity"),
col("orig.unit_price").alias("Unit Price"),
col("adjusted_net_sales").alias("Net Sales"),
col("orig.invoice_date").alias("Invoice Date"),
col("orig.payment_days").alias("Payment Days"),
col("ret.ret_inv_no").alias("Returned By")
)

joined_df = inv_df.join(df_mas, on="Material Group", how="inner")

df = joined_df.withColumn("invoice_date_parsed", to_date(col("Invoice Date"),


"dd.MM.yyyy"))
df = df.withColumn("invoice_month", date_format(col("invoice_date_parsed"), "yyyy-
MM"))

df = df.withColumn("product_bu_id", concat_ws("_", col("Material"), col("SBU


Code")))

df = df.withColumn(
    "Actual_Quantity",
    when(col("Quantity") > 0, col("Quantity")).otherwise(lit(0).cast(DoubleType()))
).withColumn(
    "Return_Quantity",
    when(col("Quantity") < 0, -col("Quantity")).otherwise(lit(0).cast(DoubleType()))
).withColumn(
    "Net_Sales_Positive",
    # "Net Sales" is the adjusted sales column produced in STEP 7
    when(col("Net Sales") > 0, col("Net Sales")).otherwise(lit(0).cast(DoubleType()))
).withColumn(
    "Net_Sales_Negative",
    when(col("Net Sales") < 0, -col("Net Sales")).otherwise(lit(0).cast(DoubleType()))
)

monthly_agg_df = df.groupBy("invoice_month", "product_bu_id").agg(
    spark_sum("Actual_Quantity").alias("monthly_sales_qty"),
    spark_sum("Net_Sales_Positive").alias("monthly_sales_value")
).orderBy("product_bu_id", "invoice_month")

# Convert Month to Date Type


monthly_agg_df = monthly_agg_df.withColumn(
"invoice_month_date",
to_date(col("invoice_month"), "yyyy-MM")
)

# Window per product-BU ordered by month, plus a static window over each product-BU's full history
window_1 = Window.partitionBy("product_bu_id").orderBy("invoice_month_date")  # Time-ordered
window_2 = Window.partitionBy("product_bu_id")  # Static stats

# lag features - sales 1m, 2m, 3m ago
monthly_agg_df = monthly_agg_df.withColumn("lag_1", lag("monthly_sales_qty", 1).over(window_1))
monthly_agg_df = monthly_agg_df.withColumn("lag_2", lag("monthly_sales_qty", 2).over(window_1))
monthly_agg_df = monthly_agg_df.withColumn("lag_3", lag("monthly_sales_qty", 3).over(window_1))

# rolling avg - avg sales of last 3m (current month and the two preceding)
monthly_agg_df = monthly_agg_df.withColumn(
    "rolling_3m_avg",
    avg("monthly_sales_qty").over(window_1.rowsBetween(-2, 0))
)

# growth rate - compared to previous month
monthly_agg_df = monthly_agg_df.withColumn(
    "growth_rate",
    when(col("lag_1") > 0,
         (col("monthly_sales_qty") - col("lag_1")) / col("lag_1")).otherwise(lit(None))
)

# Static Statistics per Product-BU


monthly_agg_df = monthly_agg_df.withColumn(
"avg_monthly_sales_qty",
avg("monthly_sales_qty").over(window_2)
).withColumn(
"std_monthly_sales_qty",
stddev("monthly_sales_qty").over(window_2)
)

# Zero-Sales Metrics
monthly_agg_df = monthly_agg_df.withColumn(
"zero_sales",
when(col("monthly_sales_qty") == 0, 1).otherwise(0)
).withColumn(
"zero_sales_months",
spark_sum("zero_sales").over(window_2)
).withColumn(
"total_months",
count("monthly_sales_qty").over(window_2)
).withColumn(
"zero_sales_percent",
col("zero_sales_months") / col("total_months")
)
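
# Worked example: a product-BU with 3 zero-sales months out of 12 observed months
# ends up with zero_sales_months = 3, total_months = 12, zero_sales_percent = 0.25.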

# Average Growth Rate per Product-BU


monthly_agg_df = monthly_agg_df.withColumn(
"avg_growth_rate",
avg("growth_rate").over(window_2)
)

monthly_agg_df = monthly_agg_df.withColumn(
    "monthly_avg_price",
    when(col("monthly_sales_qty") != 0,
         col("monthly_sales_value") / col("monthly_sales_qty")).otherwise(lit(0))
)

# Lag Price Features
monthly_agg_df = monthly_agg_df.withColumn("price_lag_1", lag("monthly_avg_price", 1).over(window_1))
monthly_agg_df = monthly_agg_df.withColumn("price_lag_2", lag("monthly_avg_price", 2).over(window_1))
monthly_agg_df = monthly_agg_df.withColumn("price_lag_3", lag("monthly_avg_price", 3).over(window_1))

# Replace Null Lag Values


for col_name in ["price_lag_1", "price_lag_2", "price_lag_3"]:
    monthly_agg_df = monthly_agg_df.withColumn(
        col_name,
        when(col(col_name).isNull(), lit(0)).otherwise(col(col_name))
    )

# Rolling Average of Price (Last 3 Months)


monthly_agg_df = monthly_agg_df.withColumn(
"rolling_price_3m_avg",
avg("monthly_avg_price").over(window_1.rowsBetween(-2, 0))
)

monthly_agg_df = monthly_agg_df.withColumn(
"price_growth_rate",
when(col("price_lag_1") > 0,
(col("monthly_avg_price") - col("price_lag_1")) / col("price_lag_1"))
.otherwise(lit(0))
)

monthly_agg_df = monthly_agg_df.withColumn(
"avg_monthly_price",
avg("monthly_avg_price").over(window_2)
).withColumn(
"std_monthly_price",
stddev("monthly_avg_price").over(window_2)
)

col_name="growth_rate"
monthly_agg_df = monthly_agg_df.withColumn(
col_name,
when(col(col_name).isNull(), lit(0)).otherwise(col(col_name))
)

col_name="avg_growth_rate"
monthly_agg_df = monthly_agg_df.withColumn(
col_name,
when(col(col_name).isNull(), lit(0)).otherwise(col(col_name))
)

# Target Variables (Next 3 Months)


for i in [1, 2, 3]:
    monthly_agg_df = monthly_agg_df.withColumn(
        f"y_{i}",
        lead("monthly_sales_qty", i).over(window_1)
    )

# Drop Rows with Missing Targets


monthly_agg_df = monthly_agg_df.na.drop(subset=["y_1", "y_2", "y_3"])

# Combine Targets into Array


monthly_agg_df = monthly_agg_df.withColumn(
"target",
array("y_1", "y_2", "y_3")
).drop("y_1", "y_2", "y_3")
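
# Optional sanity check (not in the original flow): each target row should hold the
# next three months of sales quantity for that product-BU.
monthly_agg_df.select("product_bu_id", "invoice_month", "target").show(5, truncate=False)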

from pyspark.sql.functions import avg, stddev, max, min, col

cluster_features_df = monthly_agg_df.groupBy("product_bu_id").agg(
avg("monthly_sales_qty").alias("avg_sales"),
stddev("monthly_sales_qty").alias("std_sales"),
avg("growth_rate").alias("avg_growth_rate"),
avg("zero_sales_percent").alias("zero_sales_ratio"),
max("monthly_sales_qty").alias("peak_sales"),
min("monthly_sales_qty").alias("min_sales"),
avg("monthly_avg_price").alias("avg_price"),
stddev("monthly_avg_price").alias("std_price"),
avg("price_growth_rate").alias("avg_price_growth")
).withColumn("cv_sales", col("std_sales") / col("avg_sales")) \
.withColumn("cv_price", col("std_price") / col("avg_price")) \
.fillna({
"avg_growth_rate": 0,
"std_sales": 0,
"cv_sales": 0,
"zero_sales_ratio": 0,
"avg_price": 0,
"std_price": 0,
"avg_price_growth": 0,
"cv_price": 0
})

features = [
"avg_sales", "std_sales", "avg_growth_rate", "zero_sales_ratio",
"peak_sales", "min_sales", "cv_sales",
"avg_price", "std_price", "avg_price_growth", "cv_price"
]

# Apply percent_rank for each feature


for feature in features:
    rank_col = f"{feature}_rank"
    w = Window.orderBy(col(feature))
    cluster_features_df = cluster_features_df.withColumn(rank_col, percent_rank().over(w))

# Select product_bu_id and all rank columns


rank_cols = [f"{f}_rank" for f in features]
cluster_input_df = cluster_features_df.select(["product_bu_id"] + rank_cols)
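
# Illustration only (a minimal sketch on hypothetical values): percent_rank maps each
# value to its relative rank in [0, 1], which puts all cluster features on a
# comparable scale before standardisation and clustering.
_rank_demo = spark.createDataFrame([(10.0,), (20.0,), (30.0,), (40.0,), (50.0,)], ["value"])
_rank_demo.withColumn("value_rank", percent_rank().over(Window.orderBy(col("value")))).show()
# value 10 -> 0.0, 20 -> 0.25, 30 -> 0.5, 40 -> 0.75, 50 -> 1.0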

from pyspark.ml.feature import VectorAssembler, StandardScaler

# Step 1: Assemble features into a single vector


assembler = VectorAssembler(
    inputCols=[
        "avg_sales_rank", "std_sales_rank", "avg_growth_rate_rank",
        "zero_sales_ratio_rank", "peak_sales_rank", "min_sales_rank",
        "cv_sales_rank", "avg_price_rank", "std_price_rank",
        "avg_price_growth_rank", "cv_price_rank"
    ],
    outputCol="features_unscaled"
)
assembled_data = assembler.transform(cluster_input_df)

# Step 2: Scale features by their standard deviation
# (StandardScaler defaults to withStd=True, withMean=False; pass withMean=True for full mean=0, std=1 standardisation)
scaler = StandardScaler(inputCol="features_unscaled", outputCol="scaled_features")
scaler_model = scaler.fit(assembled_data)
scaled_data = scaler_model.transform(assembled_data)

from pyspark.ml.clustering import KMeans

# Define range of K values to test


k_values = range(2, 10)
wcss = []

# Compute WCSS for each K


for k in k_values:
    kmeans = KMeans(k=k, seed=42).setFeaturesCol("scaled_features")
    model = kmeans.fit(scaled_data)
    wcss.append(model.summary.trainingCost)

# Plot Elbow Chart


import matplotlib.pyplot as plt
plt.figure(figsize=(8, 5))
plt.plot(k_values, wcss, marker='o', linestyle='--')
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Within-Cluster Sum of Squares (WCSS)")
plt.title("Elbow Method for Optimal K")
plt.grid(True)
plt.show()

from pyspark.ml.clustering import KMeans

# Apply K-means clustering
kmeans = KMeans(k=6, seed=42).setFeaturesCol("scaled_features")  # Adjust k=6 via elbow chart
model = kmeans.fit(scaled_data)
clustered_products = model.transform(scaled_data).select("product_bu_id", "prediction")

from pyspark.ml.evaluation import ClusteringEvaluator

# Apply the model to the full scaled_data, not just select columns
clustered_data = model.transform(scaled_data)

# Create evaluator
evaluator = ClusteringEvaluator(featuresCol='scaled_features', metricName='silhouette',
                                distanceMeasure='squaredEuclidean')

# Evaluate silhouette score


silhouette_score = evaluator.evaluate(clustered_data)
print(f"Silhouette Score: {silhouette_score}")
