# Code Feature
# STEP 1: Select and rename relevant columns from raw invoice table.
# cust_code is normalised (trimmed + lower-cased) so joins against other
# customer tables are case/whitespace insensitive.
_invoice_columns = [
    col("Invoice No").alias("inv_no"),
    col("Order Line No").alias("order_line_no"),
    lower(trim(col("Customer Code"))).alias("cust_code"),
    col("Material Group").alias("product"),
    col("Quantity").alias("quantity"),
    col("Unit Price").alias("unit_price"),
    col("Net Sales").alias("net_sales"),
    col("Original Document").alias("return_inv"),
    col("Invoice Date").alias("invoice_date"),
    col("Payment Days").alias("payment_days"),
]
inv_selected = df_inv.select(*_invoice_columns)
# STEP 3: Filter valid return invoices — keep only return rows whose
# "Original Document" reference points at an invoice number that actually
# exists in the original-invoice set.
# (The wrapped comment in the pasted source left a bare `original)` token
# on its own line, which was a syntax error; only the comment is repaired.)
valid_return_inv = return_inv.join(
    original_inv.select("inv_no").distinct(),  # dedupe keys before joining
    return_inv.ret_return_inv == original_inv.inv_no,
    how="inner"
).drop(original_inv.inv_no)  # drop the duplicated join key from the result
# Net sales adjusted for returns: when a matching return line exists
# (ret_net_sales present, typically negative) add it to the original
# amount; otherwise keep the original net sales untouched.
adjusted_inv = adjusted_inv.withColumn(
    "adjusted_net_sales",
    when(col("ret.ret_net_sales").isNull(), col("orig.net_sales"))
    .otherwise(col("orig.net_sales") + col("ret.ret_net_sales"))
)
# Split signed quantity / net value into positive-only column pairs:
#   Actual_Quantity / Net_Sales_Positive  -> sales  (positive values)
#   Return_Quantity / Net_Sales_Negative  -> returns (sign flipped to
#     positive so downstream aggregations can simply sum them)
# Fix: the pasted source had the string literal "Net Value" broken across
# two lines (a syntax error); the literal is restored, and the repeated
# typed-zero default is hoisted into one expression.
_zero = lit(0).cast(DoubleType())
df = (
    df.withColumn(
        "Actual_Quantity",
        when(col("Quantity") > 0, col("Quantity")).otherwise(_zero),
    )
    .withColumn(
        "Return_Quantity",
        when(col("Quantity") < 0, -col("Quantity")).otherwise(_zero),
    )
    .withColumn(
        "Net_Sales_Positive",
        when(col("Net Value") > 0, col("Net Value")).otherwise(_zero),
    )
    .withColumn(
        "Net_Sales_Negative",
        when(col("Net Value") < 0, -col("Net Value")).otherwise(_zero),
    )
)
# Time-ordered window per product/BU — for lags and other
# month-over-month calculations. (The original trailing comment was
# wrapped onto its own line, leaving a bare `Time-ordered` syntax error.)
window_1 = Window.partitionBy("product_bu_id").orderBy("invoice_month_date")
# Unordered window per product/BU — for static whole-history statistics.
window_2 = Window.partitionBy("product_bu_id")
# Zero-sales metrics: flag months with no sales, then compute per
# product/BU how many such months exist and what fraction of the
# observed months they represent.
monthly_agg_df = monthly_agg_df.withColumn(
    "zero_sales",
    when(col("monthly_sales_qty") == 0, 1).otherwise(0),
)
monthly_agg_df = monthly_agg_df.withColumn(
    "zero_sales_months",
    spark_sum("zero_sales").over(window_2),
)
# count() over the window counts non-null monthly_sales_qty rows only.
monthly_agg_df = monthly_agg_df.withColumn(
    "total_months",
    count("monthly_sales_qty").over(window_2),
)
monthly_agg_df = monthly_agg_df.withColumn(
    "zero_sales_percent",
    col("zero_sales_months") / col("total_months"),
)
# Average unit price for the month, guarded against division by zero:
# months with zero (or null) quantity fall through to a price of 0.
_qty = col("monthly_sales_qty")
monthly_agg_df = monthly_agg_df.withColumn(
    "monthly_avg_price",
    when(_qty != 0, col("monthly_sales_value") / _qty).otherwise(lit(0)),
)
# Month-over-month relative price change; the first month of each
# product/BU (no positive lag value) falls through to 0.
_prev_price = col("price_lag_1")
monthly_agg_df = monthly_agg_df.withColumn(
    "price_growth_rate",
    when(
        _prev_price > 0,
        (col("monthly_avg_price") - _prev_price) / _prev_price,
    ).otherwise(lit(0)),
)
# Whole-history price level and dispersion per product/BU.
monthly_agg_df = monthly_agg_df.withColumn(
    "avg_monthly_price",
    avg("monthly_avg_price").over(window_2),
)
monthly_agg_df = monthly_agg_df.withColumn(
    "std_monthly_price",
    stddev("monthly_avg_price").over(window_2),
)
# Replace nulls with 0 in the growth-rate columns (the first month per
# product/BU has no lag value, so its rate comes out null).
# The original duplicated the same withColumn block for each column;
# a loop removes the duplication with identical behavior (col_name is
# left bound to "avg_growth_rate" afterwards, as before).
for col_name in ("growth_rate", "avg_growth_rate"):
    monthly_agg_df = monthly_agg_df.withColumn(
        col_name,
        when(col(col_name).isNull(), lit(0)).otherwise(col(col_name)),
    )
# Per-product/BU feature table for clustering: sales level, spread,
# growth, intermittency (zero-sales ratio) and price statistics.
_feature_aggs = [
    avg("monthly_sales_qty").alias("avg_sales"),
    stddev("monthly_sales_qty").alias("std_sales"),
    avg("growth_rate").alias("avg_growth_rate"),
    avg("zero_sales_percent").alias("zero_sales_ratio"),
    max("monthly_sales_qty").alias("peak_sales"),
    min("monthly_sales_qty").alias("min_sales"),
    avg("monthly_avg_price").alias("avg_price"),
    stddev("monthly_avg_price").alias("std_price"),
    avg("price_growth_rate").alias("avg_price_growth"),
]
cluster_features_df = monthly_agg_df.groupBy("product_bu_id").agg(*_feature_aggs)
# Coefficients of variation (std relative to mean); nulls produced by
# single-row groups or zero means are normalised to 0 by the fill below.
cluster_features_df = cluster_features_df.withColumn(
    "cv_sales", col("std_sales") / col("avg_sales")
)
cluster_features_df = cluster_features_df.withColumn(
    "cv_price", col("std_price") / col("avg_price")
)
cluster_features_df = cluster_features_df.fillna({
    "avg_growth_rate": 0,
    "std_sales": 0,
    "cv_sales": 0,
    "zero_sales_ratio": 0,
    "avg_price": 0,
    "std_price": 0,
    "avg_price_growth": 0,
    "cv_price": 0,
})
# Feature columns fed to the clustering pipeline. Order matters: it
# determines the position of each entry in the assembled vector.
features = [
    "avg_sales",
    "std_sales",
    "avg_growth_rate",
    "zero_sales_ratio",
    "peak_sales",
    "min_sales",
    "cv_sales",
    "avg_price",
    "std_price",
    "avg_price_growth",
    "cv_price",
]
# NOTE(review): the assembler construction was truncated in the pasted
# source — only the tail of its inputCols list was visible, showing the
# per-feature "<name>_rank" columns in the same order as `features`.
# The input list is reconstructed from that pattern.
# TODO: confirm the rank column names match what the ranking step emits.
assembler = VectorAssembler(
    inputCols=[f"{name}_rank" for name in features],
    outputCol="features_unscaled",
)
assembled_data = assembler.transform(cluster_input_df)
# Apply the fitted clustering model to the full scaled_data frame (all
# columns retained, cluster assignment appended), not just a projection
# of selected columns.
clustered_data = model.transform(scaled_data)
# Silhouette evaluator over the scaled feature vectors, using squared
# Euclidean distance, for scoring cluster quality.
evaluator = ClusteringEvaluator(
    featuresCol="scaled_features",
    metricName="silhouette",
    distanceMeasure="squaredEuclidean",
)