Sales Data Clustering
Sales Data Clustering
[46]: # This Python 3 environment comes with many helpful analytics libraries installed
import os
# Print the full path of every file found under /kaggle/input.
# The inner loop and the print must be nested; without the indentation the
# cell does not parse.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can also write temporary files to /kaggle/temp/, but they won't be saved
# outside of the current session
/kaggle/input/sample-sales-data/sales_data_sample.csv
1
# Read the sample sales CSV into a DataFrame, decoding the file as latin1.
df = pd.read_csv(
    "/kaggle/input/sample-sales-data/sales_data_sample.csv",
    encoding='latin1',
)
[48]: df.head()
[5 rows x 25 columns]
[49]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDERNUMBER 2823 non-null int64
1 QUANTITYORDERED 2823 non-null int64
2 PRICEEACH 2823 non-null float64
3 ORDERLINENUMBER 2823 non-null int64
4 SALES 2823 non-null float64
2
5 ORDERDATE 2823 non-null object
6 STATUS 2823 non-null object
7 QTR_ID 2823 non-null int64
8 MONTH_ID 2823 non-null int64
9 YEAR_ID 2823 non-null int64
10 PRODUCTLINE 2823 non-null object
11 MSRP 2823 non-null int64
12 PRODUCTCODE 2823 non-null object
13 CUSTOMERNAME 2823 non-null object
14 PHONE 2823 non-null object
15 ADDRESSLINE1 2823 non-null object
16 ADDRESSLINE2 302 non-null object
17 CITY 2823 non-null object
18 STATE 1337 non-null object
19 POSTALCODE 2747 non-null object
20 COUNTRY 2823 non-null object
21 TERRITORY 1749 non-null object
22 CONTACTLASTNAME 2823 non-null object
23 CONTACTFIRSTNAME 2823 non-null object
24 DEALSIZE 2823 non-null object
dtypes: float64(2), int64(7), object(16)
memory usage: 551.5+ KB
[50]: df.isnull().sum()
[50]: ORDERNUMBER 0
QUANTITYORDERED 0
PRICEEACH 0
ORDERLINENUMBER 0
SALES 0
ORDERDATE 0
STATUS 0
QTR_ID 0
MONTH_ID 0
YEAR_ID 0
PRODUCTLINE 0
MSRP 0
PRODUCTCODE 0
CUSTOMERNAME 0
PHONE 0
ADDRESSLINE1 0
ADDRESSLINE2 2521
CITY 0
STATE 1486
POSTALCODE 76
COUNTRY 0
TERRITORY 1074
3
CONTACTLASTNAME 0
CONTACTFIRSTNAME 0
DEALSIZE 0
dtype: int64
[51]: plt.figure(figsize=(8,6))
# Histogram of the SALES column (30 bins) with a KDE overlay, drawn on the
# current figure; labels are applied through the Axes seaborn returns.
ax = sns.histplot(data=df, x='SALES', bins=30, kde=True, color='blue')
ax.set(title='Sales Distribution', xlabel='Sales Amount', ylabel='Frequency')
plt.show()
[52]: plt.figure(figsize=(8,6))
# SALES against QUANTITYORDERED, with points coloured by DEALSIZE.
sns.scatterplot(
    data=df,
    x='QUANTITYORDERED',
    y='SALES',
    hue='DEALSIZE',
    palette='coolwarm',
)
4
[53]: plt.figure(figsize=(8,6))
# Box plot of the PRICEEACH column; title and axis label are set on the
# Axes that seaborn returns.
ax = sns.boxplot(data=df, x='PRICEEACH', color='lightcoral')
ax.set(title='Price Each Distribution', xlabel='Price Each')
plt.show()
5
[54]: plt.figure(figsize=(10,6))
# One bar per PRODUCTLINE whose height is the sum of SALES, without error bars.
# `errorbar=None` is the current spelling of the former `ci=None`, which
# seaborn deprecated in 0.12 and now warns about.
sns.barplot(
    data=df,
    x='PRODUCTLINE',
    y='SALES',
    estimator=sum,
    errorbar=None,
    palette='viridis',
)
6
[55]: plt.figure(figsize=(8,6))
# Sum SALES per YEAR_ID (keeping YEAR_ID as a regular column), then draw the
# yearly totals as a line with a marker at each year.
sales_per_year = df.groupby('YEAR_ID', as_index=False)['SALES'].sum()
sns.lineplot(
    data=sales_per_year,
    x='YEAR_ID',
    y='SALES',
    marker='o',
    color='dodgerblue',
)
7
[56]: plt.figure(figsize=(8,6))
# Distribution of SALES within each DEALSIZE category, one box per category.
ax = sns.boxplot(data=df, x='DEALSIZE', y='SALES', palette='Set2')
ax.set(title='Deal Size Impact on Sales', xlabel='Deal Size', ylabel='Sales')
plt.show()
8
[57]: plt.figure(figsize=(12,6))
# Total SALES per COUNTRY, reduced to the ten largest totals and turned back
# into a two-column frame for plotting.
country_totals = df.groupby('COUNTRY')['SALES'].sum()
top_countries = country_totals.nlargest(10).reset_index()

ax = sns.barplot(data=top_countries, x='COUNTRY', y='SALES', palette='magma')
ax.set(title='Top 10 Countries by Total Sales', ylabel='Total Sales')
plt.xticks(rotation=45)  # tilt the country names on the x axis
plt.show()
9
[58]: # Select numerical features for clustering
# Columns fed to the clustering steps; X is the matching slice of df.
features = [
    'QUANTITYORDERED',
    'PRICEEACH',
    'SALES',
    'MSRP',
]
X = df.loc[:, features]
10
[61]: # Choose k based on the elbow method, e.g., k=3
# Fit a 3-cluster K-Means model on X_scaled (defined in a cell not shown here)
# and keep the cluster label assigned to each row.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 and warns about the change in 1.2-1.3, so relying on the
# default makes results depend on the installed version.
# random_state fixes the seed used for centroid initialisation.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
11
[62]: # Generate linkage matrix using the 'ward' method
# Ward linkage over X_scaled; the metric is spelled out but is SciPy's default.
linked = linkage(X_scaled, method='ward', metric='euclidean')
12
[63]: # Choose the number of clusters, e.g., 3
# Agglomerative clustering into three groups using Ward linkage.
# The `affinity` argument is intentionally omitted: scikit-learn deprecated it
# in 1.2 and removed it in 1.4 (it was renamed to `metric`), so passing it
# raises TypeError on current releases. Ward linkage only supports Euclidean
# distance, which is also the default, so the clustering is unchanged.
hierarchical_clustering = AgglomerativeClustering(n_clusters=3, linkage='ward')
hc_clusters = hierarchical_clustering.fit_predict(X_scaled)
13
[64]: # Inertia for K-Means
# Read `inertia_` from the `kmeans` estimator fitted earlier and echo it.
inertia = kmeans.inertia_
print(f'Inertia: {inertia}')
14
from sklearn.metrics import calinski_harabasz_score
# Calinski-Harabasz Index for both label assignments over the same X_scaled:
# K-Means labels first, hierarchical labels second.
kmeans_ch, hc_ch = (
    calinski_harabasz_score(X_scaled, labels)
    for labels in (clusters, hc_clusters)
)

print(f'Calinski-Harabasz Index (K-Means): {kmeans_ch}')
print(f'Calinski-Harabasz Index (Hierarchical): {hc_ch}')
Inertia: 4766.017133863959
Silhouette Score (K-Means): 0.3504523735351092
Silhouette Score (Hierarchical): 0.31527494740967016
Davies-Bouldin Index (K-Means): 1.0091895570978797
Davies-Bouldin Index (Hierarchical): 0.9888199869306714
Calinski-Harabasz Index (K-Means): 1930.6761983442068
Calinski-Harabasz Index (Hierarchical): 1672.3553032169289
15