0% found this document useful (0 votes)
14 views3 pages

Superstores Dataset Documentation

This document analyzes an e-commerce dataset using Python libraries like Pandas, NumPy, and Matplotlib. It explores relationships between variables like sales, categories, regions, and delivery times through visualizations like histograms, box plots, heatmaps and bar charts.

Uploaded by

ahmed salem
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
14 views3 pages

Superstores Dataset Documentation

This document analyzes an e-commerce dataset using Python libraries like Pandas, NumPy, and Matplotlib. It explores relationships between variables like sales, categories, regions, and delivery times through visualizations like histograms, box plots, heatmaps and bar charts.

Uploaded by

ahmed salem
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 3

#importing the important libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Load the raw Superstore training data.
df = pd.read_csv("/content/train.csv")

# --- First look at the data -------------------------------------------------
# The bare expressions below are notebook-style inspections: they only render
# output when they are the last statement of a notebook cell.
df

df.describe()

df.describe(include='O')

df.columns

df.info()

df.isnull().sum()

# Drop the rows that have no 'Postal Code'; the original author notes there
# are only 11 of them, so discarding them loses little data.
df = df.dropna(subset=['Postal Code'])

df.isnull().sum()

# drop_duplicates() returns a new frame rather than modifying df, so the
# result has to be assigned back for the de-duplication to take effect.
# NOTE(review): 'Row ID' is still present here; if it is unique per row, no
# duplicates can be detected at this point -- confirm whether de-duplication
# should instead run after 'Row ID' is dropped below.
df.duplicated()
df = df.drop_duplicates()

# Parse the day/month/year date strings into real datetimes so that date
# arithmetic and the .dt accessor work later in the script.
df['Order Date'] = pd.to_datetime(df['Order Date'], format='%d/%m/%Y')
df['Ship Date'] = pd.to_datetime(df['Ship Date'], format='%d/%m/%Y')

# These two columns are not used by any later step of the analysis.
df = df.drop(columns=["Row ID", "Postal Code"])

df

df.describe()

# Box plot of the 'Sales' column to expose its spread and outliers.
plt.figure(figsize=(8, 6))
sns.boxplot(df['Sales'])
plt.title('Box Plot of Sales')
plt.show()

df

# Filled density contour of 'Sales' built with Plotly Express.
fig = px.density_contour(df, x='Sales', title='Distribution of Sales')
fig.update_traces(contours_coloring="fill", colorscale="balance")
# Show the figure explicitly, as every other Plotly figure in this script
# does; otherwise it is only rendered when it happens to be the last
# expression of a notebook cell.
fig.show()

# Relation between sales and product category (seaborn's barplot aggregates
# 'Sales' per category with its default estimator).
category_ax = sns.barplot(data=df, x='Category', y='Sales', palette="cubehelix")
category_ax.set_title('Sales by Category')
plt.xticks(rotation=45)
plt.show()

# Number of rows per ship mode.
ship_mode_fig, ship_mode_ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df, x='Ship Mode', palette="Paired", ax=ship_mode_ax)
ship_mode_ax.set_title('Count of Ship Modes')
plt.xticks(rotation=45)
plt.show()
# Sales of every row plotted against its sub-category, coloured by the parent
# category. The title literal is kept on a single line: a quoted string may
# not span a line break.
scatter_plot = px.scatter(
    df,
    x='Sub-Category',
    y='Sales',
    color='Category',
    title='Sales vs. sub-category',
)
scatter_plot.show()

# Count of distinct product names in the data.
product_names = df['Product Name']
products = product_names.nunique()
products

# Total sales per region, as a flat frame with 'Region' and 'Sales' columns.
store_sales = df.groupby('Region', as_index=False)['Sales'].sum()
print(store_sales)

# Donut chart of each region's share of total sales: a pie chart with a white
# disc drawn over its centre.
region_colors = ['#008a72', '#dc6a5d', '#9474b4', '#006cb2', '#4eace7']
plt.pie(
    store_sales['Sales'],
    labels=store_sales['Region'],
    autopct='%1.1f%%',
    startangle=160,
    colors=region_colors,
)

donut_axes = plt.gca()
donut_hole = plt.Circle((0, 0), 0.7, color='white')
donut_axes.add_artist(donut_hole)

plt.title('Sales Distribution by Store Location ')
# Equal axis scaling keeps the pie circular; ticks carry no meaning here.
donut_axes.axis('equal')
donut_axes.set_xticks([])
donut_axes.set_yticks([])
plt.show()

# Row counts per region, split into one bar per ship mode.
# (Alternative palettes noted by the author: Paired, muted.)
sns.countplot(data=df, x='Region', hue='Ship Mode', palette='coolwarm')
plt.show()

# Total sales for every (segment, category) pair, in long form.
segment_category_totals = df.groupby(['Segment', 'Category'])['Sales'].sum()
grouped_data = segment_category_totals.reset_index()
grouped_data

# Reshape to a segment x category grid so it can be drawn as a heatmap.
pivot_df = grouped_data.pivot(
    index='Segment',
    columns='Category',
    values='Sales',
)
pivot_df

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(pivot_df, annot=True, fmt=".2f", cmap="YlGnBu", ax=heatmap_ax)
heatmap_ax.set_title('Relationship between Customer Segment, Product Category, and Sales')
heatmap_ax.set_xlabel('Product Category')
heatmap_ax.set_ylabel('Customer Segment')
plt.show()

# Time between ordering and shipping. Assumes 'Ship Date' and 'Order Date'
# have already been converted to datetimes, so the difference is a timedelta.
df["Delivery Days"] = df['Ship Date'] - df['Order Date']

df

df['Delivery Days'].describe()

# Keep only the whole-day component so the column is plain integers.
df['Delivery Days'] = df['Delivery Days'].dt.days

df['Delivery Days'].describe()

# Smoothed distribution of delivery days. `fill=True` replaces the deprecated
# `shade=True` keyword and draws the same filled area under the curve.
plt.figure(figsize=(10, 6))
sns.kdeplot(df['Delivery Days'], fill=True)
plt.title('Distribution of Delivery days')
plt.xlabel('delivery days')
plt.ylabel('Density')
plt.show()

# Delivery-day distribution for each ship mode. The title literal is kept on
# a single line: a quoted string may not span a line break.
box_fig = px.box(
    df,
    x='Ship Mode',
    y='Delivery Days',
    color='Ship Mode',
    title='Impact of Ship Mode on Delivery Days',
)
box_fig.show()

# Sales plotted against delivery days, coloured by ship mode.
scatter_fig = px.scatter(
    df,
    x='Delivery Days',
    y='Sales',
    color='Ship Mode',
    title='Impact of Delivery Days Variance on Sales',
    # `labels` is keyed by column name. The previous key
    # 'Delivery Days Variance' matched no column, so the axis was never
    # relabelled; it is keyed on the real 'Delivery Days' column here.
    # NOTE(review): the column holds day counts, not a variance -- confirm
    # the intended axis wording.
    labels={'Delivery Days': 'Delivery Days Variance', 'Sales': 'Sales'},
)

scatter_fig.show()

df.sample(5)

# Calendar year of each order, taken from the parsed 'Order Date'.
df['Year'] = df['Order Date'].dt.year

# Total sales per order year, as a flat frame with 'Year' and 'Sales' columns.
yearly_sales = df.groupby('Year')['Sales'].sum().reset_index()
yearly_sales

# Line chart of yearly totals. The title literal is kept on a single line:
# a quoted string may not span a line break.
fig = px.line(
    yearly_sales,
    x='Year',
    y='Sales',
    title='Distribution of Sales Over the Years',
)
fig.show()

# Number of distinct order IDs placed under each customer name.
customer_order_counts = df.groupby('Customer Name')['Order ID'].nunique()

# "Frequent" customers are those with more than 10 distinct orders.
is_frequent = customer_order_counts.gt(10)
frequents = customer_order_counts[is_frequent]
num_frequents = frequents.shape[0]
print("Number of frequent customers (with more than 10 orders):", num_frequents)

# One bar per frequent customer, bar height = number of orders.
plt.figure(figsize=(14, 6))
frequents.plot.bar()
plt.title('Number of Frequent Customers with More Than 10 Orders')
plt.xlabel('Customer Name')
plt.ylabel('Number of Orders')
plt.xticks(rotation=90)
plt.show()

# Customers with 10 or fewer distinct orders.
nonfrequent = customer_order_counts.loc[customer_order_counts.le(10)]
num_nonfrequents = nonfrequent.shape[0]

# Share of frequent versus non-frequent customers.
group_sizes = [num_frequents, num_nonfrequents]
group_names = ['Frequent', 'Not Frequent']
plt.figure(figsize=(6, 6))
plt.pie(
    group_sizes,
    labels=group_names,
    autopct='%1.1f%%',
    colors=['#8cb5db', '#8c8c8c'],
)
plt.title('Relationship Between Frequent and Not Frequent Clients')
plt.show()

# Tag every row with 'Offer' when its customer is one of the frequent
# customers, otherwise 'No Offer'. `frequents` is indexed by customer name,
# so membership is tested against its index explicitly (`name in series`
# also checks the index, but that is easy to misread as a value test), and
# the test is vectorised instead of running a Python lambda per row.
is_frequent_customer = df['Customer Name'].isin(frequents.index)
df['Offer'] = is_frequent_customer.map({True: 'Offer', False: 'No Offer'})
df

You might also like