0% found this document useful (0 votes)
5 views9 pages

A Data Science Project 2

The document outlines a data science project involving a global super-store's sales data, focusing on data cleaning, analysis, and visualization using Python libraries such as Pandas, Matplotlib, and Seaborn. Key tasks include handling missing values, identifying duplicates, and performing statistical analysis, including correlation and regression modeling. The project culminates in a linear regression model to predict sales based on profit and discount variables.

Uploaded by

23stcs21
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
5 views9 pages

A Data Science Project 2

The document outlines a data science project involving a global super-store's sales data, focusing on data cleaning, analysis, and visualization using Python libraries such as Pandas, Matplotlib, and Seaborn. Key tasks include handling missing values, identifying duplicates, and performing statistical analysis, including correlation and regression modeling. The project culminates in a linear regression model to predict sales based on profit and discount variables.

Uploaded by

23stcs21
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 9

A DATA SCIENCE PROJECT

GLOBAL SUPER-STORE

AND

SALES DATA

TASK 2(1)

import pandas as pd

from google.colab import files

import matplotlib.pyplot as plt

import seaborn as sns

# Task 2(1): load the uploaded CSV, inspect it, report missing values,
# drop duplicate rows, and summarise/plot the Sales and Profit columns.

# files.upload() returns a mapping keyed by uploaded filename; the first
# uploaded file is read as the dataset.
uploaded = files.upload()
data = pd.read_csv(list(uploaded.keys())[0], encoding='ISO-8859-1')

print("data")
print(data)

# Columns are selected by label with [] — they are Series, not callables.
# NOTE(review): 'Region' and 'Category' are assumed to be the header names
# behind the original `region` / `product-category` references — confirm
# against the CSV header. Missing columns are reported instead of raising.
for column in ('Region', 'Category', 'Profit', 'Sales'):
    if column in data.columns:
        print(data[column])
    else:
        print(f"Column {column!r} not found; available: {list(data.columns)}")

print("Any missing values")
print(data.isnull().sum())
print(data.notnull())

data = data.drop_duplicates()

# The quartiles and IQR are only computed and printed here; no rows are
# removed. numeric_only=True restricts the quantiles to numeric columns so
# text columns do not raise.
print("Handling outliers")
Q1 = data.quantile(0.25, numeric_only=True)
print(Q1)
Q3 = data.quantile(0.75, numeric_only=True)
print(Q3)
IQR = Q3 - Q1
print(IQR)

print("Descriptive Statistics")
print(data[['Sales', 'Profit']].describe())
print("Sales Variance:", data['Sales'].var())
print("Sales Standard Deviation:", data['Sales'].std())
print("Profit Variance:", data['Profit'].var())
print("Profit Standard Deviation:", data['Profit'].std())

print("correlation")
corr_matrix = data.corr(numeric_only=True)
print(corr_matrix[['Sales', 'Profit']])

sns.histplot(data['Sales'], kde=True)
plt.title("Sales Distribution")
plt.show()

sns.boxplot(x=data['Profit'])
plt.title("Profit Boxplot")
plt.show()

# Reuse the correlation matrix computed above rather than recomputing it.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()
SALES DATA

TASK 2(2)

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

# Task 2(2): load the uploaded sales CSV, clean it (duplicates, missing
# values, date parsing), and plot profit/discount, sales by region, and the
# numeric correlation matrix.

# `files` is the google.colab helper imported at the top of this file.
# NOTE(review): `display` is not imported anywhere in this file — presumably
# supplied by the notebook runtime; confirm before running as a plain script.
uploaded = files.upload()
df = pd.read_csv(list(uploaded.keys())[0], encoding='ISO-8859-1')

print("The data:")
display(df.head())

print("Dataset Information")
df.info()

print("Statistical Summary")
display(df.describe())

# duplicated() flags repeated rows; summing the boolean mask counts them.
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows:{duplicates}")
df = df.drop_duplicates()

print(f"Missing values before cleaning:{df.isnull().sum()}")

# Numeric gaps are filled with each column's mean. Reassigning (rather than
# inplace=True) avoids relying on in-place mutation of the de-duplicated frame.
numeric_means = df.select_dtypes(include='number').mean()
df = df.fillna(numeric_means)

# Non-numeric gaps are filled with the most frequent value of the column.
df['Region'] = df['Region'].fillna(df['Region'].mode()[0])
df['Date'] = df['Date'].fillna(df['Date'].mode()[0])

print(f"Missing values after cleaning:{df.isnull().sum()}")

# Dates are parsed with an explicit day-month-year format.
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

print("Data after cleaning:")
display(df.head())

plt.figure(figsize=(8, 6))
sns.scatterplot(x='Discount', y='Profit', data=df, color='orange')
plt.title('Profit vs Discount')
plt.xlabel('Discount')
plt.ylabel('Profit')
plt.show()

plt.figure(figsize=(8, 6))
region_sales = df.groupby('Region')['Sales'].sum()
region_sales.plot(kind='bar', color='green')
plt.title('Sales by region')
plt.ylabel('Total Sales')
plt.show()

plt.figure(figsize=(8, 6))
sns.heatmap(
    df.select_dtypes(include='number').corr(),
    annot=True,
    cmap='coolwarm',
)
plt.title('Correlation Matrix')
plt.show()
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear regression that predicts Sales from Profit and Discount,
# then report its error and R-squared on a held-out 20% split.
X = df[['Profit', 'Discount']]
Y = df['Sales']

# Fixed random_state keeps the train/test partition reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Compute the metrics once, then format them for output.
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)
print(f"Mean Squared Error:{mse:.3f}")
print(f"R-squared Score:{r2:.2f}")

You might also like