A Data Science Project 2
A Data Science Project 2
GLOBAL SUPER-STORE
AND
SALES DATA
TASK 2(1)
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from google.colab import files
# Task 2(1): load the uploaded CSV and inspect the columns of interest.
# files.upload() returns a mapping keyed by the uploaded file name(s);
# the first uploaded file is the one that gets read.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; cannot load the dataset.")
data = pd.read_csv(next(iter(uploaded)), encoding='ISO-8859-1')

# Show the frame itself (the original printed the literal string "data").
print(data)

# Columns are selected by label; a DataFrame has no .region()/.profit()
# methods. 'Sales' and 'Profit' match the labels used further down.
print(data['Region'])
# NOTE(review): assumes the product-category column is labelled 'Category'
# -- confirm against the CSV header.
print(data['Category'])
print(data['Profit'])
print(data['Sales'])

# Missing-value overview: per-column null counts, then the boolean mask.
print(data.isnull().sum())
print(data.notnull())

# Drop exact duplicate rows before any statistics are computed.
data = data.drop_duplicates()
# Inter-quartile range of every numeric column (basis for outlier detection).
print("Handling outliers")
# Bind the quartiles to names first: print(Q1=...) passes an unknown keyword
# to print() and never defines Q1/Q3. numeric_only=True skips text columns,
# for which a quantile cannot be computed.
Q1 = data.quantile(0.25, numeric_only=True)
print(Q1)
Q3 = data.quantile(0.75, numeric_only=True)
print(Q3)
IQR = Q3 - Q1
print(IQR)
# Summary statistics and correlations for the two key measures.
key_columns = ['Sales', 'Profit']

print("Descriptive Statistics")
summary = data[key_columns].describe()
print(summary)

print("correlation")
# Correlate every numeric column, then show only the Sales/Profit columns.
corr_matrix = data.corr(numeric_only=True)
print(corr_matrix[key_columns])
# Histogram of Sales with a kernel-density overlay (Python's boolean is True).
sns.histplot(data['Sales'], kde=True)
plt.title("Sales Distribution")
plt.show()

# Box plot of Profit; the series is passed via the x= keyword
# (the original `x-data[...]` was a subtraction on an undefined name).
sns.boxplot(x=data['Profit'])
plt.title("Profit Boxplot")
plt.show()
SALES DATA
TASK 2(2)
import pandas as pd
# Task 2(2): load the uploaded sales CSV, inspect it, and clean it.
# files.upload() returns a mapping keyed by the uploaded file name(s);
# the first uploaded file is the one that gets read.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; cannot load the dataset.")
df = pd.read_csv(next(iter(uploaded)), encoding='ISO-8859-1')

print("The data:")
display(df.head())
print("Dataset Information")
df.info()
print("Statistical Summary")
display(df.describe())

# Count fully duplicated rows before removing them.
duplicates = df.duplicated().sum()
print("Duplicate rows:", duplicates)
df = df.drop_duplicates()

# Numeric gaps are filled with the column mean; 'Region' and 'Date' gaps
# are filled with their most frequent value.
df = df.fillna(df.select_dtypes(include='number').mean())
df['Region'] = df['Region'].fillna(df['Region'].mode()[0])
df['Date'] = df['Date'].fillna(df['Date'].mode()[0])
display(df.head())
# Scatter plot of Profit against Discount.
plt.figure(figsize=(8,6))
# The original set up the figure and labels but never drew any data,
# so an empty chart was shown.
plt.scatter(df['Discount'], df['Profit'])
plt.title('Profit vs Discount')
plt.xlabel('Discount')
plt.ylabel('Profit')
plt.show()
# Bar chart of total Sales per Region.
plt.figure(figsize=(8, 6))
sales_per_region = df.groupby('Region')['Sales']
region_sales = sales_per_region.sum()
region_sales.plot.bar(color='green')
plt.title('Sales by region')
plt.ylabel('Total Sales')
plt.show()
# Annotated heatmap of the pairwise correlations between numeric columns.
plt.figure(figsize=(8, 6))
numeric_columns = df.select_dtypes(include='number')
sns.heatmap(
    numeric_columns.corr(),
    annot=True,
    cmap='coolwarm',
)
plt.title('Correlation Matrix')
plt.show()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Linear regression predicting Sales from Profit and Discount.
X = df[['Profit', 'Discount']]
Y = df['Sales']

# The original imported train_test_split but never called it, leaving
# X_train/Y_train/X_test undefined.
# NOTE(review): the 80/20 split and fixed seed are a reproducible default,
# not something the original specified -- adjust as required.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)