Project Intern - Jupyter Notebook
Project Intern - Jupyter Notebook
Out[51]:
   Store        Date  Weekly_Sales  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment
0      1  05-02-2010    1643690.90             0        42.31       2.572  211.096358           8.…
1      1  12-02-2010    1641957.44             1        38.51       2.548  211.242170           8.…
2      1  19-02-2010    1611968.17             0        39.93       2.514  211.289143           8.…
In [52]: sales.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Store 6435 non-null int64
1 Date 6435 non-null object
2 Weekly_Sales 6435 non-null float64
3 Holiday_Flag 6435 non-null int64
4 Temperature 6435 non-null float64
5 Fuel_Price 6435 non-null float64
6 CPI 6435 non-null float64
7 Unemployment 6435 non-null float64
dtypes: float64(5), int64(2), object(1)
memory usage: 402.3+ KB
In [54]: sales.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Store 6435 non-null int64
1 Date 6435 non-null datetime64[ns]
2 Weekly_Sales 6435 non-null float64
3 Holiday_Flag 6435 non-null int64
4 Temperature 6435 non-null float64
5 Fuel_Price 6435 non-null float64
6 CPI 6435 non-null float64
7 Unemployment 6435 non-null float64
dtypes: datetime64[ns](1), float64(5), int64(2)
memory usage: 402.3 KB
In [56]: sales.columns
In [57]: sales[sales.duplicated()]
Out[57]:
store date weekly_sales holiday_flag temperature fuel_price cpi unemployment
if level == 'lower':
return df[df[col] < lower_bound]
elif level == 'upper':
return df[df[col] > upper_bound]
else:
return df[(df[col] > upper_bound) | (df[col] < lower_bound)]
return outliers_df
Out[60]:
outlier_counts outlier_percent
Out[63]:
      store        date  weekly_sales  holiday_flag  temperature  fuel_price         cpi  unemployment
 189      2  2010-12-24    3436007.68             0        49.97       2.886  211.064660            8…
 241      2  2011-12-23    3224369.80             0        46.66       3.112  218.999550            7…
 471      4  2010-11-26    2789469.45             1        48.08       2.752  126.669267            7…
 474      4  2010-12-17    2740057.14             0        46.57       2.884  126.879484            7…
 475      4  2010-12-24    3526713.39             0        43.21       2.887  126.983581            7…
 523      4  2011-11-25    3004702.33             1        47.96       3.225  129.836400            5…
 526      4  2011-12-16    2771397.17             0        36.44       3.149  129.898065            5…
 527      4  2011-12-23    3676388.98             0        35.92       3.103  129.984548            5…
 761      6  2010-12-24    2727575.18             0        55.07       2.886  212.916508            7…
1329     10  2010-11-26    2939946.38             1        55.33       3.162  126.669267            9…
1332     10  2010-12-17    2811646.85             0        59.15       3.125  126.879484            9…
1333     10  2010-12-24    3749057.69             0        57.06       3.236  126.983581            9…
1381     10  2011-11-25    2950198.64             1        60.68       3.760  129.836400            7…
1385     10  2011-12-23    3487986.89             0        48.36       3.541  129.984548            7…
1758     13  2010-11-26    2766400.05             1        28.22       2.830  126.669267            7…
1761     13  2010-12-17    2771646.81             0        35.21       2.842  126.879484            7…
1762     13  2010-12-24    3595903.20             0        34.90       2.846  126.983581            7…
1810     13  2011-11-25    2864170.61             1        38.89       3.445  129.836400            6…
1813     13  2011-12-16    2760346.71             0        27.85       3.282  129.898065            6…
1814     13  2011-12-23    3556766.03             0        24.76       3.186  129.984548            6…
1901     14  2010-11-26    2921709.71             1        46.15       3.039  182.783277            8…
1904     14  2010-12-17    2762861.41             0        30.51       3.140  182.517732            8…
1905     14  2010-12-24    3818686.45             0        30.59       3.141  182.544590            8…
1957     14  2011-12-23    3369068.99             0        42.27       3.389  188.929975            8…
2759     20  2010-11-26    2811634.04             1        46.66       3.039  204.962100            7…
2761     20  2010-10-12    2752122.08             0        24.27       3.109  204.687738            7…
2762     20  2010-12-17    2819193.17             0        24.07       3.140  204.632119            7…
2763     20  2010-12-24    3766687.43             0        25.17       3.141  204.637673            7…
2811     20  2011-11-25    2906233.25             1        46.38       3.492  211.412076            7…
2814     20  2011-12-16    2762816.65             0        37.16       3.413  212.068504            7…
2815     20  2011-12-23    3555371.03             0        40.19       3.389  212.236040            7…
3192     23  2010-12-24    2734277.10             0        22.96       3.150  132.747742            5…
3764     27  2010-12-24    3078162.08             0        31.34       3.309  136.597273            8…
3816     27  2011-12-23    2739019.75             0        41.59       3.587  140.528765            7…
In [64]: sales.describe()
Out[64]:
store weekly_sales holiday_flag temperature fuel_price cpi unemployment
In [65]: sales.hist(figsize=(30,20));
In [66]:
# Plot weekly sales (in millions of USD) against the `date` column.
# When several rows share the same date (presumably one row per store per
# week -- confirm against the data), seaborn's lineplot draws their mean
# with a confidence band rather than one line per row.
fig, ax = plt.subplots(figsize=(20, 5))
sns.lineplot(x=sales.date, y=sales.weekly_sales / 1e6, ax=ax)

# The x values are dates, so the axis is labelled accordingly.
ax.set_xlabel('Date')
ax.set_ylabel('Weekly Sales (in million USD)')

# NOTE(review): in the exported notebook this call was cut off right after
# the `pad` keyword, so the intended title padding is unknown; Matplotlib's
# default padding is used here -- restore the original value if it is known.
ax.set_title('Weekly Sales Trend', fontdict={'fontsize': 16, 'color': 'red'})

# No hover annotation is created: it was hidden immediately after creation
# and no event handler was attached in this cell, so it never showed.
plt.show()
Out[67]:
   store        date  weekly_sales  holiday_flag  temperature  fuel_price         cpi  unemployment
0      1  2010-05-02    1643690.90             0        42.31       2.572  211.096358         8.10…
1      1  2010-12-02    1641957.44             1        38.51       2.548  211.242170         8.10…
2      1  2010-02-19    1611968.17             0        39.93       2.514  211.289143         8.10…
Out[68]:
year 2010 2011 2012
month
df = df.groupby(col).mean().sort_values(by='weekly_sales', ascending=Fal
top_stores = df.head(5)
bottom_stores = df.tail(5)
sns.set_palette("bright")
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=bottom_stores.index, y=bottom_stores['weekly_sales']/1e6,
plt.title('Bottom 5 Stores by Average Sales')
plt.ylabel('Average weekly sales (millions USD)')
plt.show()
Out[76]:
store weekly_sales holiday_flag temperature fuel_price cpi employment year
model.fit(X_train, y_train)
# predict
y_pred = model.predict(X_test)
# calculate MSE
mse = mean_squared_error(y_test, y_pred)
# calculate RMSE
rmse = np.sqrt(mse)
return rmse
df = pd.DataFrame.from_dict(regressor_rmses, orient='index')
df = df.reset_index()
Out[84]:
regressor_name rmse