Bigmartsalesprediction
Bigmartsalesprediction
In [5]: !pip install pandas numpy seaborn matplotlib klib dtale scikit-learn joblib pandas-
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 1/27
10/7/22, 8:59 PM bigmartsalesprediction
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 2/27
10/7/22, 8:59 PM bigmartsalesprediction
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 3/27
10/7/22, 8:59 PM bigmartsalesprediction
In [12]: df_train.head() # displays the first five rows of the dataframe by default
Fruits and
3 FDX07 19.20 Regular 0.000000 182.0950
Vegetables
In [13]: #df_test.head()
In [14]: df_train.shape # a tuple of array dimensions that tells the number of rows and col
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 4/27
10/7/22, 8:59 PM bigmartsalesprediction
Out[15]: Item_Identifier 0
Item_Weight 1463
Item_Fat_Content 0
Item_Visibility 0
Item_Type 0
Item_MRP 0
Outlet_Identifier 0
Outlet_Establishment_Year 0
Outlet_Size 2410
Outlet_Location_Type 0
Outlet_Type 0
Item_Outlet_Sales 0
dtype: int64
In [16]: df_test.isnull().sum()
Out[16]: Item_Identifier 0
Item_Weight 976
Item_Fat_Content 0
Item_Visibility 0
Item_Type 0
Item_MRP 0
Outlet_Identifier 0
Outlet_Establishment_Year 0
Outlet_Size 1606
Outlet_Location_Type 0
Outlet_Type 0
dtype: int64
In [17]: df_train.info() # per-column non-null counts and dtypes, plus overall memory usage of the training frame
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Item_Identifier 8523 non-null object
1 Item_Weight 7060 non-null float64
2 Item_Fat_Content 8523 non-null object
3 Item_Visibility 8523 non-null float64
4 Item_Type 8523 non-null object
5 Item_MRP 8523 non-null float64
6 Outlet_Identifier 8523 non-null object
7 Outlet_Establishment_Year 8523 non-null int64
8 Outlet_Size 6113 non-null object
9 Outlet_Location_Type 8523 non-null object
10 Outlet_Type 8523 non-null object
11 Item_Outlet_Sales 8523 non-null float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 5/27
10/7/22, 8:59 PM bigmartsalesprediction
Out[21]: Item_Identifier 0
Item_Weight 0
Item_Fat_Content 0
Item_Visibility 0
Item_Type 0
Item_MRP 0
Outlet_Identifier 0
Outlet_Establishment_Year 0
Outlet_Size 2410
Outlet_Location_Type 0
Outlet_Type 0
Item_Outlet_Sales 0
dtype: int64
In [22]: df_train['Item_Weight'].describe()
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 6/27
10/7/22, 8:59 PM bigmartsalesprediction
Out[23]: 0 Medium
1 Medium
2 Medium
3 NaN
4 High
...
8518 High
8519 NaN
8520 Small
8521 Medium
8522 Small
Name: Outlet_Size, Length: 8523, dtype: object
In [24]: df_train['Outlet_Size'].value_counts()
In [25]: df_train['Outlet_Size'].mode()
Out[25]: 0 Medium
Name: Outlet_Size, dtype: object
In [26]: outlet_size_mode = df_train['Outlet_Size'].mode()[0] # most frequent Outlet_Size in the training data
# Fill both frames with the statistic learned from the training data so the test
# set is imputed exactly the way the model's training data was.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is deprecated and does not update the parent frame under
# pandas Copy-on-Write.
df_train['Outlet_Size'] = df_train['Outlet_Size'].fillna(outlet_size_mode)
df_test['Outlet_Size'] = df_test['Outlet_Size'].fillna(outlet_size_mode)
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 7/27
10/7/22, 8:59 PM bigmartsalesprediction
Out[27]: Item_Identifier 0
Item_Weight 0
Item_Fat_Content 0
Item_Visibility 0
Item_Type 0
Item_MRP 0
Outlet_Identifier 0
Outlet_Establishment_Year 0
Outlet_Size 0
Outlet_Location_Type 0
Outlet_Type 0
Item_Outlet_Sales 0
dtype: int64
In [28]: df_test.isnull().sum()
Out[28]: Item_Identifier 0
Item_Weight 0
Item_Fat_Content 0
Item_Visibility 0
Item_Type 0
Item_MRP 0
Outlet_Identifier 0
Outlet_Establishment_Year 0
Outlet_Size 0
Outlet_Location_Type 0
Outlet_Type 0
dtype: int64
In [30]: df_train
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 8/27
10/7/22, 8:59 PM bigmartsalesprediction
Fruits and
3 19.200 Regular 0.000000 182.0950
Vegetables
Snack
8518 6.865 Low Fat 0.056783 214.5218
Foods
Baking
8519 8.380 Regular 0.046982 108.1570
Goods
Health and
8520 10.600 Low Fat 0.035186 85.1224
Hygiene
Snack
8521 7.210 Regular 0.145221 103.1332
Foods
In [31]: df_test
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 9/27
10/7/22, 8:59 PM bigmartsalesprediction
Snack
0 20.750000 Low Fat 0.007565 107.8622
Foods
Snack
3 7.315000 Low Fat 0.015388 155.0340
Foods
Snack
5676 10.500000 Regular 0.013496 141.3154
Foods
Starchy
5677 7.600000 Regular 0.142991 169.1448
Foods
Health and
5678 10.000000 Low Fat 0.073529 118.7440
Hygiene
In [33]: dtale.show(df_train)
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 10/27
10/7/22, 8:59 PM bigmartsalesprediction
‣ 0
0
Out[33]:
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 11/27
10/7/22, 8:59 PM bigmartsalesprediction
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 12/27
10/7/22, 8:59 PM bigmartsalesprediction
stack-data->ipython>=6.1.0->ipywidgets) (0.2.2)
Requirement already satisfied: pywin32>=1.0 in c:\python3107\lib\site-packages (fr
om jupyter-core>=4.9.2->jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (30
4)
Requirement already satisfied: six>=1.5 in c:\python3107\lib\site-packages (from p
ython-dateutil>=2.8.2->jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (1.1
6.0)
Note: you may need to restart the kernel to use updated packages.
In [39]: profile
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 13/27
10/7/22, 8:59 PM bigmartsalesprediction
Variables
Item_Weight
Real number (ℝ≥0)
HIGH
CORRELATION (This variable has a high correlation with 1 fields:
Outlet_Type)
Distinct 416
Missing 0
Infinite 0
Mean 12.85764518
Minimum 4.555
Maximum 21.35
Zeros 0
Negative 0
Out[39]:
In [40]: plt.figure(figsize=(10,5))
# Correlation is only defined for numeric columns. df_train still holds
# object (string) columns at this point, and DataFrame.corr() raises on them in
# pandas >= 2.0, so select the numeric columns explicitly before correlating.
sns.heatmap(df_train.select_dtypes(include='number').corr(),annot=True)
plt.show()
2022-10-07 20:54:08,791 - WARNING - findfont: Font family ['Heiti TC'] not found.
Falling back to DejaVu Sans.
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 14/27
10/7/22, 8:59 PM bigmartsalesprediction
Out[45]: GridSpec(6, 5)
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 15/27
10/7/22, 8:59 PM bigmartsalesprediction
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 16/27
10/7/22, 8:59 PM bigmartsalesprediction
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 17/27
10/7/22, 8:59 PM bigmartsalesprediction
Dropped rows: 0
of which 0 duplicates. (Rows (first 150 shown): [])
Dropped columns: 0
of which 0 single valued. Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.46 MB (-70.77%)
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 18/27
10/7/22, 8:59 PM bigmartsalesprediction
Fruits and
3 19.200001 Regular 0.000000 182.095001 1
Vegetables
Snack
8518 6.865000 Low Fat 0.056783 214.521805 1
Foods
Baking
8519 8.380000 Regular 0.046982 108.156998 2
Goods
Health and
8520 10.600000 Low Fat 0.035186 85.122398 2
Hygiene
Snack
8521 7.210000 Regular 0.145221 103.133202 2
Foods
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 19/27
10/7/22, 8:59 PM bigmartsalesprediction
Fruits and
3 19.200 Regular 0.000000 182.0950 19
Vegetables
Snack
8518 6.865 Low Fat 0.056783 214.5218 19
Foods
Baking
8519 8.380 Regular 0.046982 108.1570 20
Goods
Health and
8520 10.600 Low Fat 0.035186 85.1224 20
Hygiene
Snack
8521 7.210 Regular 0.145221 103.1332 20
Foods
In [56]: df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 item_weight 8523 non-null float64
1 item_fat_content 8523 non-null object
2 item_visibility 8523 non-null float64
3 item_type 8523 non-null object
4 item_mrp 8523 non-null float64
5 outlet_establishment_year 8523 non-null int64
6 outlet_size 8523 non-null object
7 outlet_location_type 8523 non-null object
8 outlet_type 8523 non-null object
9 item_outlet_sales 8523 non-null float64
dtypes: float64(4), int64(1), object(5)
memory usage: 666.0+ KB
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 20/27
10/7/22, 8:59 PM bigmartsalesprediction
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 item_weight 8523 non-null float32
1 item_fat_content 8523 non-null category
2 item_visibility 8523 non-null float32
3 item_type 8523 non-null category
4 item_mrp 8523 non-null float32
5 outlet_establishment_year 8523 non-null int16
6 outlet_size 8523 non-null category
7 outlet_location_type 8523 non-null category
8 outlet_type 8523 non-null category
9 item_outlet_sales 8523 non-null float32
dtypes: category(5), float32(4), int16(1)
memory usage: 192.9 KB
In [58]: klib.mv_col_handling(df_train)
Fruits and
3 19.200001 Regular 0.000000 182.095001 1
Vegetables
Snack
8518 6.865000 Low Fat 0.056783 214.521805 1
Foods
Baking
8519 8.380000 Regular 0.046982 108.156998 2
Goods
Health and
8520 10.600000 Low Fat 0.035186 85.122398 2
Hygiene
Snack
8521 7.210000 Regular 0.145221 103.133202 2
Foods
1) Label encoding
In [59]: from sklearn.preprocessing import LabelEncoder
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 21/27
10/7/22, 8:59 PM bigmartsalesprediction
le=LabelEncoder()
In [61]: df_train.head(5)
In [63]: Y=df_train['item_outlet_sales']
3) Standardization
In [65]: X.describe()
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 22/27
10/7/22, 8:59 PM bigmartsalesprediction
In [69]: X_train_std
In [70]: X_test_std
In [71]: Y_train
In [72]: Y_test
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 23/27
10/7/22, 8:59 PM bigmartsalesprediction
In [74]: joblib.dump(sc,r'D:\5th_semester\MiniProject2A\Projectworking\models\sc.sav') # persist `sc` to disk (presumably the fitted scaler; its definition is not visible here). NOTE(review): absolute, machine-specific path; consider a configurable models directory
Out[74]: ['D:\\5th_semester\\MiniProject2A\\Projectworking\\models\\sc.sav']
Model building
In [75]: X_test.head()
Linear Regression
In [77]: from sklearn.linear_model import LinearRegression
lr= LinearRegression()
In [78]: lr.fit(X_train_std,Y_train)
Out[78]: ▾ LinearRegression
LinearRegression()
In [79]: Y_pred_lr=lr.predict(X_test_std)
In [80]: print(r2_score(Y_test,Y_pred_lr)) # R^2 of the linear-regression predictions against Y_test
print(mean_absolute_error(Y_test,Y_pred_lr)) # mean absolute error (MAE)
print(np.sqrt(mean_squared_error(Y_test,Y_pred_lr))) # root mean squared error (RMSE): square root of the MSE
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 24/27
10/7/22, 8:59 PM bigmartsalesprediction
0.5041875773270634
880.99990440845
1162.4412631603452
In [81]: joblib.dump(lr,r'D:\5th_semester\MiniProject2A\Projectworking\models\lr.sav') # persist the fitted LinearRegression model. NOTE(review): absolute, machine-specific path; consider a configurable models directory
Out[81]: ['D:\\5th_semester\\MiniProject2A\\Projectworking\\models\\lr.sav']
In [83]: rf.fit(X_train_std,Y_train)
Out[83]: ▾ RandomForestRegressor
RandomForestRegressor(n_estimators=1000)
In [85]: print(r2_score(Y_test,Y_pred_rf)) # R^2 of Y_pred_rf against Y_test (presumably the random-forest predictions; the cell creating Y_pred_rf is not shown here)
print(mean_absolute_error(Y_test,Y_pred_rf)) # mean absolute error (MAE)
print(np.sqrt(mean_squared_error(Y_test,Y_pred_rf))) # root mean squared error (RMSE): square root of the MSE
0.5486175811867917
782.141215387397
1109.1355754589515
In [86]: joblib.dump(rf,r'D:\5th_semester\MiniProject2A\Projectworking\models\rf.sav') # persist the fitted RandomForestRegressor. NOTE(review): absolute, machine-specific path; consider a configurable models directory
Out[86]: ['D:\\5th_semester\\MiniProject2A\\Projectworking\\models\\rf.sav']
XGBoost Regressor
In [87]: from xgboost import XGBRegressor
xg= XGBRegressor()
Out[88]: ▾ XGBRegressor
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 25/27
10/7/22, 8:59 PM bigmartsalesprediction
In [90]: print(r2_score(Y_test,Y_pred_xg)) # R^2 of Y_pred_xg against Y_test (presumably the XGBoost predictions; the cell creating Y_pred_xg is not shown here)
print(mean_absolute_error(Y_test,Y_pred_xg)) # mean absolute error (MAE)
print(np.sqrt(mean_squared_error(Y_test,Y_pred_xg))) # root mean squared error (RMSE): square root of the MSE
0.5313160637898305
800.45557
1130.1923
In [91]: joblib.dump(xg,r'D:\5th_semester\MiniProject2A\Projectworking\models\xg.sav') # persist the XGBoost model `xg`; the original passed `rf`, which wrote the random forest into xg.sav
Out[91]: ['D:\\5th_semester\\MiniProject2A\\Projectworking\\models\\xg.sav']
grid_search_forest.fit(X_train_std, Y_train)
# summarize results
print(f"Best: {grid_search_forest.best_score_:.3f} using {grid_search_forest.best_p
means = grid_search_forest.cv_results_['mean_test_score']
stds = grid_search_forest.cv_results_['std_test_score']
params = grid_search_forest.cv_results_['params']
In [93]: grid_search_forest.best_params_
In [94]: grid_search_forest.best_score_
Out[94]: 0.5493344344113504
In [95]: Y_pred_rf_grid=grid_search_forest.predict(X_test_std)
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 26/27
10/7/22, 8:59 PM bigmartsalesprediction
In [96]: r2_score(Y_test,Y_pred_rf_grid)
Out[96]: 0.5489701766293793
In [98]: joblib.dump(grid_search_forest,r'D:\5th_semester\MiniProject2A\Projectworking\rando
Out[98]: ['D:\\5th_semester\\MiniProject2A\\Projectworking\\random_forest_grid.sav']
In [99]: model=joblib.load(r'D:\5th_semester\MiniProject2A\Projectworking\random_forest_grid
In [100… model.predict(X_test_std)
In [ ]:
localhost:8888/nbconvert/html/bigmartsalesprediction.ipynb?download=false 27/27