PRJ Sales Forecasting
PRJ Sales Forecasting
Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import warnings
# Silence every warning for the rest of the session. Note that this also
# hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')
from matplotlib.pylab import rcParams
# Default (width, height) applied to every figure created afterwards.
rcParams['figure.figsize'] = 15, 6
# Show the working directory: 'Train.csv' is resolved relative to it.
display(os.getcwd())

# Read the training data and preview its first five rows.
dt = pd.read_csv(filepath_or_buffer='Train.csv')
display(dt.head(n=5))
Display Info
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in display() only added a stray "None".
dt.info()

# Number of missing values in each column.
display(dt.isnull().sum())
Store the Categorical columns in a list
# Names of all columns stored with the 'object' dtype, kept in their
# original column order. dt.dtypes is evaluated once instead of once per
# column.
cat_col = [name for name, dtype in dt.dtypes.items() if dtype == 'object']
display(cat_col)
# Number of rows with a missing Item_Weight.
display(dt['Item_Weight'].isnull().sum())

# Item_Weight_null was referenced without ever being assigned in this file.
# NOTE(review): assumed to be the rows whose Item_Weight is missing - confirm.
Item_Weight_null = dt[dt['Item_Weight'].isnull()]

# How many of those rows each Item_Identifier accounts for.
display(Item_Weight_null['Item_Identifier'].value_counts())
Find the mean of the Item_Weight column, grouped by Item_Identifier
# Show the Item_Identifier column (every row, that single column).
display(dt.loc[:, 'Item_Identifier'])
Fill the missing Item_Weight values with the mean Item_Weight of the
corresponding Item_Identifier group
# Row count per Outlet_Size category. The result is displayed explicitly;
# as a bare expression it was silently discarded.
display(dt.groupby('Outlet_Size').agg({'Outlet_Size': np.size}))

# Number of rows with a missing Outlet_Size.
display(dt['Outlet_Size'].isnull().sum())

# Outlet_Size_null was referenced without ever being assigned in this file.
# NOTE(review): assumed to be the rows whose Outlet_Size is missing - confirm.
Outlet_Size_null = dt[dt['Outlet_Size'].isnull()]

# Which outlet types those rows belong to.
display(Outlet_Size_null['Outlet_Type'].value_counts())
Create a new attribute from the first two characters of the Item_Identifier column
# First two characters of every Item_Identifier. The vectorised .str
# accessor replaces the per-row lambda and leaves a missing identifier as
# NaN instead of raising a TypeError.
dt['New_Item_Type'] = dt['Item_Identifier'].str[:2]
display(dt['New_Item_Type'])
Display Number of records in each category
# Frequency table for each of the two categorical columns, in turn.
for column in ('New_Item_Type', 'Item_Fat_Content'):
    display(dt[column].value_counts())
# Distribution of Item_MRP. distplot() is deprecated in seaborn; histplot()
# with a KDE overlay on a density scale is its closest replacement.
sns.histplot(x=dt['Item_MRP'], kde=True, stat='density')
plt.show()

# Number of rows per Outlet_Location_Type. The column is passed as `x=`
# because recent seaborn releases read the first positional argument as
# `data`, not as the variable to count.
sns.countplot(x=dt['Outlet_Location_Type'])
plt.show()
Print Correlation
# Pairwise correlation of the numeric columns only. The frame still holds
# object (string) columns at this point, and DataFrame.corr() raises on
# them in pandas >= 2.0, where non-numeric columns are no longer dropped
# silently.
corr = dt.select_dtypes(include='number').corr()
display(corr)

# Preview the frame; displayed explicitly so the output is not discarded.
display(dt.head())
Label Encoding
Create y
# Target vector: the Item_Outlet_Sales column of the training frame.
y = dt['Item_Outlet_Sales']
# Peek at the first five target values.
y.head(n=5)
def train(model, X, y):
    """Fit ``model`` on ``X``/``y`` and print a short evaluation report.

    The report contains the 5-fold cross-validated mean squared error and
    the R2 score of the fitted model's predictions on the same ``X`` it was
    trained on (an in-sample score).

    These statements previously sat at module level, where ``model`` was
    used before being assigned and ``train`` was called without being
    defined.

    NOTE(review): ``cross_val_score`` and ``r2_score`` are not imported in
    the visible part of this file - presumably scikit-learn; confirm.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X: Feature matrix.
        y: Target values aligned with ``X``.
    """
    # Fit first: the caller reads model.coef_ right after train() returns.
    model.fit(X, y)
    pred = model.predict(X)

    # perform cross-validation
    cv_score = cross_val_score(model, X, y, scoring='neg_mean_squared_error',
                               cv=5)
    # The scorer yields negated MSE values; report the magnitude of the mean.
    cv_score = np.abs(np.mean(cv_score))

    print("Model Report")
    print("CV Score:", cv_score)
    print("R2_Score:", r2_score(y, pred))
# Ridge regression model, fitted and reported through train().
# NOTE(review): the `normalize` argument was removed from Ridge in
# scikit-learn 1.2 - confirm the installed version still accepts it.
model = Ridge(normalize=True)
train(model, X_train, y_train)

# Fitted coefficients labelled by feature name, smallest first, as a bar chart.
coef = pd.Series(data=model.coef_, index=X.columns).sort_values()
coef.plot.bar(title="Model Coefficients")
plt.show()
Random Forest Regressor
Random Search CV
Param Grid
# Search space for the randomized search: one entry per tuned
# hyper-parameter, each taken from the variable of the same name.
random_grid = dict(
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
print(random_grid)
# Randomized hyper-parameter search over a random forest: 10 sampled
# candidates, 5-fold cross-validation, scored by negated mean squared error.
rf = RandomizedSearchCV(
    estimator=RandomForestRegressor(),
    param_distributions=random_grid,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

# Run the search on the training split and show the fitted search object.
display(rf.fit(X_train, y_train))
Best Parameters
# Winning hyper-parameters, then their cross-validated score
# (negated MSE, per the search's scoring setting).
for summary in (rf.best_params_, rf.best_score_):
    print(summary)

# Predict on the held-out split, then show the R2 score and the raw predictions.
predictions = rf.predict(X_test)
display(r2_score(y_test, predictions))
display(predictions)
# Randomized hyper-parameter search over a LightGBM regressor: 10 sampled
# candidates, 5-fold cross-validation, scored by negated mean squared error.
# NOTE(review): `params` is not assigned anywhere above this point in the
# visible file - confirm which search space is intended here.
lgb = LGBMRegressor()
lgb = RandomizedSearchCV(
    estimator=lgb,
    param_distributions=params,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

# Fit on the training split only. The search is scored on X_test right
# after this, so fitting on the full X/y leaked the held-out rows into
# training and inflated that score.
lgb.fit(X_train, y_train)
Best Parameters
# Winning hyper-parameters and their cross-validated score (negated MSE).
print(lgb.best_params_)
print(lgb.best_score_)

# Predict on the held-out split, then show the R2 score and the raw predictions.
predictions = lgb.predict(X_test)
display(r2_score(y_test, predictions))
display(predictions)

# Distribution of the residuals. distplot() is deprecated in seaborn;
# histplot() with a KDE overlay on a density scale is its closest replacement.
sns.histplot(x=y_test - predictions, kde=True, stat='density')
plt.show()
Model XG Boost
# Sampling distributions for the randomized search, one per hyper-parameter.
params = dict(
    gamma=uniform(0, 0.5),
    learning_rate=uniform(0.03, 0.3),  # default 0.1
    max_depth=randint(2, 6),  # default 3
    n_estimators=randint(100, 150),  # default 100
    subsample=uniform(0.6, 0.4),
)
XG Boost Regressor
# Winning hyper-parameters, then their cross-validated score.
for summary in (xgb.best_params_, xgb.best_score_):
    print(summary)

# Predict on the held-out split, then show the R2 score and the raw predictions.
predictions = xgb.predict(X_test)
display(r2_score(y_test, predictions))
display(predictions)
Create Dist plot
# Distribution of the residuals. distplot() is deprecated in seaborn;
# histplot() with a KDE overlay on a density scale is its closest replacement.
sns.histplot(x=y_test - predictions, kde=True, stat='density')
plt.show()