Load Prediction With 20 Models
Prediction
Inspiration
About Dataset
Date: timestamp of each observation, recorded continuously at 15-minute intervals throughout 2018
import warnings
warnings.filterwarnings("ignore")

# Core libraries used throughout the notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Color Palettes
colors = ["#bfd3e6", "#9b5b4f", "#4e4151", "#dbba78", "#bb9c55", "#909195", "#dc1e1e", "#a02933", "#716807", "#717cb4"]
sns.palplot(sns.color_palette(colors))
# Default theme
sns.set_theme(palette='tab10',
              font='Comic Sans MS',
              font_scale=1.5,
              rc=None)
import matplotlib
matplotlib.rcParams.update({'font.size': 15})
plt.style.use('dark_background')
plt.rcParams["axes.grid"] = False
df = pd.read_csv("/kaggle/input/steel-industry-energy-consumption/Steel_industry_data.csv")
df.head().style.background_gradient(cmap='copper').set_precision(2)
   date              Usage_kWh  Lagging_Current_Reactive.Power_kVarh  Leading_Current_Reactive_Power_kVarh  CO2(tCO2)  Lagging_Current_Power_Factor
0  01/01/2018 00:15       3.17                                   2.95                                   0.00       0.00                         73.21
1  01/01/2018 00:30       4.00                                   4.46                                   0.00       0.00                         66.77
2  01/01/2018 00:45       3.24                                   3.28                                   0.00       0.00                         70.28
3  01/01/2018 01:00       3.31                                   3.56                                   0.00       0.00                         68.09
4  01/01/2018 01:15       3.82                                   4.50                                   0.00       0.00                         64.72
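The date column holds day-first strings at 15-minute resolution; parsing it explicitly avoids day/month ambiguity. A minimal sketch, not part of the original notebook:
# Illustrative only: parse the day-first timestamps without modifying df
dates = pd.to_datetime(df['date'], format='%d/%m/%Y %H:%M')
print(dates.min(), dates.max())  # quick sanity check on the covered period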
corr = df.corr()
corr.style.background_gradient(cmap='copper').set_precision(2)
df1 = df.copy()
# Correlation with the response variable (Usage_kWh)
X = df1.drop(['Usage_kWh'], axis=1)
y = df1['Usage_kWh']
X.corrwith(y).plot.bar(
    figsize=(16, 5), title="Correlation with Steel Plant Load Distribution", fontsize=15,
    rot=45, grid=False)
plt.show()
class color:
    BOLD = '\033[1m'
    END = '\033[0m'
print(f"\033[94m\033[1m")
print(color.BOLD + 'Missing values - Percentage: \n' + color.END)
print(f"\033[91m\033[1m")
print(round(df.isnull().mean() * 100, 2))
Missing values - Percentage:
date 0.0
Usage_kWh 0.0
Lagging_Current_Reactive.Power_kVarh 0.0
Leading_Current_Reactive_Power_kVarh 0.0
CO2(tCO2) 0.0
Lagging_Current_Power_Factor 0.0
Leading_Current_Power_Factor 0.0
NSM 0.0
WeekStatus 0.0
Day_of_week 0.0
Load_Type 0.0
dtype: float64
cat = df.select_dtypes(include='object').columns.tolist()
for c in cat:
    print(df[c].value_counts())
    print('=' * 35)
date
01/01/2018 00:15 1
01/09/2018 08:45 1
01/09/2018 07:15 1
01/09/2018 07:30 1
01/09/2018 07:45 1
..
02/05/2018 14:45 1
02/05/2018 14:30 1
02/05/2018 14:15 1
02/05/2018 14:00 1
31/12/2018 00:00 1
Name: date, Length: 35040, dtype: int64
===================================
WeekStatus
Weekday 25056
Weekend 9984
Name: WeekStatus, dtype: int64
===================================
Day_of_week
Monday 5088
Tuesday 4992
Wednesday 4992
Thursday 4992
Friday 4992
Saturday 4992
Sunday 4992
Name: Day_of_week, dtype: int64
===================================
Load_Type
Light_Load 18072
Medium_Load 9696
Maximum_Load 7272
Name: Load_Type, dtype: int64
===================================
   date              Usage_kWh  Lagging_Current_Reactive.Power_kVarh  Leading_Current_Reactive_Power_kVarh  CO2(tCO2)  Lagging_Current_Power_Factor  Leading_Current_Power_Factor   NSM
0  01/01/2018 00:15       3.17                                   2.95                                    0.0        0.0                         73.21                         100.0   900
1  01/01/2018 00:30       4.00                                   4.46                                    0.0        0.0                         66.77                         100.0  1800
2  01/01/2018 00:45       3.24                                   3.28                                    0.0        0.0                         70.28                         100.0  2700
3  01/01/2018 01:00       3.31                                   3.56                                    0.0        0.0                         68.09                         100.0  3600
4  01/01/2018 01:15       3.82                                   4.50                                    0.0        0.0                         64.72                         100.0  4500
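Later cells reference shortened column names (e.g. Lagging_Reactive_Power_kVarh); the renaming cell itself is not shown, but it presumably looked something like this sketch:
# Assumed renaming step (not shown in the original): later cells use these shorter names
df = df.rename(columns={
    'Lagging_Current_Reactive.Power_kVarh': 'Lagging_Reactive_Power_kVarh',
    'Leading_Current_Reactive_Power_kVarh': 'Leading_Reactive_Power_kVarh',
    'Lagging_Current_Power_Factor': 'Lagging_Power_Factor',
    'Leading_Current_Power_Factor': 'Leading_Power_Factor',
})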
plt.figure(figsize=(18,4))
color = plt.cm.copper(np.linspace(0, 1, 10))
df.groupby(['WeekStatus','Day_of_week'])['Usage_kWh'].count().plot(kind='bar', width=.4,color=color);
plt.xticks(rotation=45);
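The scatter plots below draw on axes ax2 through ax8, but the cell that creates the figure is not shown; a minimal sketch of the assumed setup:
# Assumed figure setup (not shown in the original): a 2x4 grid providing ax1..ax8; figsize is arbitrary
fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(2, 4, figsize=(24, 10))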
ax2.scatter(data=df,x="Usage_kWh",y="Leading_Reactive_Power_kVarh", color=colors[8])
ax2.set_title("Usage(kWh) vs Leading Reactive Power(kVarh)",pad=20)
ax2.set_xlabel("Usage(kWh)")
ax2.set_ylabel("Leading Reactive Power (kVarh)")
ax4.scatter(data=df,x="Usage_kWh",y="Leading_Power_Factor", color=colors[9])
ax4.set_title("Usage(kWh) vs Leading Power Factor",pad=20)
ax4.set_xlabel("Usage(kWh)")
ax4.set_ylabel("Leading Power Factor")
ax5.scatter(data=df,x="Lagging_Reactive_Power_kVarh",y="Leading_Reactive_Power_kVarh", color=colors[2])
ax5.set_title("Lagging Reactive Power (kVarh) vs Leading Reactive Power(kVarh)",pad=20,fontsize=15)
ax5.set_xlabel("Lagging Reactive Power (kVarh)")
ax5.set_ylabel("Leading Reactive Power(kVarh)")
ax6.scatter(data=df,x="Lagging_Power_Factor",y="Leading_Power_Factor", color=colors[4])
ax6.set_title("Lagging Power Factor vs Leading Power Factor",pad=20,fontsize=15)
ax6.set_xlabel("Lagging Power Factor")
ax6.set_ylabel("Leading Power Factor")
ax7.scatter(data=df,x="Lagging_Reactive_Power_kVarh",y="Lagging_Power_Factor", color=colors[5])
ax7.set_title("Lagging Reactive Power (kVarh) vs Lagging Power Factor",pad=20,fontsize=15)
ax7.set_xlabel("Lagging Reactive Power (kVarh)")
ax7.set_ylabel("Lagging Power Factor")
ax8.scatter(data=df,x="Lagging_Reactive_Power_kVarh",y="Leading_Power_Factor", color=colors[4])
ax8.set_title("Lagging Reactive Power (kVarh) vs Leading Power Factor",pad=20,fontsize=15)
ax8.set_xlabel("Lagging Reactive Power (kVarh)")
ax8.set_ylabel("Leading Power Factor")
plt.show()
var = df['Lagging_Reactive_Power_kVarh']
color = colors[4]
fig = plt.figure(figsize = (18, 12))
plt.show();
plt.figure(figsize=(18,7))
sns.scatterplot(data=df, x="Usage_kWh", y="Lagging_Reactive_Power_kVarh", hue="Load_Type",palette="tab10");
plt.figure(figsize=(18,10))
plt.subplot(2,2,1)
sns.barplot(x = 'Load_Type', y = 'Usage_kWh', palette= "tab10",data=df)
plt.title("Load Type", color = "#bfd3e6")
plt.xlabel("Load_Type")
plt.ylabel("Usage_kWh")
plt.subplot(2,2,2)
df["WeekStatus"].value_counts().plot.pie(autopct='%1.2f%%', explode=[0.1, 0.1], colors=['blue','#dbba78'])
p = plt.gcf()
plt.title("Weekday/Weekend")
plt.legend()
plt.subplot(2,2,(3,4))
col = ['Usage_kWh']
fig, ax = plt.subplots(2, 1, sharex=True, figsize=(17,8),gridspec_kw={"height_ratios": (.2, .8)})
ax[0].set_title('Usage_kWh distribution',fontsize=18,pad=20)
sns.boxplot(x='Usage_kWh', data=df, ax=ax[0])
ax[0].set(yticks=[])
sns.histplot(x='Usage_kWh', data=df, ax=ax[1])
ax[1].set_xlabel(col[0], fontsize=16)
plt.axvline(df['Usage_kWh'].mean(), color='darkgreen', linewidth=2.2, label='mean=' + str(np.round(df['Usage_kWh'].mean(), 2)))
plt.axvline(df['Usage_kWh'].median(), color='red', linewidth=2.2, label='median=' + str(np.round(df['Usage_kWh'].median(), 2)))
plt.axvline(df['Usage_kWh'].mode()[0], color='purple', linewidth=2.2, label='mode=' + str(df['Usage_kWh'].mode()[0]))
plt.legend(bbox_to_anchor=(1, 1.03), ncol=1, fontsize=17, fancybox=True, shadow=True, frameon=True)
plt.tight_layout()
plt.show()
col_names = ['Lagging_Reactive_Power_kVarh', 'Leading_Reactive_Power_kVarh', 'Lagging_Power_Factor', 'Leading_Power_Factor']
fig, axs = plt.subplots(nrows=2,ncols=3,figsize=(20,10))
for i in range(0, len(col_names)):
    rows = i // 3
    cols = i % 3
    ax = axs[rows, cols]
    plot = sns.regplot(x=col_names[i], y='Usage_kWh', data=df, ax=ax)
old_skew = df.skew().sort_values(ascending=False)
old_skew
from sklearn.preprocessing import FunctionTransformer

logTr = FunctionTransformer(np.log1p)  # assumed log transformer; its definition is not shown in the original

def logTrans(feature):  # apply the transformer and compare the distribution before/after with histogram and KDE
    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    plt.title("Distribution before Transformation", fontsize=20, color='red')
    sns.histplot(df[feature], kde=True, color="red")
    plt.xlabel(feature, color='Red')
    plt.subplot(1, 2, 2)
    df_log = pd.DataFrame(logTr.fit_transform(df[[feature]]), columns=[feature])  # transform the selected feature only
    plt.title("Distribution after Transformation", fontsize=20, color='Blue')
    sns.histplot(df_log[feature], bins=20, kde=True, legend=False)
    plt.xlabel(feature, color='Blue')
    plt.show()
logTrans(feature="Lagging_Reactive_Power_kVarh")
plt.figure(figsize=(18,4))
sns.kdeplot(data=df,x="Usage_kWh",hue='Load_Type',multiple="stack");
# Encode Categorical Columns
from sklearn.preprocessing import LabelEncoder
categ = df.select_dtypes(include = "object").columns
le = LabelEncoder()
df[categ] = df[categ].apply(le.fit_transform)
df.head()
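To keep the encoded target readable in the results that follow, the Load_Type mapping can be recovered from the un-encoded copy df1; this check is not in the original notebook:
# Illustrative check: LabelEncoder sorts classes alphabetically, so the expected mapping is
# {'Light_Load': 0, 'Maximum_Load': 1, 'Medium_Load': 2}
le_load = LabelEncoder().fit(df1['Load_Type'])
print(dict(zip(le_load.classes_, le_load.transform(le_load.classes_))))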
# Split the dataset and prepare some lists to store the models
from sklearn.model_selection import train_test_split
X = df.drop(['Load_Type'], axis=1)
y = df.Load_Type
# Assumed split (not shown in the original); the reported accuracies are consistent with a 25% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
models = []
names = [
"LGBMClassifier",
"RidgeClassifierCV",
"XGBClassifier",
"QuadraticDiscriminantAnalysis",
"CalibratedClassifierCV",
"BernoulliNB",
"BaggingClassifier",
"LogisticRegression",
"NearestCentroid",
"SVC",
"LinearSVC",
"KNeighborsClassifier",
"GaussianNB",
"Perceptron",
"SGDClassifier",
"DecisionTreeClassifier",
"RandomForestClassifier",
"MLPClassifier",
"ExtraTreesClassifier",
"AdaBoostClassifier",
"NuSVC"
]
scores = []
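The import cell for these classifiers is not shown; the imports would be along the following lines (lightgbm and xgboost are assumed to be available on the Kaggle image):
# Imports assumed for the model list below (the original import cell is not shown)
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import RidgeClassifierCV, LogisticRegression, Perceptron, SGDClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier,
                              ExtraTreesClassifier, AdaBoostClassifier)
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier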
clf =[
LGBMClassifier(),
RidgeClassifierCV(),
XGBClassifier(),
QuadraticDiscriminantAnalysis(),
CalibratedClassifierCV(),
BernoulliNB(),
BaggingClassifier(),
LogisticRegression(),
NearestCentroid(),
SVC(),
LinearSVC(),
KNeighborsClassifier(),
GaussianNB(),
Perceptron(),
SGDClassifier(),
DecisionTreeClassifier(),
RandomForestClassifier(),
MLPClassifier(),
ExtraTreesClassifier(),
AdaBoostClassifier(),
NuSVC()
]
%%time
for model in clf:
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scores.append(score)
final_scores = pd.DataFrame({'Classifier': names, 'Accuracy': scores})  # assemble the results table (construction not shown in the original)
final_scores.sort_values(by='Accuracy', ascending=False).style.background_gradient(cmap="copper").set_properties(**{
    'font-family': 'Comic Sans MS',
    'color': 'Brown',
    'font-size': '15px'
})
Classifier Accuracy
2 XGBClassifier 0.965068
0 LGBMClassifier 0.956963
6 BaggingClassifier 0.952740
15 DecisionTreeClassifier 0.936872
16 RandomForestClassifier 0.935274
18 ExtraTreesClassifier 0.921918
19 AdaBoostClassifier 0.855594
11 KNeighborsClassifier 0.796119
1 RidgeClassifierCV 0.744064
9 SVC 0.736986
20 NuSVC 0.729566
3 QuadraticDiscriminantAnalysis 0.713128
5 BernoulliNB 0.704452
12 GaussianNB 0.691210
7 LogisticRegression 0.671461
8 NearestCentroid 0.665525
10 LinearSVC 0.632534
17 MLPClassifier 0.613356
4 CalibratedClassifierCV 0.606050
14 SGDClassifier 0.605137
13 Perceptron 0.524087
plt.figure(figsize=(18, 20))
sns.set_context('paper', font_scale=1.8)
final_scores = final_scores.sort_values(by='Accuracy', ascending=False)[:20]
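The notebook presumably closes by charting these sorted accuracies; a minimal sketch of such a plot (the actual plotting cell is not shown):
# Sketch of the final comparison chart (assumed; not in the original)
sns.barplot(x='Accuracy', y='Classifier', data=final_scores, palette='copper')
plt.title('Model accuracy comparison')
plt.xlim(0, 1)
plt.show()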