
Inspecting the Dataset

In [ ]:
df.head()
In [ ]:
df.info()
In [ ]:
df.describe(include='all').T

Data Preprocessing
Ensuring that no missing values are present in the dataset
In [ ]:
# Checking for missing values
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values[missing_values > 0])
Ensuring that no missing values are concealed as question marks.
In [ ]:
df.replace("?", np.nan, inplace=True)
To facilitate subsequent processing, converting every column that can be parsed as numeric into the float data type.
In [ ]:
for column in df.columns:
    try:
        df[column] = df[column].astype(float)
    except ValueError:
        # Non-numeric columns (e.g. the product type) are left unchanged
        pass
Reviewing the summary statistics of the numeric features to confirm the absence of missing values and obvious outliers.
In [ ]:
# Selecting numeric columns
numeric_df = df.select_dtypes(include=[np.number])

# Generating summary statistics
summary_statistics = numeric_df.describe().T

# Displaying the summary statistics table
summary_statistics
We remove the features UDI and Product ID, as these identifiers add no predictive value to the classification models being built.
In [ ]:
df.drop(['UDI', 'Product ID'], axis=1, inplace=True)
We can see from the dataset that the machine failure modes are represented through the features TWF, HDF, PWF, OSF, and RNF. We therefore combine these failure modes into a single feature for multiclass classification and then drop the individual failure-mode columns.
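Because the combination below assigns codes in a fixed order, a row with more than one failure flag set keeps only the last matching code. A quick hedged check (not in the original notebook) of how many rows are affected:
In [ ]:
# Counting rows where more than one failure-mode flag is set; such rows will
# receive the code of the last matching mode in the combination cell below
failure_flags = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
print("Rows with multiple simultaneous failure modes:",
      int((df[failure_flags].sum(axis=1) > 1).sum()))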
In [ ]:
df['Machine failure'] = 0

# Assigning failure codes based on the individual failure-mode flags
df.loc[df['TWF'] == 1, 'Machine failure'] = 1
df.loc[df['HDF'] == 1, 'Machine failure'] = 2
df.loc[df['PWF'] == 1, 'Machine failure'] = 3
df.loc[df['OSF'] == 1, 'Machine failure'] = 4
df.loc[df['RNF'] == 1, 'Machine failure'] = 5

# Dropping the individual failure-mode columns
df.drop(['TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis=1, inplace=True)
Performing one-hot encoding on the categorical variables, dropping the first category of each to avoid redundant columns
In [ ]:
df = pd.get_dummies(df, drop_first=True)

# Casting only the boolean dummy columns to int; casting the whole frame to
# int would truncate the continuous float features
bool_cols = df.select_dtypes(include=['bool']).columns
df[bool_cols] = df[bool_cols].astype(int)
Substituting missing numeric values with the column mean, and any remaining non-numeric gaps with the mode
In [ ]:
# Filling missing values column by column: mean for numeric columns,
# most frequent value for any remaining non-numeric columns
for column in df.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].fillna(df[column].mean())
    else:
        df[column] = df[column].fillna(df[column].mode()[0])
In [ ]:
# Displaying the updated DataFrame
df.head()

Feature Engineering
These calculations create new features based on existing columns in the DataFrame, which can potentially
provide additional insights for analysis or modeling purposes.

1. Power = Rotational speed [rpm] * Torque [Nm]: Calculates the power by multiplying the rotational
speed (in revolutions per minute) with the torque (in Newton meters).

2. Power wear = Power * Tool wear [min]: Calculates the power wear by multiplying the power with
the tool wear (in minutes).

3. Temperature difference = Process temperature [K] - Air temperature [K]: Computes the temperature
difference by subtracting the air temperature (in Kelvin) from the process temperature (in Kelvin).

4. Temperature power = Temperature difference / Power: Calculates the temperature power by dividing
the temperature difference by the power.

In [ ]:
# Calculating new features based on the existing columns
df['Power'] = df['Rotational speed [rpm]'] * df['Torque [Nm]']
df['Power wear'] = df['Power'] * df['Tool wear [min]']
df['Temperature difference'] = df['Process temperature [K]'] - df['Air temperature [K]']
df['Temperature power'] = df['Temperature difference'] / df['Power']
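One caveat worth adding: if Power were ever zero, the Temperature power division would produce infinities. An optional defensive variant (a sketch, not part of the original analysis):
In [ ]:
# Optional guard: mapping infinities from a zero Power to NaN so the
# mean-imputation strategy above could handle them if re-run
df['Temperature power'] = (
    df['Temperature difference'] / df['Power']
).replace([np.inf, -np.inf], np.nan)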
In [ ]:
df.describe(include='all').T

Data Visualizations

In [ ]:
# Counting the occurrences of each machine failure mode, ordered by failure code
failure_counts = df['Machine failure'].value_counts().sort_index()

# Defining labels for each failure type
failure_labels = {
    0: 'No Failure',
    1: 'Tool Wear Failure',
    2: 'Heat Dissipation Failure',
    3: 'Power Failure',
    4: 'Overstrain Failure',
    5: 'Random Failure'
}

# Plotting the distribution of machine failures on a logarithmic scale
plt.figure(figsize=(10, 6))
failure_counts.plot(kind='bar', color='skyblue')
plt.title('Distribution of Machine Failures (Logarithmic Scale)')
plt.xlabel('Failure Mode')
plt.ylabel('Count (Log Scale)')
plt.xticks(range(len(failure_counts)),
           [failure_labels[i] for i in failure_counts.index],
           rotation=45)  # Tick labels matched to the sorted failure codes
plt.yscale('log')  # Log scale keeps the rare failure modes visible
plt.show()

Pair Plot of Features with Respect to Machine Failure

In [ ]:
# sns.pairplot creates its own figure, so no preceding plt.figure call is needed
sns.pairplot(df, hue='Machine failure')
plt.suptitle('Relationship between Features with Respect to Machine Failure', y=1.02)
plt.show()

Correlation Heat Map

In [ ]:
# Calculating the correlation matrix
corr_matrix = df.corr().round(2)

# Creating a heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap of Features')
plt.show()

Feature Distribution Comparison Based on Machine Failure: Box Plot Analysis
In [ ]:
features = ["Air temperature [K]", "Process temperature [K]", "Rotational
speed [rpm]", "Torque [Nm]", "Tool wear [min]", "Power", "Power wear",
"Temperature difference", "Temperature power", "Type_L", "Type_M"]

# Creating a grid of boxplots to visualize the distribution of features


grouped by the "Machine Failure" variable
plt.figure(figsize=(12, 8))
for feature in features:
plt.subplot(3, 4, features.index(feature) + 1)
sns.boxplot(x="Machine failure", y=feature, data=df)
plt.tight_layout()
plt.suptitle("Feature Distribution Analysis by Machine Failure")
plt.tight_layout()
plt.show()

Standardization
By standardizing the features, we ensure that they have a mean of 0 and a standard deviation of 1, making
them suitable for algorithms that assume normally distributed data or require standardized features for
optimal performance. This is crucial because features often have vastly different ranges of values, which
can complicate the model's ability to identify relationships among them.
In [ ]:
# Separating features and target variable
X = df.drop('Machine failure', axis=1)
y = df['Machine failure']

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
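As a quick sanity check (not in the original notebook), the scaled matrix should have per-feature means near 0 and standard deviations near 1:
In [ ]:
# Verifying the standardization: per-column mean ~ 0 and std ~ 1
print("Means:", np.round(X_scaled.mean(axis=0), 6))
print("Stds:", np.round(X_scaled.std(axis=0), 6))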

Principal Component Analysis

Performing Principal Component Analysis to reduce dimensionality and preserve variance
In [ ]:
# Applying PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Checking the number of components selected
print("Number of Principal Components:", pca.n_components_)

# Concatenating the principal components with the target variable
df_pca = pd.DataFrame(data=X_pca,
                      columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])
df_pca['Machine failure'] = y.values

# Displaying the transformed DataFrame
df_pca.head()
In [ ]:
# Extracting the names of the original features
feature_names = X.columns

# Extracting the principal component axes
principal_axes = pca.components_

# Creating a DataFrame showing each feature's loading on each principal component
feature_importance_df = pd.DataFrame(
    principal_axes.T,
    columns=[f'PC{i+1}' for i in range(principal_axes.shape[0])],
    index=feature_names)

# Displaying the DataFrame
print("Feature Importance in Principal Components:")
feature_importance_df
In [ ]:
# Plotting the cumulative explained variance ratio
plt.figure(figsize=(10, 6))
plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o', linestyle='-')
plt.axhline(y=0.99, color='red', linestyle='--', label='99% Cut-Off Threshold')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio by Number of Components')
plt.grid(True)
plt.legend()
plt.show()
Based on the cumulative explained variance plot, the first 6 principal components explain 99% of the variance in the data. As part of the dimensionality reduction process, we therefore retain only these 6 components. This captures the vast majority of the variance while significantly reducing dimensionality, which offers computational benefits, can enhance model performance, and makes the data easier to interpret.
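The cut-off can also be computed programmatically instead of being read off the plot; a small sketch, assuming the full-rank pca fitted above is still in scope:
In [ ]:
# Smallest number of components whose cumulative explained variance reaches 99%
cumulative = np.cumsum(pca.explained_variance_ratio_)
print("Components needed for 99% variance:", int(np.argmax(cumulative >= 0.99)) + 1)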
In [ ]:
pca = PCA(n_components=6)
X_pca = pca.fit_transform(X_scaled)

Feature Selection
We also implement the machine learning models using only the features produced by our feature engineering process: Power, Power wear, Temperature difference, and Temperature power. These features were derived to capture the information most relevant to the failure modes observed in the manufacturing process.

Specifically, the engineered features map onto the distinct failure modes: tool wear failure (TWF), heat dissipation failure (HDF), power failure (PWF), overstrain failure (OSF), and random failures (RNF). TWF occurs when the tool reaches a randomly selected wear time between 200 and 240 minutes, resulting in either replacement or failure. HDF occurs when the temperature difference between air and process falls below 8.6 K while the rotational speed is below 1380 rpm. PWF occurs when the power required for the process, derived from the product of torque and rotational speed, falls outside the range of 3500 W to 9000 W. OSF is triggered when the product of tool wear and torque exceeds a threshold specific to the product variant. Finally, a small percentage of random failures (RNF) occur independently of the process parameters.
In [ ]:
selected_features = ['Power', 'Power wear', 'Temperature difference',
                     'Temperature power']
X_fs = df[selected_features]
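As a hedged cross-check of the feature engineering (not in the original notebook), two of the failure rules described above can be recomputed from the data and compared against the combined labels. Note that the 3500 W to 9000 W range is defined on power in watts, i.e. torque times angular velocity in rad/s, so the rotational speed is converted before the comparison:
In [ ]:
# Recomputing the HDF and PWF rules from the raw columns
power_watts = df['Torque [Nm]'] * df['Rotational speed [rpm]'] * 2 * np.pi / 60
hdf_rule = (df['Temperature difference'] < 8.6) & (df['Rotational speed [rpm]'] < 1380)
pwf_rule = (power_watts < 3500) | (power_watts > 9000)
print("Rows matching the HDF rule:", int(hdf_rule.sum()),
      "| rows labelled HDF:", int((df['Machine failure'] == 2).sum()))
print("Rows matching the PWF rule:", int(pwf_rule.sum()),
      "| rows labelled PWF:", int((df['Machine failure'] == 3).sum()))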
Data Splitting
The dataset is split into train and test sets in an 80:20 ratio
In [ ]:
# Splitting the dataset into train and test sets (80% train, 20% test) for PCA
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca, y, test_size=0.2, random_state=42)
In [ ]:
# Splitting the dataset into train and test sets (80% train, 20% test) for LDA
X_train_lda, X_test_lda, y_train_lda, y_test_lda = train_test_split(
    X, y, test_size=0.2, random_state=42)
In [ ]:
# Splitting the dataset into train and test sets (80% train, 20% test) for Feature Selection
X_train_fs, X_test_fs, y_train_fs, y_test_fs = train_test_split(
    X_fs, y, test_size=0.2, random_state=42)
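One aside: with labels this imbalanced, a plain random split can leave the rarest failure codes underrepresented in the test set. A stratified variant (an optional alternative, not what this notebook uses) would be:
In [ ]:
# Optional: a stratified split preserves the class proportions of y in both sets
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_pca, y, test_size=0.2, random_state=42, stratify=y)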

Data Imbalance Handling

Performing oversampling only on the training data to avoid data leakage
In [ ]:
ros = RandomOverSampler(random_state=42)

Data Imbalance Handling for Principal Component Analysis (PCA)

In [ ]:
# Applying RandomOverSampler to address class imbalance, only on the PCA training data
X_train_resampled_pca, y_train_resampled_pca = ros.fit_resample(
    X_train_pca, y_train_pca)

# Checking the shape of the resampled PCA training data
print("Shape of X_train_resampled_pca:", X_train_resampled_pca.shape)
print("Shape of y_train_resampled_pca:", y_train_resampled_pca.shape)

Data Imbalance Handling for Linear Discriminant Analysis (LDA)

In [ ]:
# Applying RandomOverSampler to address class imbalance, only on the LDA training data
X_train_resampled_lda, y_train_resampled_lda = ros.fit_resample(
    X_train_lda, y_train_lda)

# Checking the shape of the resampled LDA training data
print("Shape of X_train_resampled_lda:", X_train_resampled_lda.shape)
print("Shape of y_train_resampled_lda:", y_train_resampled_lda.shape)

Data Imbalance Handling for Feature Selection

In [ ]:
# Applying RandomOverSampler to address class imbalance, only on the
# feature-selection training data
X_train_resampled_fs, y_train_resampled_fs = ros.fit_resample(
    X_train_fs, y_train_fs)

# Checking the shape of the resampled feature-selection training data
print("Shape of X_train_resampled_fs:", X_train_resampled_fs.shape)
print("Shape of y_train_resampled_fs:", y_train_resampled_fs.shape)

Linear Discriminant Analysis

In [ ]:
# Applying Linear Discriminant Analysis (LDA)
lda = LinearDiscriminantAnalysis()

X_train_resampled_lda = lda.fit_transform(X_train_resampled_lda,
                                          y_train_resampled_lda)
X_test_lda = lda.transform(X_test_lda)
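Worth noting: LDA yields at most n_classes − 1 discriminant axes, so with six failure codes the transformed data has at most five columns. A quick hedged check:
In [ ]:
# LDA produces at most (number of classes - 1) discriminant components
print("LDA components:", X_train_resampled_lda.shape[1])
print("Explained variance ratio:", lda.explained_variance_ratio_)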

Logistic Regression

In [ ]:
# Logistic Regression model with GridSearchCV for hyperparameter tuning
# (the same grid search object is reused for all three feature sets)
logistic_regression = LogisticRegression(max_iter=1000)
param_grid_lr = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search_lr = GridSearchCV(estimator=logistic_regression,
                              param_grid=param_grid_lr, cv=5,
                              scoring='f1_weighted')

Logistic Regression with Principal Component Analysis (PCA)

In [ ]:
# Fitting GridSearchCV to the training data
grid_search_lr.fit(X_train_resampled_pca, y_train_resampled_pca)

# Extracting the best parameters
best_params_pca = grid_search_lr.best_params_

# Training Logistic Regression with the best parameters
logistic_regression_best_pca = LogisticRegression(max_iter=1000,
                                                  C=best_params_pca['C'])
logistic_regression_best_pca.fit(X_train_resampled_pca, y_train_resampled_pca)

# Evaluation on the test set
y_pred_lr_pca = logistic_regression_best_pca.predict(X_test_pca)
print("Logistic Regression with Principal Component Analysis (PCA) Classification Report:")
print(classification_report(y_test_pca, y_pred_lr_pca))
In [ ]:
# Generating the Confusion Matrix for the test set
cm_lr_pca = confusion_matrix(y_test_pca, y_pred_lr_pca)

# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_lr_pca, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y_test_pca),
            yticklabels=np.unique(y_test_pca))
plt.title('Confusion Matrix for Logistic Regression with Principal Component Analysis (PCA)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

Logistic Regression with Linear Discriminant Analysis (LDA)

In [ ]:
# Fitting GridSearchCV to the training data
grid_search_lr.fit(X_train_resampled_lda, y_train_resampled_lda)

# Extracting the best parameters
best_params_lda = grid_search_lr.best_params_

# Training Logistic Regression with the best parameters
logistic_regression_best_lda = LogisticRegression(max_iter=1000,
                                                  C=best_params_lda['C'])
logistic_regression_best_lda.fit(X_train_resampled_lda, y_train_resampled_lda)

# Evaluation on the test set
y_pred_lr_lda = logistic_regression_best_lda.predict(X_test_lda)
print("Logistic Regression with Linear Discriminant Analysis (LDA) Classification Report:")
print(classification_report(y_test_lda, y_pred_lr_lda))
In [ ]:
# Generating the Confusion Matrix for the test set
cm_lr_lda = confusion_matrix(y_test_lda, y_pred_lr_lda)

# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_lr_lda, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y_test_lda),
            yticklabels=np.unique(y_test_lda))
plt.title('Confusion Matrix for Logistic Regression with Linear Discriminant Analysis (LDA)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

Logistic Regression with Feature Selection

In [ ]:
# Fitting GridSearchCV to the training data
grid_search_lr.fit(X_train_resampled_fs, y_train_resampled_fs)

# Extracting the best parameters
best_params_fs = grid_search_lr.best_params_

# Training Logistic Regression with the best parameters
logistic_regression_best_fs = LogisticRegression(max_iter=1000,
                                                 C=best_params_fs['C'])
logistic_regression_best_fs.fit(X_train_resampled_fs, y_train_resampled_fs)

# Evaluation on the test set
y_pred_lr_fs = logistic_regression_best_fs.predict(X_test_fs)
print("Logistic Regression with Feature Selection Classification Report:")
print(classification_report(y_test_fs, y_pred_lr_fs))
In [ ]:
# Generating the Confusion Matrix for the test set
cm_lr_fs = confusion_matrix(y_test_fs, y_pred_lr_fs)

# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_lr_fs, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y_test_fs),
            yticklabels=np.unique(y_test_fs))
plt.title('Confusion Matrix for Logistic Regression with Feature Selection')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

Random Forest

In [ ]:
# Random Forest Classifier model with GridSearchCV for hyperparameter tuning
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(estimator=rf_classifier,
                              param_grid=param_grid_rf, cv=5,
                              scoring='f1_weighted')

Random Forest with Principal Component Analysis (PCA)

In [ ]:
# Fitting GridSearchCV to the training data
grid_search_rf.fit(X_train_resampled_pca, y_train_resampled_pca)

# Extracting the best parameters and the best estimator
best_params_rf_pca = grid_search_rf.best_params_
best_estimator_rf_pca = grid_search_rf.best_estimator_

# Evaluation on the test set
y_pred_rf_pca = best_estimator_rf_pca.predict(X_test_pca)
print("Random Forest with Principal Component Analysis (PCA) Classification Report:")
print(classification_report(y_test_pca, y_pred_rf_pca))
In [ ]:
# Generating the Confusion Matrix for the test set
cm_rf_pca = confusion_matrix(y_test_pca, y_pred_rf_pca)

# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_rf_pca, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y_test_pca),
            yticklabels=np.unique(y_test_pca))
plt.title('Confusion Matrix for Random Forest with Principal Component Analysis (PCA)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

Random Forest with Linear Discriminant Analysis (LDA)

In [ ]:
# Fitting GridSearchCV to the training data
grid_search_rf.fit(X_train_resampled_lda, y_train_resampled_lda)

# Extracting the best parameters and the best estimator
best_params_rf_lda = grid_search_rf.best_params_
best_estimator_rf_lda = grid_search_rf.best_estimator_

# Evaluation on the test set
y_pred_rf_lda = best_estimator_rf_lda.predict(X_test_lda)
print("Random Forest with Linear Discriminant Analysis (LDA) Classification Report:")
print(classification_report(y_test_lda, y_pred_rf_lda))
In [ ]:
# Generating the Confusion Matrix for the test set
cm_rf_lda = confusion_matrix(y_test_lda, y_pred_rf_lda)

# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_rf_lda, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y_test_lda),
            yticklabels=np.unique(y_test_lda))
plt.title('Confusion Matrix for Random Forest with Linear Discriminant Analysis (LDA)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

Random Forest with Feature Selection

In [ ]:
# Fitting GridSearchCV to the training data
grid_search_rf.fit(X_train_resampled_fs, y_train_resampled_fs)

# Extracting the best parameters and the best estimator
best_params_rf_fs = grid_search_rf.best_params_
best_estimator_rf_fs = grid_search_rf.best_estimator_

# Evaluation on the test set
y_pred_rf_fs = best_estimator_rf_fs.predict(X_test_fs)
print("Random Forest with Feature Selection Classification Report:")
print(classification_report(y_test_fs, y_pred_rf_fs))
In [ ]:
# Generating the Confusion Matrix for the test set
cm_rf_fs = confusion_matrix(y_test_fs, y_pred_rf_fs)

# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_rf_fs, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y_test_fs),
            yticklabels=np.unique(y_test_fs))
plt.title('Confusion Matrix for Random Forest with Feature Selection')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

Decision Tree

In [ ]:
# Decision Tree Classifier model with GridSearchCV for hyperparameter tuning
dt_classifier = DecisionTreeClassifier(random_state=42)
param_grid_dt = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_dt = GridSearchCV(estimator=dt_classifier,
                              param_grid=param_grid_dt, cv=5,
                              scoring='f1_weighted')

Decision Tree with Principal Component Analysis (PCA)

In [ ]:
# Fitting GridSearchCV to the training data
grid_search_dt.fit(X_train_resampled_pca, y_train_resampled_pca)

# Extracting the best parameters and the best estimator
best_params_dt_pca = grid_search_dt.best_params_
best_estimator_dt_pca = grid_search_dt.best_estimator_

# Evaluation on the test set
y_pred_dt_pca = best_estimator_dt_pca.predict(X_test_pca)
print("Decision Tree with Principal Component Analysis (PCA) Classification Report:")
print(classification_report(y_test_pca, y_pred_dt_pca))
In [ ]:
# Generating the Confusion Matrix for the test set
cm_dt_pca = confusion_matrix(y_test_pca, y_pred_dt_pca)

# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_dt_pca, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y_test_pca),
            yticklabels=np.unique(y_test_pca))
plt.title('Confusion Matrix for Decision Tree with Principal Component Analysis (PCA)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

Decision Tree with Linear Discriminant Analysis (LDA)

In [ ]:
# Fitting GridSearchCV to the training data
grid_search_dt.fit(X_train_resampled_lda, y_train_resampled_lda)

# Extracting the best parameters and the best estimator
best_params_dt_lda = grid_search_dt.best_params_
best_estimator_dt_lda = grid_search_dt.best_estimator_

# Evaluation on the test set
y_pred_dt_lda = best_estimator_dt_lda.predict(X_test_lda)
print("Decision Tree with Linear Discriminant Analysis (LDA) Classification Report:")
print(classification_report(y_test_lda, y_pred_dt_lda))
In [ ]:
# Generating the Confusion Matrix for the test set
cm_dt_lda = confusion_matrix(y_test_lda, y_pred_dt_lda)

# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_dt_lda, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y_test_lda),
            yticklabels=np.unique(y_test_lda))
plt.title('Confusion Matrix for Decision Tree with Linear Discriminant Analysis (LDA)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

Decision Tree with Feature Selection

In [ ]:
# Fitting GridSearchCV to the training data
grid_search_dt.fit(X_train_resampled_fs, y_train_resampled_fs)

# Extracting the best parameters and the best estimator
best_params_dt_fs = grid_search_dt.best_params_
best_estimator_dt_fs = grid_search_dt.best_estimator_

# Evaluation on the test set
y_pred_dt_fs = best_estimator_dt_fs.predict(X_test_fs)
print("Decision Tree with Feature Selection Classification Report:")
print(classification_report(y_test_fs, y_pred_dt_fs))
In [ ]:
# Generating the Confusion Matrix for the test set
cm_dt_fs = confusion_matrix(y_test_fs, y_pred_dt_fs)

# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_dt_fs, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y_test_fs),
            yticklabels=np.unique(y_test_fs))
plt.title('Confusion Matrix for Decision Tree with Feature Selection')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

XGBoost

In [ ]:
xgb_classifier = XGBClassifier(random_state=42)

# Note: this grid spans 3^6 = 729 parameter combinations, so with 5-fold CV
# the search fits 3,645 models and can take a while
param_grid_xgb = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}
grid_search_xgb = GridSearchCV(estimator=xgb_classifier,
                               param_grid=param_grid_xgb, cv=5,
                               scoring='f1_weighted')

XGBoost with Principal Component Analysis (PCA)

In [ ]:
# Fitting GridSearchCV to the training data
grid_search_xgb.fit(X_train_resampled_pca, y_train_resampled_pca)

# Extracting the best parameters and the best estimator
best_params_xgb_pca = grid_search_xgb.best_params_
best_estimator_xgb_pca = grid_search_xgb.best_estimator_

# Evaluation on the test set
y_pred_xgb_pca = best_estimator_xgb_pca.predict(X_test_pca)
print("XGBoost with Principal Component Analysis (PCA) Classification Report:")
print(classification_report(y_test_pca, y_pred_xgb_pca))
In [ ]:
# Generating the Confusion Matrix for the test set
cm_xgb_pca = confusion_matrix(y_test_pca, y_pred_xgb_pca)

# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_xgb_pca, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y_test_pca),
            yticklabels=np.unique(y_test_pca))
plt.title('Confusion Matrix for XGBoost with Principal Component Analysis (PCA)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

XGBoost with Linear Discriminant Analysis (LDA)

In [ ]:
# Fitting GridSearchCV to the training data
grid_search_xgb.fit(X_train_resampled_lda, y_train_resampled_lda)

# Extracting the best parameters and the best estimator
best_params_xgb_lda = grid_search_xgb.best_params_
best_estimator_xgb_lda = grid_search_xgb.best_estimator_

# Evaluation on the test set
y_pred_xgb_lda = best_estimator_xgb_lda.predict(X_test_lda)
print("XGBoost with Linear Discriminant Analysis (LDA) Classification Report:")
print(classification_report(y_test_lda, y_pred_xgb_lda))
In [ ]:
# Generating the Confusion Matrix for the test set
cm_xgb_lda = confusion_matrix(y_test_lda, y_pred_xgb_lda)

# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_xgb_lda, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y_test_lda),
            yticklabels=np.unique(y_test_lda))
plt.title('Confusion Matrix for XGBoost with Linear Discriminant Analysis (LDA)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

XGBoost with Feature Selection

In [ ]:
# Fitting GridSearchCV to the training data
grid_search_xgb.fit(X_train_resampled_fs, y_train_resampled_fs)

# Extracting the best parameters and the best estimator
best_params_xgb_fs = grid_search_xgb.best_params_
best_estimator_xgb_fs = grid_search_xgb.best_estimator_

# Evaluation on the test set
y_pred_xgb_fs = best_estimator_xgb_fs.predict(X_test_fs)
print("XGBoost with Feature Selection Classification Report:")
print(classification_report(y_test_fs, y_pred_xgb_fs))
In [ ]:
# Generating the Confusion Matrix for the test set
cm_xgb_fs = confusion_matrix(y_test_fs, y_pred_xgb_fs)

# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_xgb_fs, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y_test_fs),
            yticklabels=np.unique(y_test_fs))
plt.title('Confusion Matrix for XGBoost with Feature Selection')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

Model Evaluation and Comparison

In [ ]:
# Weighted-average F1-score for each model
f1_scores_pca = {
    'Logistic Regression': f1_score(y_test_pca, y_pred_lr_pca, average='weighted'),
    'Random Forest': f1_score(y_test_pca, y_pred_rf_pca, average='weighted'),
    'Decision Tree': f1_score(y_test_pca, y_pred_dt_pca, average='weighted'),
    'XGBoost': f1_score(y_test_pca, y_pred_xgb_pca, average='weighted')
}

f1_scores_lda = {
    'Logistic Regression': f1_score(y_test_lda, y_pred_lr_lda, average='weighted'),
    'Random Forest': f1_score(y_test_lda, y_pred_rf_lda, average='weighted'),
    'Decision Tree': f1_score(y_test_lda, y_pred_dt_lda, average='weighted'),
    'XGBoost': f1_score(y_test_lda, y_pred_xgb_lda, average='weighted')
}

f1_scores_fs = {
    'Logistic Regression': f1_score(y_test_fs, y_pred_lr_fs, average='weighted'),
    'Random Forest': f1_score(y_test_fs, y_pred_rf_fs, average='weighted'),
    'Decision Tree': f1_score(y_test_fs, y_pred_dt_fs, average='weighted'),
    'XGBoost': f1_score(y_test_fs, y_pred_xgb_fs, average='weighted')
}

# Extracting model names and F1-scores for PCA, LDA, and Feature Selection
models = list(f1_scores_pca.keys())
f1_pca = list(f1_scores_pca.values())
f1_lda = list(f1_scores_lda.values())
f1_fs = list(f1_scores_fs.values())

# Setting the width of the bars
bar_width = 0.25

# Setting the position of each bar group on the x-axis
r1 = np.arange(len(models))
r2 = [x + bar_width for x in r1]
r3 = [x + bar_width * 2 for x in r1]

# Plotting the bar graph
plt.figure(figsize=(10, 6))
plt.bar(r1, f1_pca, color='blue', width=bar_width, edgecolor='grey',
        label='Principal Component Analysis')
plt.bar(r2, f1_lda, color='green', width=bar_width, edgecolor='grey',
        label='Linear Discriminant Analysis')
plt.bar(r3, f1_fs, color='orange', width=bar_width, edgecolor='grey',
        label='Feature Selection')

# Adding labels
plt.xlabel('Models')
plt.ylabel('F1-Score Weighted Average')
plt.xticks([r + bar_width for r in range(len(models))], models)
plt.ylim(0, 1)
plt.title('F1-Score Weighted Average for Different Models (PCA vs LDA vs Feature Selection)')
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Showing the plot
plt.show()
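As a compact complement to the bar chart (a hedged addition reusing the three score dictionaries defined above), the same results can be tabulated:
In [ ]:
# Tabulating the weighted F1-scores for a side-by-side comparison
comparison_df = pd.DataFrame({
    'PCA': f1_scores_pca,
    'LDA': f1_scores_lda,
    'Feature Selection': f1_scores_fs
}).round(4)
comparison_df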
