
code 1

This document walks through a data analysis workflow that uses Python libraries to load fault datasets, visualize the fault distribution, and preprocess the data for machine learning. It applies PCA for dimensionality reduction, trains Random Forest and SVM classifiers, and evaluates both with confusion matrices, accuracy scores, and classification reports.


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection on older Matplotlib

# Set plot styles
plt.rcParams.update({'font.size': 14, 'font.family': 'Times New Roman'})

# ----------------------
# Load Datasets
# ----------------------
# read_csv takes a single path, so the three limited-power fault files are
# read separately and concatenated into one frame; adjust the paths to
# wherever the CSVs live.
df1 = pd.concat([pd.read_csv(f) for f in [r'/F0L.csv', r'/F1L.csv', r'/F5L.csv']],
                ignore_index=True)
df2 = pd.read_csv(r'/F0M.csv')  # medium-power data; loaded but not used below

# ----------------------
# Data Visualization: Fault Distribution
# ----------------------
plt.figure(figsize=(5, 5))
label_counts = df1['label'].value_counts()
plt.pie(label_counts, labels=label_counts.index, autopct='%2.1f%%',
        colors=sns.color_palette("pastel"))
plt.title("Fault Distribution in Limited Power Dataset")
plt.show()

# ----------------------
# Feature Correlation Heatmap
# ----------------------
X = df1.iloc[:, 1:-1] # Features
Y = df1.iloc[:, -1] # Target Labels

plt.figure(figsize=(12, 10))
sns.heatmap(X.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature Correlation Heatmap")
plt.show()

# ----------------------
# Data Preprocessing
# ----------------------
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
shuffle=True, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
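
# Note: the scaler is fit on the training split only, which avoids test-set
# leakage. An equivalent, more compact pattern (a sketch, not part of the
# original workflow) wraps the scaler and a classifier in a Pipeline so the
# same guarantee holds inside cross-validation as well:
#
#     from sklearn.pipeline import make_pipeline
#     pipe = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
#     pipe.fit(X_train, y_train)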

# ----------------------
# PCA: Dimensionality Reduction
# ----------------------
for i in range(1, 14):
    pca = PCA(n_components=i)
    pca.fit(X_train_scaled)
    print(f"Cumulative explained variance for {i} components: "
          f"{np.sum(pca.explained_variance_ratio_):.4f}")
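
# Alternatively (a sketch, not in the original script), scikit-learn's PCA
# accepts a float for n_components and keeps just enough components to
# reach that fraction of explained variance:
#
#     pca_95 = PCA(n_components=0.95)
#     X_reduced = pca_95.fit_transform(X_train_scaled)
#     print(pca_95.n_components_)  # components needed for 95% variance
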
# Apply PCA with 2 components
pca_2 = PCA(n_components=2)
X_pca_2D = pca_2.fit_transform(X_train_scaled)
principalDf = pd.DataFrame(data=X_pca_2D, columns=['PC1', 'PC2'])
principalDf['Fault'] = np.array(y_train)

plt.figure(figsize=(10, 8))
sns.scatterplot(x=principalDf['PC1'], y=principalDf['PC2'],
hue=principalDf['Fault'], palette='Dark2', alpha=0.7)
plt.title("2D PCA Visualization")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.show()

# 3D PCA Visualization
pca_3 = PCA(n_components=3)
X_pca_3D = pca_3.fit_transform(X_train_scaled)
principalDf = pd.DataFrame(data=X_pca_3D, columns=['PC1', 'PC2', 'PC3'])
principalDf['Fault'] = np.array(y_train)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
colors = sns.color_palette("husl", len(y_train.unique()))
for fault, color in zip(y_train.unique(), colors):
    indices = principalDf['Fault'] == fault
    ax.scatter(principalDf.loc[indices, 'PC1'],
               principalDf.loc[indices, 'PC2'],
               principalDf.loc[indices, 'PC3'],
               c=[color], label=fault, s=50)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.set_title('3D PCA Visualization')
ax.legend()
plt.show()

# ----------------------
# Train & Evaluate Models
# ----------------------
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)

svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_model.fit(X_train_scaled, y_train)
y_pred_svm = svm_model.predict(X_test_scaled)

# ----------------------
# Confusion Matrix Plot
# ----------------------
def plot_confusion_matrix(y_true, y_pred, model_name):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="viridis",
                xticklabels=np.unique(y_true), yticklabels=np.unique(y_true))
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(f"Confusion Matrix - {model_name}")
    plt.show()

plot_confusion_matrix(y_test, y_pred_rf, "Random Forest")
plot_confusion_matrix(y_test, y_pred_svm, "SVM")
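
# A built-in alternative to the helper above is scikit-learn's own display
# class (a sketch; the seaborn heatmap is what this script actually uses):
#
#     from sklearn.metrics import ConfusionMatrixDisplay
#     ConfusionMatrixDisplay.from_predictions(y_test, y_pred_rf)
#     plt.show()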

# ----------------------
# Final Model Evaluations
# ----------------------
print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(classification_report(y_test, y_pred_rf))

print("\nSVM Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}")
print(classification_report(y_test, y_pred_svm))
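
# ----------------------
# Cross-Validation (sketch)
# ----------------------
# cross_val_score and ShuffleSplit are imported above but not exercised in
# the original run; this is a minimal sketch of how they could give a less
# split-dependent accuracy estimate for the Random Forest (which does not
# require feature scaling, so raw X is used here):
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
scores = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=42),
                         X, Y, cv=cv)
print(f"\nRandom Forest shuffle-split accuracy (5 splits): "
      f"{scores.mean():.4f} +/- {scores.std():.4f}")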
