0% found this document useful (0 votes)
11 views12 pages

Fyp 4

The document outlines a process for loading, preprocessing, and classifying network traffic data using ensemble machine learning techniques. It includes steps for handling missing values, defining attack classifications, training a VotingClassifier model, and evaluating its performance with accuracy and classification reports. Finally, it saves the model and labeled datasets for future use.

Uploaded by

Nife Ali
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
11 views12 pages

Fyp 4

The document outlines a process for loading, preprocessing, and classifying network traffic data using ensemble machine learning techniques. It includes steps for handling missing values, defining attack classifications, training a VotingClassifier model, and evaluating its performance with accuracy and classification reports. Finally, it saves the model and labeled datasets for future use.

Uploaded by

Nife Ali
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 12

import os

import pandas as pd

# Paths to your train and test folders


# Input locations, relative to the current working directory.
# load_csv_files() reads every entry directly inside each folder whose name
# ends in '.csv' (os.listdir is not recursive).
train_folder = 'csv/train'
test_folder = 'csv/test'

# Function to read and combine CSV files


def load_csv_files(folder_path):
    """Read every ``.csv`` file directly inside *folder_path* into one DataFrame.

    Files are read in sorted name order so that the row order of the result
    is reproducible; ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A single DataFrame with the rows of all files stacked on top of each
        other and the index renumbered from 0.

    Raises:
        ValueError: If the folder contains no ``.csv`` files.
    """
    csv_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.csv')
    )
    if not csv_names:
        # pd.concat([]) would also raise ValueError, but with a message that
        # does not say which folder was empty.
        raise ValueError(f"No .csv files found in {folder_path!r}")

    frames = [
        pd.read_csv(os.path.join(folder_path, name)) for name in csv_names
    ]
    return pd.concat(frames, ignore_index=True)

# Load train and test data


# Each call stacks all CSV files of one folder into a single DataFrame.
train_data = load_csv_files(train_folder)
test_data = load_csv_files(test_folder)

# .shape is (rows, columns); printed as a quick sanity check of the load.
print(f"Train Data Shape: {train_data.shape}")


print(f"Test Data Shape: {test_data.shape}")

Train Data Shape: (7550787, 45)


Test Data Shape: (1614182, 45)

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

def preprocess_data(data):
    """Drop the ``drate`` column and fill missing numeric values with column means.

    The input frame is never modified; a new DataFrame is returned.

    Args:
        data: DataFrame of traffic records.

    Returns:
        A DataFrame without the ``drate`` column in which every NaN in a
        ``float64``/``int64`` column is replaced by that column's mean,
        computed from *data* itself. Columns of other dtypes are returned
        unchanged. A numeric column that is entirely NaN has no mean and is
        left as NaN.
    """
    # Step 1: Drop the 'drate' column if it exists.
    # drop() always returns a new frame, so the assignments below cannot
    # write through to the caller's DataFrame.
    # NOTE(review): the column listing printed further down this file spells
    # the name 'Drate'; confirm which spelling the CSVs actually use.
    data = data.drop(columns=['drate'], errors='ignore')

    # Step 2: Handle missing values by replacing with the mean.
    numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_cols) > 0:
        # Plain pandas is used instead of SimpleImputer: the imputer removes
        # all-NaN columns from its output, which makes assigning the result
        # back to the original columns fail with a shape mismatch.
        column_means = data[numeric_cols].mean()
        data[numeric_cols] = data[numeric_cols].fillna(column_means)

    return data

# Apply preprocessing to train and test datasets

# NOTE(review): each call derives its fill values from the frame it is given,
# so the test set is imputed with test-set means, not the training means.
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Define classification rules for attack types


def classify_attack(row):
    """Assign a numeric attack class to one traffic record using fixed thresholds.

    Rules are checked from top to bottom and the first match wins:

        1  DDoS           Rate > 10000 and Srate > 5000
        2  DoS            Rate > 5000 and Srate > 2000
        3  Recon          Protocol Type is 3, 5 or 6
        4  Spoofing       ARP > 0 and Rate < 1000
        5  MQTT Specific  Protocol Type == 10 and Rate > 3000
        0  Normal         none of the above

    The thresholds are example values (adjust according to your data
    analysis).

    Args:
        row: Record indexable by column name (for example a DataFrame row)
            providing 'Rate', 'Srate', 'Protocol Type' and 'ARP'.

    Returns:
        The integer class code from the table above.
    """
    # DDoS: high Rate and Srate.
    if row['Rate'] > 10000 and row['Srate'] > 5000:
        return 1

    # DoS: lower than the DDoS rates but still high.
    elif row['Rate'] > 5000 and row['Srate'] > 2000:
        return 2

    # Recon: based on specific Protocol Type values (example numeric values).
    elif row['Protocol Type'] in [3, 5, 6]:
        return 3

    # Spoofing: low rate with ARP traffic present.
    elif row['ARP'] > 0 and row['Rate'] < 1000:
        return 4

    # MQTT-specific attacks (example numeric protocol value).
    elif row['Protocol Type'] == 10 and row['Rate'] > 3000:
        return 5

    # Normal traffic.
    else:
        return 0

# Apply classification to train and test datasets


# axis=1 passes one row at a time to classify_attack, i.e. one Python call per
# record; with the ~7.5 million training rows reported above this is slow.
train_data['attack_class'] = train_data.apply(classify_attack, axis=1)

import os
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    VotingClassifier,
)
from sklearn.metrics import classification_report, accuracy_score

# Features and target variable.
# The four feature columns are exactly the ones classify_attack() reads when
# it generates the 'attack_class' labels.
X_train = train_data[['Srate', 'Rate', 'Protocol Type', 'ARP']]  # Main features
y_train = train_data['attack_class']

X_test = test_data[['Srate', 'Rate', 'Protocol Type', 'ARP']]

# Ensemble learning: VotingClassifier with RandomForest and GradientBoosting.
# random_state is fixed on both base models so repeated runs give the same fit.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

ensemble_model = VotingClassifier(
    estimators=[('rf', rf_model), ('gb', gb_model)],
    voting='soft',
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

VotingClassifier(estimators=[('rf',
RandomForestClassifier(random_state=42)),
('gb',

GradientBoostingClassifier(random_state=42))],
voting='soft')

# Save the Model


import pickle

# Serialize the fitted ensemble to disk; the with-block closes the file even
# if pickling fails part-way.
with open("ensemble_model.pkl", "wb") as file:
    pickle.dump(ensemble_model, file)

print("Model saved as ensemble_model.pkl")

Model saved as ensemble_model.pkl

import joblib

# Save the trained ensemble model


# Writes the same fitted ensemble_model a second time, this time via joblib
# (a pickle copy is already written to ensemble_model.pkl above).
joblib.dump(ensemble_model, 'ensemble_model.joblib')

print("Model saved as ensemble_model.joblib")

Model saved as ensemble_model.joblib

# Label the test set with the same rule-based function used for training.
# NOTE(review): both the training labels and these "true" test labels are
# produced by classify_attack() from Srate/Rate/Protocol Type/ARP, which are
# also the model's only input features. The scores below therefore measure how
# well the ensemble reproduces those rules, not how well it detects
# independently labeled attacks.
test_data['attack_class'] = test_data.apply(classify_attack, axis=1)

# Predict on the test set
y_test_pred = ensemble_model.predict(X_test)

# Save predictions back to test data
test_data['predicted_attack_class'] = y_test_pred

# Save the test dataset with predictions
test_data.to_csv('test_dataset_predictions_ensemble.csv', index=False)

# Evaluate the model (optional if labeled test data is available)
if 'attack_class' in test_data.columns:
    print("Classification Report:")
    print(classification_report(test_data['attack_class'], y_test_pred))
    print("Accuracy Score:",
          accuracy_score(test_data['attack_class'], y_test_pred))

Classification Report:
precision recall f1-score support

0 1.00 1.00 1.00 379103


1 1.00 1.00 1.00 636984
2 1.00 1.00 1.00 13518
3 1.00 1.00 1.00 577939
4 1.00 1.00 1.00 6638

accuracy 1.00 1614182


macro avg 1.00 1.00 1.00 1614182
weighted avg 1.00 1.00 1.00 1614182

Accuracy Score: 1.0

import matplotlib.pyplot as plt


from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Check if 'attack_class' column exists for evaluation


if 'attack_class' in test_data.columns:
    # Build one sorted list of every class that occurs in either the true or
    # the predicted labels and hand it to BOTH the matrix and the display.
    # Series.unique() returns values in order of first appearance, which does
    # not generally match the sorted order confusion_matrix uses by default,
    # so using it for tick labels can attach the wrong name to a row/column.
    labels = sorted(
        set(test_data['attack_class'].unique()).union(pd.unique(y_test_pred))
    )

    # Generate the confusion matrix
    cm = confusion_matrix(test_data['attack_class'], y_test_pred, labels=labels)

    # Display the confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.show()
else:
    print("No true labels ('attack_class') found in test data. "
          "Confusion matrix cannot be generated.")
# Identify columns with NaN in the correlation matrix.
# NOTE(review): `correlation_matrix` is not defined anywhere in the visible
# code; it presumably comes from a cell that is not shown - define it before
# running this.
# .isna().any() is True for each column that has at least one NaN entry.
nan_columns = correlation_matrix.columns[correlation_matrix.isna().any()]
print("Columns with NaN values in correlation matrix:\n", nan_columns)

Columns with NaN values in correlation matrix:


Index(['Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate',
'Drate',
'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
'psh_flag_number', 'ack_flag_number', 'ece_flag_number',
'cwr_flag_number', 'ack_count', 'syn_count', 'fin_count',
'rst_count',
'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
'UDP',
'DHCP', 'ARP', 'ICMP', 'IGMP', 'IPv', 'LLC', 'Tot sum', 'Min',
'Max',
'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
'Radius',
'Covariance', 'Variance', 'Weight', 'dataset'],
dtype='object')

import matplotlib.pyplot as plt


# Plot feature importances
# NOTE(review): `rf_model_weighted` is not defined anywhere in the visible
# code (the forest trained above is named `rf_model`); confirm which fitted
# model is meant here.
# Horizontal bars: one bar per feature name, bar length = its importance.
plt.figure(figsize=(10, 6))
plt.barh(rf_model_weighted.feature_names_in_,
rf_model_weighted.feature_importances_)
plt.xlabel("Feature Importance")
plt.ylabel("Features")
plt.title("Feature Importance Analysis")
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

# Define attack type mapping for labels


# Numeric class code -> display name.
# NOTE: classify_attack() can also return 5 (MQTT Specific); that code has no
# entry here, so map_attack_labels() reports it as "Unknown".
attack_type_mapping = {
    0: "Normal",
    1: "DDoS",
    2: "DoS",
    3: "Recon",
    4: "Spoofing",
}


# Function to map numeric predictions to attack type names with error handling
def map_attack_labels(predictions):
    """Translate numeric class codes into attack-type names.

    Args:
        predictions: Iterable of class codes.

    Returns:
        A list with one name per input code, in the same order. Codes that
        are not keys of ``attack_type_mapping`` become ``"Unknown"``.
    """
    return [attack_type_mapping.get(label, "Unknown") for label in predictions]

# Check if 'attack_class' exists for evaluation


if 'attack_class' in test_data.columns:
    # Classification Report
    # `labels` fixes which classes get a row and in which order;
    # `target_names` supplies the matching display names, so the two lists
    # must have the same length and order.
    report = classification_report(
        test_data['attack_class'],
        y_test_pred,
        labels=[0, 1, 2, 3, 4],
        target_names=list(attack_type_mapping.values()),
        output_dict=True,
    )

    # Convert the classification report to a DataFrame for better
    # visualization: after transpose() each class/average is a row and each
    # metric is a column.
    report_df = pd.DataFrame(report).transpose()
    print("Classification Report:")
    print(report_df)

    # Plot the metrics (Precision, Recall, F1-Score) for the four attack
    # classes only; the "Normal" row and the averages are not plotted.
    metrics = report_df.loc[
        ["DDoS", "DoS", "Recon", "Spoofing"],
        ["precision", "recall", "f1-score"],
    ]
    metrics.plot(kind="bar", figsize=(10, 6))
    plt.title("Classification Metrics by Attack Type")
    plt.xlabel("Attack Type")
    plt.ylabel("Score")
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.show()

Classification Report:
precision recall f1-score support
Normal 1.0 1.0 1.0 379103.0
DDoS 1.0 1.0 1.0 636984.0
DoS 1.0 1.0 1.0 13518.0
Recon 1.0 1.0 1.0 577939.0
Spoofing 1.0 1.0 1.0 6638.0
accuracy 1.0 1.0 1.0 1.0
macro avg 1.0 1.0 1.0 1614182.0
weighted avg 1.0 1.0 1.0 1614182.0
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
)
import matplotlib.pyplot as plt
import seaborn as sns

# Predict on the test set
y_test_pred = ensemble_model.predict(X_test)

# If ground truth labels are available
if 'attack_class' in test_data.columns:
    y_true = test_data['attack_class']

    # Fix the class order once and pass it to every metric call so that the
    # names always line up with the rows/columns they describe. Classes 0-4
    # are always included; any other code that actually occurs in the true or
    # predicted labels is appended and shown as "Class <code>". Without an
    # explicit `labels` list, five hard-coded names only fit when exactly the
    # classes 0-4 are present.
    known_names = {0: "Normal", 1: "DDoS", 2: "DoS", 3: "Recon", 4: "Spoofing"}
    present = set(y_true.unique()).union(pd.unique(y_test_pred))
    class_ids = sorted(set(known_names).union(present))
    class_names = [
        known_names.get(class_id, f"Class {class_id}") for class_id in class_ids
    ]

    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_test_pred)
    print(f"Accuracy: {accuracy:.2f}")

    # Generate and display a classification report
    class_report = classification_report(
        y_true, y_test_pred, labels=class_ids, target_names=class_names
    )
    print("Classification Report:")
    print(class_report)

    # Confusion Matrix (rows = actual class, columns = predicted class)
    cm = confusion_matrix(y_true, y_test_pred, labels=class_ids)
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()
else:
    print("Ground truth labels are not available in the test dataset.")

# Save predictions in the test dataset


test_data['predicted_attack_class'] = y_test_pred

# Map predictions to their corresponding attack names
attack_type_mapping = {
    0: "Normal",
    1: "DDoS",
    2: "DoS",
    3: "Recon",
    4: "Spoofing",
}
# Series.map() yields NaN for codes missing from the dict (classify_attack()
# can emit 5), and value_counts() below would silently drop those rows from
# the chart. Name them "Unknown" instead so they stay visible.
test_data['predicted_attack_name'] = (
    test_data['predicted_attack_class'].map(attack_type_mapping).fillna("Unknown")
)

# Display a sample of predictions
print("Sample of Predictions:")
print(test_data[['predicted_attack_class', 'predicted_attack_name']].head())

# Visualization of predictions: one bar per predicted attack name.
predictions_count = test_data['predicted_attack_name'].value_counts()
plt.figure(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn (see the FutureWarning
# captured in this file's output); assigning the x variable to `hue` with
# legend=False keeps the same per-bar colouring.
sns.barplot(
    x=predictions_count.index,
    y=predictions_count.values,
    hue=predictions_count.index,
    palette="viridis",
    legend=False,
)
plt.title("Distribution of Predicted Attack Classes")
plt.xlabel("Attack Type")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.show()

Accuracy: 1.00
Classification Report:
precision recall f1-score support

Normal 1.00 1.00 1.00 379103


DDoS 1.00 1.00 1.00 636984
DoS 1.00 1.00 1.00 13518
Recon 1.00 1.00 1.00 577939
Spoofing 1.00 1.00 1.00 6638
accuracy 1.00 1614182
macro avg 1.00 1.00 1.00 1614182
weighted avg 1.00 1.00 1.00 1614182

Sample of Predictions:
predicted_attack_class predicted_attack_name
0 0 Normal
1 0 Normal
2 1 DDoS
3 1 DDoS
4 0 Normal

C:\Users\Students\AppData\Local\Temp\7\
ipykernel_32976\1383133409.py:54: FutureWarning:

Passing `palette` without assigning `hue` is deprecated and will be


removed in v0.14.0. Assign the `x` variable to `hue` and set
`legend=False` for the same effect.

sns.barplot(x=predictions_count.index, y=predictions_count.values,
palette="viridis")

import os

# Paths to save labeled data


# Destination directory, relative to the current working directory.
# exist_ok=True makes re-running this cell safe when the folder already exists.
output_folder = 'labeled_data'
os.makedirs(output_folder, exist_ok=True)

# Save labeled train data
# (train_data now carries the rule-based 'attack_class' column.)


train_output_path = os.path.join(output_folder,
'labeled_train_data.csv')
train_data.to_csv(train_output_path, index=False)
print(f"Labeled train data saved to {train_output_path}")

# Save labeled test data
# (test_data carries 'attack_class' plus the prediction columns added above.)


test_output_path = os.path.join(output_folder,
'labeled_test_data.csv')
test_data.to_csv(test_output_path, index=False)
print(f"Labeled test data saved to {test_output_path}")
Labeled train data saved to labeled_data\labeled_train_data.csv
Labeled test data saved to labeled_data\labeled_test_data.csv

You might also like