3/19/24, 11:44 PM lab7.ipynb - Colaboratory
Name Hameed Ullah
211086-A
AI Lab TASK 7
# Importing necessary libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Load the dataset
file_path = "/content/drive/MyDrive/Colab Notebooks/KDDTrain+.txt"
column_names = ["duration", "protocol_type", "attack_type"]
# NOTE(review): only three names are supplied. If each row of the file has
# more than three comma-separated fields, pandas will use the extra leading
# fields as the index and these names will label the LAST three fields --
# confirm against the actual file layout (consider `usecols=`).
data = pd.read_csv(file_path, header=None, names=column_names)

# Preprocessing: keep only the columns used below
selected_columns = ["duration", "protocol_type", "attack_type"]
preprocessed_data = data[selected_columns]

# Check for missing values
print(preprocessed_data.isnull().sum())

# Splitting the dataset into features (x) and target (y)
x = preprocessed_data.drop(columns=["attack_type"])
y = preprocessed_data["attack_type"]

# Splitting the dataset into training and test sets
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)

# Check data types
print(x_train.dtypes)

# Convert 'protocol_type' to numeric using label encoding
label_encoder = LabelEncoder()
x_train['protocol_type_encoded'] = label_encoder.fit_transform(x_train['protocol_type'])

# Extract x and y variables for visualization
x_set = x_train[['duration', 'protocol_type_encoded']].values
y_set = y_train.values

# Check data types after label encoding
print(x_set.dtype)
print(np.isnan(x_set).sum())

# Fitting Decision Tree classifier to the training set
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(x_set, y_set)

# Meshgrid creation: a unit-step grid padded by 1 around both feature ranges
x1_min, x1_max = x_set[:, 0].min() - 1, x_set[:, 0].max() + 1
x2_min, x2_max = x_set[:, 1].min() - 1, x_set[:, 1].max() + 1
x1, x2 = np.meshgrid(
    np.arange(start=x1_min, stop=x1_max, step=1),
    np.arange(start=x2_min, stop=x2_max, step=1),
)

# Built once and shared by the filled contour and the scatter points so both
# use the same colours.
# NOTE(review): only two colours are listed; if the target has more than two
# classes, the later classes will not get distinct colours -- confirm.
region_cmap = ListedColormap(('purple', 'green'))

# Plotting the decision boundary: predict a class for every grid node and
# reshape the flat predictions back onto the grid
grid_points = np.array([x1.ravel(), x2.ravel()]).T
grid_pred = classifier.predict(grid_points).reshape(x1.shape)
plt.figure(figsize=(10, 6))
plt.contourf(x1, x2, grid_pred, alpha=0.75, cmap=region_cmap)

# Plotting the data points, one scatter call per class so each gets a legend
# entry. `color=` (not `c=`) is used because a single RGBA tuple passed as
# `c` is ambiguous with per-point values and makes matplotlib emit a
# UserWarning.
for i, j in enumerate(np.unique(y_set)):
    class_mask = y_set == j
    plt.scatter(
        x_set[class_mask, 0],
        x_set[class_mask, 1],
        color=region_cmap(i),
        label=j,
    )

# Setting plot labels and legend
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
plt.title('Decision Tree Algorithm (Training set)')
plt.xlabel('duration')
plt.ylabel('protocol_type_encoded')
plt.legend()

# Show plot
plt.show()
duration 0
protocol_type 0
attack_type 0
dtype: int64
duration float64
protocol_type object
dtype: object
float64
0
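<ipython-input-25-49c950067830>:61: UserWarning: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.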
plt.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
# Importing necessary libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Load the dataset
file_path = "/content/drive/MyDrive/Colab Notebooks/KDDTest+.txt"
column_names = ["duration", "protocol_type", "attack_type"]
# NOTE(review): only three names are supplied. If each row of the file has
# more than three comma-separated fields, pandas will use the extra leading
# fields as the index and these names will label the LAST three fields --
# confirm against the actual file layout (consider `usecols=`).
data = pd.read_csv(file_path, header=None, names=column_names)

# Preprocessing: keep only the columns used below
selected_columns = ["duration", "protocol_type", "attack_type"]
preprocessed_data = data[selected_columns]

# Check for missing values
print(preprocessed_data.isnull().sum())

# Splitting the dataset into features (x) and target (y)
x = preprocessed_data.drop(columns=["attack_type"])
y = preprocessed_data["attack_type"]

# Splitting the dataset into training and test sets
# NOTE(review): this cell fits a fresh tree on a 75% split of KDDTest+ and
# plots that same split; it does not evaluate a classifier fitted on
# KDDTrain+, and x_test / y_test are never used -- confirm this is what the
# 'testing set' plot is meant to show.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)

# Check data types
print(x_train.dtypes)

# Convert 'protocol_type' to numeric using label encoding
label_encoder = LabelEncoder()
x_train['protocol_type_encoded'] = label_encoder.fit_transform(x_train['protocol_type'])

# Extract x and y variables for visualization
x_set = x_train[['duration', 'protocol_type_encoded']].values
y_set = y_train.values

# Check data types after label encoding
print(x_set.dtype)
print(np.isnan(x_set).sum())

# Fitting Decision Tree classifier to the training set
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(x_set, y_set)

# Meshgrid creation: a unit-step grid padded by 1 around both feature ranges
x1_min, x1_max = x_set[:, 0].min() - 1, x_set[:, 0].max() + 1
x2_min, x2_max = x_set[:, 1].min() - 1, x_set[:, 1].max() + 1
x1, x2 = np.meshgrid(
    np.arange(start=x1_min, stop=x1_max, step=1),
    np.arange(start=x2_min, stop=x2_max, step=1),
)

# Built once and shared by the filled contour and the scatter points so both
# use the same colours.
# NOTE(review): only two colours are listed; if the target has more than two
# classes, the later classes will not get distinct colours -- confirm.
region_cmap = ListedColormap(('purple', 'green'))

# Plotting the decision boundary: predict a class for every grid node and
# reshape the flat predictions back onto the grid
grid_points = np.array([x1.ravel(), x2.ravel()]).T
grid_pred = classifier.predict(grid_points).reshape(x1.shape)
plt.figure(figsize=(10, 6))
plt.contourf(x1, x2, grid_pred, alpha=0.75, cmap=region_cmap)

# Plotting the data points, one scatter call per class so each gets a legend
# entry. `color=` (not `c=`) is used because a single RGBA tuple passed as
# `c` is ambiguous with per-point values and makes matplotlib emit a
# UserWarning.
for i, j in enumerate(np.unique(y_set)):
    class_mask = y_set == j
    plt.scatter(
        x_set[class_mask, 0],
        x_set[class_mask, 1],
        color=region_cmap(i),
        label=j,
    )

# Setting plot labels and legend
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
plt.title('Decision Tree Algorithm (testing set)')
plt.xlabel('duration')
plt.ylabel('protocol_type_encoded')
plt.legend()

# Show plot
plt.show()
https://colab.research.google.com/drive/1QkVj8I_GkxQLWkwssl4-iFD1oQmMIZ9Y#scrollTo=51BptnM6_KHL&printMode=true 3/4
3/19/24, 11:44 PM lab7.ipynb - Colaboratory
duration 0
protocol_type 0
attack_type 0
dtype: int64
duration float64
protocol_type object
dtype: object
float64
0
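<ipython-input-26-72b12f93fe69>:61: UserWarning: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.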
plt.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
https://colab.research.google.com/drive/1QkVj8I_GkxQLWkwssl4-iFD1oQmMIZ9Y#scrollTo=51BptnM6_KHL&printMode=true 4/4