20mis1025 Lab1
20mis1025 Lab1
ipynb - Colaboratory
#data preprocessing
#importing the libraries
import os
from pathlib import Path
import pandas as pd
# Load the training CSV from /content into a DataFrame.
df = pd.read_csv("/content/KDD_Train.csv")
import warnings
# NOTE(review): this silences every warning from here on, including library
# deprecation warnings that would otherwise flag calls needing an update.
warnings.filterwarnings('ignore')
# Notebook echo of the frame's (rows, columns) tuple.
df.shape
(125973, 42)
print(df.shape)
df.head(10)
(125973, 42)
duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot .
2 0 tcp private S0 0 0 0 0 0 0
6 0 tcp private S0 0 0 0 0 0 0
7 0 tcp private S0 0 0 0 0 0 0
8 0 tcp remote_job S0 0 0 0 0 0 0
9 0 tcp private S0 0 0 0 0 0 0
10 rows × 42 columns
# DATA PREPROCESSING
# Frame-wide, in-place value replacement: 'normal' -> 0 and 'anomaly' -> 1.
df.replace({'normal': 0, 'anomaly': 1}, inplace=True)
df.head(10)
duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot .
2 0 tcp private S0 0 0 0 0 0 0
6 0 tcp private S0 0 0 0 0 0 0
7 0 tcp private S0 0 0 0 0 0 0
8 0 tcp remote_job S0 0 0 0 0 0 0
9 0 tcp private S0 0 0 0 0 0 0
10 rows × 42 columns
# CATEGORICAL FEATURES
# Print the number of distinct values held by each object-dtype column.
for column_name in df.columns:
    column = df[column_name]
    if column.dtypes != 'object':
        continue  # only object-dtype columns are reported
    n_unique = len(column.unique())
    print(column_name + " has " + str(n_unique) + " unique values. ")
#CONVERT CATEGORICAL DATA INTO BINARY VARIABLES BY ONE HOT ENCODING
df['protocol_type'].head(5)
0 tcp
1 udp
2 tcp
3 tcp
4 tcp
Name: protocol_type, dtype: object
df['protocol_type'].value_counts()
tcp 102689
udp 14993
icmp 8291
Name: protocol_type, dtype: int64
print(pd.get_dummies(df['protocol_type']).head(5))
def dummy_df(df, todummy_list=None):
    """Return *df* with the listed categorical columns one-hot encoded.

    Each column named in *todummy_list* is removed and replaced by one
    indicator column per distinct value, appended on the right in the
    order the columns are listed. Indicator columns are named after the
    raw category values (no prefix is added), and NaN does not get an
    indicator column of its own.

    Args:
        df: Input DataFrame; it is not modified in place.
        todummy_list: Names of the columns to encode. Defaults to
            ``['protocol_type', 'service', 'flag']``.

    Returns:
        A DataFrame with the listed columns replaced by their indicators.

    Raises:
        KeyError: If a listed column is not present in *df*.
    """
    if todummy_list is None:
        todummy_list = ['protocol_type', 'service', 'flag']
    for x in todummy_list:
        # dummy_na=False: NaNs are ignored rather than given their own column.
        dummies = pd.get_dummies(df[x], dummy_na=False)
        # Drop the source column by keyword: the positional axis form
        # ``df.drop(x, 1)`` was removed in pandas 2.0 and raises TypeError.
        df = df.drop(columns=x)
        # Append the indicator columns along the column axis.
        df = pd.concat([df, dummies], axis=1)
    return df
#Applying the one-hot encoding function
df = dummy_df(df)
df.head(5)
duration src_bytes dst_bytes land wrong_fragment urgent hot num_failed_logins logged_in num_
0 0 491 0 0 0 0 0 0 0
1 0 146 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0
3 0 232 8153 0 0 0 0 0 1
4 0 199 420 0 0 0 0 0 1
# Checking how much of my data is missing?
df.isnull().sum().sort_values(ascending=False).head()
duration 0
red_i 0
printer 0
pop_3 0
pop_2 0
dtype: int64
# Fill missing values with each column's median using sklearn's SimpleImputer
import numpy as np
from sklearn.impute import SimpleImputer
imr = SimpleImputer(missing_values=np.nan, strategy='median')
# fit_transform learns the medians and fills the NaNs in a single call;
# the resulting array is wrapped back into a frame with the same column names.
df = pd.DataFrame(data=imr.fit_transform(df), columns=df.columns)
df.isnull().sum().sort_values(ascending=False).head()
duration 0
red_i 0
printer 0
pop_3 0
pop_2 0
dtype: int64
# Split the frame into the feature matrix X and the target vector y.
# The target is dropped with ``columns=`` because the positional axis form
# ``df.drop('class', 1)`` was removed in pandas 2.0 and raises TypeError.
X = df.drop(columns='class')  # every column except the target
y = df['class']  # target column
X.shape
(125973, 122)
https://colab.research.google.com/drive/1D5c73elvUIzMA3cDeuy4uoBN2qa9G9aB#scrollTo=CSuMQbcTPcFB&printMode=true 3/3