John Ndungu / Implementing KNN Algorithm on the Iris Dataset

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import datasets
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

from collections import Counter

#EDA on Iris Dataset

We are going to use the very famous Iris dataset.
Attributes:
1. sepal length in cm
2. sepal width in cm
3. petal length in cm
4. petal width in cm
All four features are used to train the classifier; the pair plots below make their pairwise relationships easy to visualize.
Classes:
Iris Setosa
Iris Versicolour
Iris Virginica
#Load the Dataset

# import the iris dataset
iris = datasets.load_iris()

# np.c_ concatenates arrays column-wise, so the features and the target sit side by side
iris_df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                       columns=iris['feature_names'] + ['target'])
iris_df.head()

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  target
0                5.1               3.5                1.4               0.2     0.0
1                4.9               3.0                1.4               0.2     0.0
2                4.7               3.2                1.3               0.2     0.0
3                4.6               3.1                1.5               0.2     0.0
4                5.0               3.6                1.4               0.2     0.0
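
As a quick extra check, not shown in the original run, the class balance can be confirmed directly; Iris is balanced with 50 samples per class:

# count how many samples belong to each class
print(iris_df['target'].value_counts())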

#Describe the Dataset


iris_df.describe()

       sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)      target
count         150.000000        150.000000         150.000000        150.000000  150.000000
mean            5.843333          3.057333           3.758000          1.199333    1.000000
std             0.828066          0.435866           1.765298          0.762238    0.819232
min             4.300000          2.000000           1.000000          0.100000    0.000000
25%             5.100000          2.800000           1.600000          0.300000    0.000000
50%             5.800000          3.000000           4.350000          1.300000    1.000000
75%             6.400000          3.300000           5.100000          1.800000    2.000000
max             7.900000          4.400000           6.900000          2.500000    2.000000

#Split into X and Y

x = iris_df.iloc[:, :-1]  # all feature columns
y = iris_df.iloc[:, -1]   # the target column

x.head()

y.head()

0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
Name: target, dtype: float64

#Split into training and testing

# split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.2,
                                                    shuffle=True,  # shuffle the data to avoid ordering bias
                                                    random_state=0)

x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
x_test = np.asarray(x_test)
y_test = np.asarray(y_test)

print(f'training set size: {x_train.shape[0]} samples \ntest set size: {x_test.shape[0]} samples')

training set size: 120 samples
test set size: 30 samples
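
One optional refinement, not used above: passing stratify=y to train_test_split keeps the class proportions identical in the train and test sets, which matters more for smaller or imbalanced datasets. A sketch with hypothetical variable names, so the split actually used above is left untouched:

# stratified variant of the split above; stratify=y preserves the class ratios
x_tr_s, x_te_s, y_tr_s, y_te_s = train_test_split(x, y,
                                                  test_size=0.2,
                                                  shuffle=True,
                                                  stratify=y,
                                                  random_state=0)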

#Normalize the Dataset

# Note: sklearn's Normalizer rescales each sample (row) to unit norm;
# it is fitted on the training set only, then applied to both sets
scaler = Normalizer().fit(x_train)
normalized_x_train = scaler.transform(x_train)
normalized_x_test = scaler.transform(x_test)

print('x train before Normalization')
print(x_train[0:5])
print('\nx train after Normalization')
print(normalized_x_train[0:5])

x train before Normalization
[[6.4 3.1 5.5 1.8]
 [5.4 3.  4.5 1.5]
 [5.2 3.5 1.5 0.2]
 [6.1 3.  4.9 1.8]
 [6.4 2.8 5.6 2.2]]

x train after Normalization
[[0.69804799 0.338117   0.59988499 0.196326  ]
 [0.69333409 0.38518561 0.57777841 0.1925928 ]
 [0.80641965 0.54278246 0.23262105 0.03101614]
 [0.71171214 0.35002236 0.57170319 0.21001342]
 [0.69417747 0.30370264 0.60740528 0.2386235 ]]
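
Since Normalizer works row-wise, it is not a per-feature standardization. If column-wise scaling were wanted instead, sklearn's StandardScaler would be the usual choice; a minimal sketch, not part of the original pipeline:

from sklearn.preprocessing import StandardScaler

# fit per-feature mean and std on the training set only, then apply to both sets
std_scaler = StandardScaler().fit(x_train)
standardized_x_train = std_scaler.transform(x_train)
standardized_x_test = std_scaler.transform(x_test)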

#Visualize the Dataset before and after Normalization

## Before
# View the relationships between variables; color code by species type
di = {0.0: 'Setosa', 1.0: 'Versicolor', 2.0: 'Virginica'}  # map numeric targets to species names

before = sns.pairplot(iris_df.replace({'target': di}), hue='target')
before.fig.suptitle('Pair Plot of the dataset Before normalization', y=1.08)

## After
iris_df_2 = pd.DataFrame(data=np.c_[normalized_x_train, y_train],
                         columns=iris['feature_names'] + ['target'])
after = sns.pairplot(iris_df_2.replace({'target': di}), hue='target')
after.fig.suptitle('Pair Plot of the dataset After normalization', y=1.08)

Text(0.5, 1.08, 'Pair Plot of the dataset After normalization')


#KNN Step 1 (Euclidean Distance)

def distance_ecu(x_train, x_test_point):
    """
    Input:
      - x_train: the training data
      - x_test_point: the test point

    Output:
      - distances: a dataframe of distances between the test point and
        each point in the training data
    """
    distances = []                             ## list of distances, one per training row
    for row in range(len(x_train)):            ## loop over the rows of x_train
        current_train_point = x_train[row]     ## take the training points one by one
        current_distance = 0                   ## initialize the squared distance to zero

        for col in range(len(current_train_point)):  ## loop over the features of the row
            current_distance += (current_train_point[col] - x_test_point[col]) ** 2

        current_distance = np.sqrt(current_distance)  ## square root of the sum of squared differences
        distances.append(current_distance)

    ## store the distances in a dataframe so the training-row indices are kept
    distances = pd.DataFrame(data=distances, columns=['dist'])
    return distances
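
For reference, the same distances can be computed in one vectorized step with NumPy, which is much faster than the explicit double loop; a minimal sketch (a hypothetical alternative, not used below), assuming x_train is a 2-D array and x_test_point a 1-D array:

def distance_ecu_vectorized(x_train, x_test_point):
    # broadcast the test point against every training row, then take the row-wise norm
    dists = np.linalg.norm(x_train - x_test_point, axis=1)
    return pd.DataFrame(data=dists, columns=['dist'])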

#KNN Step 2 (Find the nearest neighbors)

def nearest_neighbors(distance_point, K):
    """
    Input:
      - distance_point: the distances between the test point and each point
        in the training data
      - K: the number of neighbors

    Output:
      - df_nearest: the K training points nearest to the test point
    """
    ## sort the distances in ascending order
    df_nearest = distance_point.sort_values(by=['dist'], axis=0)

    ## keep only the first K neighbors
    df_nearest = df_nearest[:K]
    return df_nearest
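
A quick check on toy distances (hypothetical values, just to show the sort-and-slice behavior; the original row indices are preserved, which is what the voting step relies on):

toy = pd.DataFrame({'dist': [0.9, 0.1, 0.5]})
print(nearest_neighbors(toy, 2))  # rows with dist 0.1 and 0.5, at indices 1 and 2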

#KNN Step 3 (Classify the point based on a majority vote)

def voting(df_nearest, y_train):
    """
    Input:
      - df_nearest: dataframe containing the K nearest neighbors of the test point
      - y_train: the labels of the training dataset

    Output:
      - y_pred: the prediction based on majority voting
    """
    ## use a Counter over the labels of the K nearest neighbors
    counter_vote = Counter(y_train[df_nearest.index])

    y_pred = counter_vote.most_common()[0][0]  ## the most frequent label wins
    return y_pred
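
One subtlety worth noting: Counter.most_common breaks ties by the order in which labels were first encountered, not by distance, which is one reason odd values of K are preferred. A toy illustration with hypothetical labels:

votes = Counter([1.0, 2.0, 1.0, 2.0])  # a 2-2 tie between two classes
print(votes.most_common()[0][0])       # prints 1.0: the label seen first wins the tie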

#KNN Full Algorithm: Putting Everything Together

def KNN_from_scratch(x_train, y_train, x_test, K):
    """
    Input:
      - x_train: the full training dataset
      - y_train: the labels of the training dataset
      - x_test: the full test dataset
      - K: the number of neighbors

    Output:
      - y_pred: the predictions for the whole test set, based on majority voting
    """
    y_pred = []

    ## loop over every test point and perform the three steps
    for x_test_point in x_test:
        distance_point = distance_ecu(x_train, x_test_point)     ## Step 1
        df_nearest_point = nearest_neighbors(distance_point, K)  ## Step 2
        y_pred_point = voting(df_nearest_point, y_train)         ## Step 3
        y_pred.append(y_pred_point)

    return y_pred
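
The pipeline can also be exercised on a single point; a small sketch using the first normalized test sample, whose label (2.0) matches the first entry of the full run below:

single_pred = KNN_from_scratch(normalized_x_train, y_train, normalized_x_test[:1], 3)
print(single_pred)  # [2.0]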

#Test the KNN Algorithm on the test dataset

K = 3
y_pred_scratch = KNN_from_scratch(normalized_x_train, y_train, normalized_x_test, K)
print(y_pred_scratch)

[2.0, 1.0, 0.0, 2.0, 0.0, 2.0, 0.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 2.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0]
#Compare our implementation with Sklearn library

knn = KNeighborsClassifier(n_neighbors=K)
knn.fit(normalized_x_train, y_train)
y_pred_sklearn = knn.predict(normalized_x_test)
print(y_pred_sklearn)

[2. 1. 0. 2. 0. 2. 0. 1. 1. 1. 2. 1. 1. 1. 1. 0. 1. 2. 0. 0. 2. 1. 0. 0.
2. 0. 0. 1. 1. 0.]

#Check if the output is exactly the same

print(np.array_equal(y_pred_sklearn, y_pred_scratch))

True

#Calculate the accuracy of both methods

print(f'The accuracy of our implementation is {accuracy_score(y_test, y_pred_scratch)}')
print(f'The accuracy of the sklearn implementation is {accuracy_score(y_test, y_pred_sklearn)}')

The accuracy of our implementation is 0.9666666666666667
The accuracy of the sklearn implementation is 0.9666666666666667
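
Beyond a single accuracy number, sklearn can break the results down per class; a quick sketch, not in the original notebook:

from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, y_pred_sklearn))
print(classification_report(y_test, y_pred_sklearn,
                            target_names=['Setosa', 'Versicolor', 'Virginica']))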

#Perform Hyper-parameter Tuning using K-fold Cross Validation

n_splits = 4                       ## choose the number of splits
kf = KFold(n_splits=n_splits)      ## create the K-Fold splitter

accuracy_k = []                    ## keep track of the mean accuracy for each K
k_values = list(range(1, 30, 2))   ## search over odd values of K

for k in k_values:                 ## loop over the K values
    accuracy_fold = 0
    for train_fold_idx, valid_fold_idx in kf.split(normalized_x_train):  ## loop over the splits
        ## fetch the training and validation folds
        normalized_x_train_fold = normalized_x_train[train_fold_idx]
        y_train_fold = y_train[train_fold_idx]
        normalized_x_valid_fold = normalized_x_train[valid_fold_idx]
        y_valid_fold = y_train[valid_fold_idx]

        y_pred_fold = KNN_from_scratch(normalized_x_train_fold, y_train_fold,
                                       normalized_x_valid_fold, k)
        accuracy_fold += accuracy_score(y_valid_fold, y_pred_fold)  ## accumulate the fold accuracy

    accuracy_fold = accuracy_fold / n_splits  ## average over the folds
    accuracy_k.append(accuracy_fold)

print(f'The accuracy for each K value was {list(zip(accuracy_k, k_values))}')  ## pairs each mean accuracy with its K value

The accuracy for each K value was [(0.9666666666666668, 1), (0.9666666666666668, 3), (0.9666666666666668, 5), (0.9666666666666668, 7), (0.958

print(f'Best accuracy was {np.max(accuracy_k)}, which corresponds to a value of K= {k_values[np.argmax(accuracy_k)]}')

Best accuracy was 0.9666666666666668, which corresponds to a value of K= 1
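
For comparison, sklearn's cross_val_score does the same search far more compactly; a sketch assuming the same 4 unshuffled folds as above:

from sklearn.model_selection import cross_val_score

# mean 4-fold accuracy for each candidate K, using sklearn's own KNN
for k in range(1, 30, 2):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             normalized_x_train, y_train, cv=4)
    print(k, scores.mean())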
