# Implementing KNN Algorithm on the Iris Dataset
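Everything that follows assumes a setup cell along these lines; this is a sketch, and the exact imports are an assumption reconstructed from the calls used later:

```python
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris

iris = load_iris()  ## the Iris dataset ships with scikit-learn
iris_df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                       columns=iris['feature_names'] + ['target'])
iris_df.head()
```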
```
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  target
0                5.1               3.5                1.4               0.2     0.0
1                4.9               3.0                1.4               0.2     0.0
2                4.7               3.2                1.3               0.2     0.0
3                4.6               3.1                1.5               0.2     0.0
4                5.0               3.6                1.4               0.2     0.0
```
Next, split the DataFrame into features and labels:

```python
x = iris_df.iloc[:, :-1]  ## every column except the last holds a feature
y = iris_df.iloc[:, -1]   ## the last column holds the class label
x.head()
y.head()
```
```
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: target, dtype: float64
```
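Before converting to NumPy arrays, the data is split into training and test sets. Here is a minimal sketch assuming sklearn's train_test_split; the 0.2 test fraction matches the 30 test predictions printed later, but the random seed is an assumption:

```python
from sklearn.model_selection import train_test_split

## 150 samples -> 120 for training, 30 for testing (random_state is an assumed value)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
```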
```python
x_test = np.asarray(x_test)  ## work with NumPy arrays from here on
y_test = np.asarray(y_test)
print(f'training set size: {x_train.shape[0]} samples \ntest set size: {x_test.shape[0]} samples')
```
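The `normalized_x_train` and `normalized_x_test` arrays used below come from a normalization step; here is a minimal sketch assuming min-max scaling fit on the training set only (the article's exact scaler is an assumption):

```python
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()                             ## assumption: min-max normalization
normalized_x_train = scaler.fit_transform(x_train)  ## fit the ranges on the training data only
normalized_x_test = scaler.transform(x_test)        ## apply the same ranges to the test data
```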
## After Normalization
```python
iris_df_2 = pd.DataFrame(data=np.c_[normalized_x_train, y_train],
                         columns=iris['feature_names'] + ['target'])
di = {0.0: 'Setosa', 1.0: 'Versicolor', 2.0: 'Virginica'}  ## map numeric labels to species names
after = sns.pairplot(iris_df_2.replace({'target': di}), hue='target')
after.fig.suptitle('Pair Plot of the dataset After normalization', y=1.08)
```
Output: the pair plot of the dataset after normalization, with points colored by species.
Step 1: compute the Euclidean distance between the test point and each point in the training data.

```python
def distance_ecu(x_train, x_test_point):
    """
    Input:
    -x_train: the full training dataset
    -x_test_point: the single test point
    Output:
    -distances: The distances between the test point and each point in the training data.
    """
    distances = []  ## create empty list called distances
    for row in range(len(x_train)):  ## Loop over the rows of x_train
        current_train_point = x_train[row]  ## Get them point by point
        current_distance = 0  ## initialize the distance by zero
        for col in range(len(current_train_point)):  ## sum the squared feature differences
            current_distance += (current_train_point[col] - x_test_point[col]) ** 2
        distances.append(np.sqrt(current_distance))  ## the Euclidean distance is the root of that sum
    ## Return the distances as a DataFrame so sorting later keeps the training-row indices
    return pd.DataFrame(data=distances, columns=['dist'])
```
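Each of these distances is the Euclidean distance d(a, b) = sqrt(sum_i (a_i - b_i)^2). The explicit loops keep that formula visible; an equivalent vectorized NumPy version (an alternative sketch, not the article's code) would be:

```python
def distance_ecu_vectorized(x_train, x_test_point):
    ## Broadcast the test point against every training row, then reduce over the feature axis
    dists = np.sqrt(((x_train - x_test_point) ** 2).sum(axis=1))
    return pd.DataFrame(data=dists, columns=['dist'])
```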
Step 2: keep the K training points closest to the test point.

```python
def nearest_neighbors(distance_point, K):
    """
    Input:
    -distance_point: the distances between the test point and the training data.
    -K: the number of neighbors
    Output:
    -df_nearest: the nearest K neighbors between the test point and the training data.
    """
    ## Sort the distances ascending and keep the first K rows;
    ## the DataFrame index identifies the matching training points
    df_nearest = distance_point.sort_values(by=['dist'], axis=0)[:K]
    return df_nearest
```
Step 3: predict the class by majority vote among those K neighbors.

```python
from collections import Counter

def voting(df_nearest, y_train):
    """
    Input:
    -df_nearest: the nearest K neighbors between the test point and the training data.
    -y_train: the labels of the training dataset
    Output:
    -y_pred: the prediction based on Majority Voting
    """
    ## Use the Counter Object to get the labels with K nearest neighbors.
    counter_vote = Counter(y_train[df_nearest.index])
    y_pred = counter_vote.most_common()[0][0]  ## the most frequent label wins
    return y_pred
```
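To see the voting step in isolation, here is a toy check with made-up neighbor labels:

```python
from collections import Counter

votes = Counter([0.0, 2.0, 2.0])  ## labels of three hypothetical neighbors
print(votes.most_common())        ## [(2.0, 2), (0.0, 1)]
print(votes.most_common()[0][0])  ## 2.0 -> the majority class
```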
"""
Input:
-x_train: the full training dataset
-y_train: the labels of the training dataset
-x_test: the full test dataset
-K: the number of neighbors
Output:
-y_pred: the prediction for the whole test set based on Majority Voting.
"""
y_pred=[]
## Loop over all the test set and perform the three steps
for x_test_point in x_test:
distance_point = distance_ecu(x_train, x_test_point) ## Step 1
df_nearest_point= nearest_neighbors(distance_point, K) ## Step 2
y_pred_point = voting(df_nearest_point, y_train) ## Step 3
y_pred.append(y_pred_point)
return y_pred
```python
K = 3
y_pred_scratch = KNN_from_scratch(normalized_x_train, y_train, normalized_x_test, K)
print(y_pred_scratch)
```
```
[2.0, 1.0, 0.0, 2.0, 0.0, 2.0, 0.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 2.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0]
```
```python
## Compare our implementation with the sklearn library
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=K)
knn.fit(normalized_x_train, y_train)
y_pred_sklearn = knn.predict(normalized_x_test)
print(y_pred_sklearn)
```
```
[2. 1. 0. 2. 0. 2. 0. 1. 1. 1. 2. 1. 1. 1. 1. 0. 1. 2. 0. 0. 2. 1. 0. 0.
 2. 0. 0. 1. 1. 0.]
```
```python
print(np.array_equal(y_pred_sklearn, y_pred_scratch))
```

```
True
```
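Exact agreement with sklearn is a useful sanity check; to score the model against the held-out labels we can also compute the test accuracy (a sketch using sklearn's accuracy_score; the article does not show this output):

```python
from sklearn.metrics import accuracy_score

print(f'scratch accuracy: {accuracy_score(y_test, y_pred_scratch):.3f}')
print(f'sklearn accuracy: {accuracy_score(y_test, y_pred_sklearn):.3f}')
```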
Finally, we choose K by cross-validating on the training set. Inside each fold of that loop, the held-out slice is carved out of the normalized training data and scored with the scratch KNN (a full version of the loop is sketched after the output below):

```python
## Inside the cross-validation loop: build the held-out fold and predict it
normalized_x_test_fold = normalized_x_train[normalized_x_valid_fold_idx]
y_valid_fold = y_train[normalized_x_valid_fold_idx]
y_pred_fold = KNN_from_scratch(normalized_x_train_fold, y_train_fold, normalized_x_test_fold, k)
```

After the loop, each averaged accuracy is paired with its K value:

```python
print(f'The accuracy for each K value was {list(zip(accuracy_k, k_values))}')  ## (accuracy, K) tuples
```
```
The accuracy for each K value was [(0.9666666666666668, 1), (0.9666666666666668, 3), (0.9666666666666668, 5), (0.9666666666666668, 7), (0.958
```
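For completeness, the whole cross-validation loop might look like the following sketch; the KFold setup, the number of splits, and the candidate K values are assumptions:

```python
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

n_splits = 4                      ## assumption: number of folds
k_values = list(range(1, 10, 2))  ## assumption: odd K values, to reduce voting ties
kf = KFold(n_splits=n_splits)
accuracy_k = []

for k in k_values:
    fold_accuracies = []
    for normalized_x_train_fold_idx, normalized_x_valid_fold_idx in kf.split(normalized_x_train):
        normalized_x_train_fold = normalized_x_train[normalized_x_train_fold_idx]
        y_train_fold = y_train[normalized_x_train_fold_idx]
        normalized_x_test_fold = normalized_x_train[normalized_x_valid_fold_idx]
        y_valid_fold = y_train[normalized_x_valid_fold_idx]
        y_pred_fold = KNN_from_scratch(normalized_x_train_fold, y_train_fold, normalized_x_test_fold, k)
        fold_accuracies.append(accuracy_score(y_valid_fold, y_pred_fold))
    accuracy_k.append(sum(fold_accuracies) / n_splits)  ## average accuracy across the folds
```

The K with the highest average accuracy is the natural choice for the final model.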