Assignment #1: K Nearest Neighbor Classifier
Name: Srikanth Mujjiga (Roll No: 2015-50-831)
Functions implemented (see section 6):
- hammingDistance: number of feature positions where two samples differ
- kfoldvalidation: generator yielding (training, validation) index splits
- knnWithNfolds: mean and standard deviation of prediction accuracy across folds
- knn: predicts the class of one sample from its K nearest neighbours
- plotThem: error-bar plots of mean accuracy against K for 2-5 folds
- test: runs knnWithNfolds for K = 1..5 and folds = 2..5
- test_1, test_2, test_3, test_4: one driver per dataset
Summary of results on the four datasets:

1. Breast Cancer Wisconsin
URL: http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
Features: 9 (all numerical, between 1 and 10)
Instances: 699 (some features missing)
Number of classes: 2
Best mean accuracy: 0.97
Distance function: Euclidean distance
Recommended K value (from observations): 3

2. Iris
URL: http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
Features: 4 (all numerical)
Instances: 150 (no missing features)
Number of classes: 3
Best mean accuracy: 0.92
Distance function: Euclidean distance
Recommended K value (from observations): 2

3. Wine
URL: http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
Features: 13 (all numerical)
Instances: 178 (no missing features)
Number of classes: 3
Best mean accuracy: 0.74
Distance function: Euclidean distance
Recommended K value (from observations): 4

4. Balance Scale
URL: http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
Features: 4 (categorical)
Instances: 625 (no missing features)
Number of classes: 3
Best mean accuracy: 0.78
Distance function: Euclidean distance and Hamming distance
Recommended K value (from observations): 2
5. Observations
Since the features of the balance-scale dataset are categorical, I initially used Hamming distance. However, Hamming distance returned a poor mean accuracy; running KNN with Euclidean distance returned much better accuracy.
6. Code
"""
Created on Tue Aug 18 21:30:26 2015
@author: Srikanth Mujjiga ([email protected])
"""
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Working directory containing the 'datasets' folder
os.chdir(r'G:\smujjiga\SM in AI\sminai')
def euclidianDistance(x1, x2):
    return np.linalg.norm(x1 - x2)

# Closer points have a distance near 0; identical vectors give 0
def hammingDistance(x1, x2):
    return sum([0 if a == b else 1 for a, b in zip(x1, x2)])
# n: observations, d: features
# trainX = (n x d), trainY = (n x 1), predictX = (1 x d)
def knn(trainX, trainY, predictX, k=1, df='ed'):
    distance = 0
    predictx = np.array(predictX[0:1].values[0], dtype=long)
    # Distance function: Hamming distance
    if df == 'hd':
        distance = trainX.apply(lambda x: hammingDistance(np.array(x, dtype=long), predictx), axis=1)
    # Default: Euclidean distance
    else:
        distance = trainX.apply(lambda x: euclidianDistance(np.array(x, dtype=long), predictx), axis=1)
    # Find the nearest K neighbours
    knearest = sorted(zip(distance, trainY.iloc[:, 0]))[:k]
    # Group by class and return the class label of the largest group
    neighbours = pd.DataFrame(knearest, columns=['distance', 'class'])
    return neighbours['class'].value_counts().index.get_values()[0]
# Yields (training fold indices, validation fold indices)
def kfoldvalidation(observations, folds):
    foldSize = observations / folds
    allObservationIndices = np.arange(0, observations)
    # First (folds - 1) folds, each of foldSize indices
    for i in xrange(0, foldSize * (folds - 1), foldSize):
        validationIndices = np.arange(i, i + foldSize)
        trainingIndices = list(set(allObservationIndices) - set(validationIndices))
        yield trainingIndices, validationIndices
    # The last fold might have more than foldSize indices if observations
    # is not an exact multiple of folds
    validationIndices = np.arange(foldSize * (folds - 1), observations)
    trainingIndices = list(set(allObservationIndices) - set(validationIndices))
    yield trainingIndices, validationIndices
def knnWithNfolds(observationsX, observationsY, k, folds, df):
    totalObservations = len(observationsX)
    print "# observations:{0}, K: {1}, Folds: {2}".format(totalObservations, k, folds)
    currentFold = 0
    predictions = []
    for trainFold, validationFold in kfoldvalidation(totalObservations, folds):
        currentFold += 1
        print "knn of fold:{0} with k:{1}".format(currentFold, k)
        sys.stdout.flush()
        trainX = observationsX.ix[trainFold]
        trainY = observationsY.ix[trainFold]
        testX = observationsX.ix[validationFold]
        testY = observationsY.ix[validationFold]
        predictedY = []
        # For each test sample, get the predicted class (y)
        for i in xrange(0, len(testX)):
            predictedY.append(knn(trainX, trainY, testX[i:i+1], k, df))
        # Accuracy: 1 for a correct prediction, 0 for a wrong one
        predictions.extend([1 if a == b else 0 for a, b in
                            zip(predictedY, testY.iloc[:, 0].tolist())])
    meanAccuracy = np.mean(predictions)
    stdofAccuracy = np.std(predictions)
    return meanAccuracy, stdofAccuracy
def test(trainX, trainY, df):
    ma = {}
    st = {}
    for folds in xrange(2, 6):
        meanAccuracyPerK = []
        stdPerK = []
        for k in xrange(1, 6):
            meanAccuracy, stdofAccuracy = knnWithNfolds(trainX, trainY, k, folds, df)
            meanAccuracyPerK.append(meanAccuracy)
            stdPerK.append(stdofAccuracy)
        ma[folds] = meanAccuracyPerK
        st[folds] = stdPerK
    return ma, st
def test_1():
    data = pd.read_csv('datasets/breast-cancer-wisconsin.data', header=None)
    data = data.reindex(np.random.permutation(data.index))
    # Clean data: drop rows with missing ('?') features
    data.replace('?', np.nan, inplace=True)
    data.dropna(inplace=True)
    trainX = data[[1, 2, 3, 4, 5, 6, 7, 8, 9]]
    trainY = data[[10]]
    trainX.reset_index(drop=True, inplace=True)
    trainY.reset_index(drop=True, inplace=True)
    return test(trainX, trainY, 'ed')
def test_2():
    data = pd.read_csv('datasets/iris.data', header=None)
    labels = {"Iris-setosa": 1,
              "Iris-versicolor": 2,
              "Iris-virginica": 3}
    # Shuffle the observations
    data = data.reindex(np.random.permutation(data.index))
    # Map the class names to numeric labels
    data[5] = [labels[i] for i in data[4]]
    trainX = data[[0, 1, 2, 3]]
    trainY = data[[5]]
    trainX.reset_index(drop=True, inplace=True)
    trainY.reset_index(drop=True, inplace=True)
    return test(trainX, trainY, 'ed')
def test_3():
    data = pd.read_csv('datasets/wine.data', header=None)
    data = data.reindex(np.random.permutation(data.index))
    # Column 0 is the class label; columns 1-13 are the features
    trainX = data[list(np.arange(1, 14))]
    trainY = data[[0]]
    trainX.reset_index(drop=True, inplace=True)
    trainY.reset_index(drop=True, inplace=True)
    return test(trainX, trainY, 'ed')
def test_4():
    data = pd.read_csv('datasets/balance-scale.data', header=None)
    data = data.reindex(np.random.permutation(data.index))
    trainX = data[[1, 2, 3, 4]]
    trainY = data[[0]]
    trainX.reset_index(drop=True, inplace=True)
    trainY.reset_index(drop=True, inplace=True)
    return test(trainX, trainY, 'hd')
# Plot mean accuracy (with std error bars) against K for 2-5 folds
def plotThem(median, std, title):
    f, ax = plt.subplots(2, 2)
    #plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=.3, hspace=.3)
    plt.suptitle(title)
    x = np.arange(1, len(median[2]) + 1)
    ax[0, 0].errorbar(x, median[2], yerr=std[2], fmt='-o', clip_on=False)
    ax[0, 0].set_title('2 Fold')
    ax[0, 0].margins(0.1, 0.1)
    ax[0, 1].errorbar(x, median[3], yerr=std[3], fmt='-o')
    ax[0, 1].set_title('3 Fold')
    ax[0, 1].margins(0.1, 0.1)
    ax[1, 0].errorbar(x, median[4], yerr=std[4], fmt='-o')
    ax[1, 0].set_title('4 Fold')
    ax[1, 0].margins(0.1, 0.1)
    ax[1, 1].errorbar(x, median[5], yerr=std[5], fmt='-o')
    ax[1, 1].set_title('5 Fold')
    ax[1, 1].margins(0.1, 0.1)
    plt.setp([a.get_xticklabels() for a in ax[0, :]], visible=False)
    plt.setp([a.get_yticklabels() for a in ax[:, 1]], visible=False)
m1, s1 = test_1()
m2, s2 = test_2()
m3, s3 = test_3()
m4, s4 = test_4()
plotThem(m1, s1, "breast-cancer-wisconsin")
plotThem(m2, s2, "iris data")
plotThem(m3, s3, "wine.data")
plotThem(m4, s4, "balance scale data")
# Display all four figures
plt.show()