
MACHINE LEARNING LAB RECORD

Submitted To:

Dr. Sarika Hegde

Submitted By:

Sachin Singh
4NM18CS142
VI sem
Sec: ‘C’
Program 1:
Implement and demonstrate the FIND-S algorithm for finding the most specific
hypothesis based on a given set of training data samples. Read the training data
from a .CSV file.
DATASET:
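(The dataset image is missing from the original record. Reconstructed from the printed output below, enjoysport.csv would contain:

sky,airtemp,humidity,wind,water,forecast,enjysport
sunny,warm,normal,strong,warm,same,yes
sunny,warm,high,strong,warm,same,yes
rainy,cold,high,strong,warm,change,no
sunny,warm,high,strong,cold,change,yes)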

CODE:

import csv

a = []
with open('./enjoysport.csv', 'r') as csvfile:
    for row in csv.reader(csvfile):
        a.append(row)

print(a)
print("\n\nThe total number of training instances:", len(a))

num_attribute = len(a[0]) - 1
print("\nThe initial hypothesis is:")
hypothesis = ['0'] * num_attribute
print(hypothesis)

# FIND-S: generalise the hypothesis on positive examples only
for i in range(len(a)):
    if a[i][num_attribute] == 'yes':
        for j in range(num_attribute):
            if hypothesis[j] == '0' or hypothesis[j] == a[i][j]:
                hypothesis[j] = a[i][j]
            else:
                hypothesis[j] = '?'
    print("\nThe hypothesis for training instance {0} is".format(i + 1), hypothesis)

print("\nThe maximally specific hypothesis for the training instances is:")
print(hypothesis)

OUTPUT:

[['sky ', 'airtemp', 'humidity', 'wind', 'water', 'forecast', 'enjysport'], ['sunny', 'warm', 'normal',
'strong', 'warm', 'same', 'yes'], ['sunny', 'warm', 'high', 'strong', 'warm', 'same', 'yes'], ['rainy',
'cold', 'high', 'strong', 'warm', 'change', 'no'], ['sunny', 'warm', 'high', 'strong', 'cold', 'change',
'yes']]
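(The hypothesis trace is not reproduced in the record. For this data, FIND-S should converge to the maximally specific hypothesis ['sunny', 'warm', '?', 'strong', '?', '?'], since the third positive example differs from the first two only in humidity, water and forecast.)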

Program 2:

For a given set of training data examples stored in a .CSV file, implement and
demonstrate the Candidate-Elimination algorithm to output a description of
the set of all hypotheses consistent with the training examples.

DATASET:

CODE:
import numpy as np
import pandas as pd

data = pd.DataFrame(data=pd.read_csv('enjoysport.csv'))
concepts = np.array(data.iloc[:, 0:-1])
print("\nInstances are\n", concepts)
target = np.array(data.iloc[:, -1])
print("\nTarget values:", target)

def learn(concepts, target):
    specific_h = concepts[0].copy()
    print("Initialisation of specific_h and general_h")
    print("Specific boundary:", specific_h)
    general_h = [["?" for i in range(len(specific_h))] for i in range(len(specific_h))]
    print("Generic boundary:", general_h)

    for i, h in enumerate(concepts):
        print("\nInstance", i + 1, "is", h)
        if target[i] == "yes":
            print("Instance is positive")
            # positive example: generalise S, relax the matching G entries
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    specific_h[x] = '?'
                    general_h[x][x] = '?'
        if target[i] == "no":
            print("Instance is negative")
            # negative example: specialise G against the current S
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    general_h[x][x] = specific_h[x]
                else:
                    general_h[x][x] = '?'
        print("Specific boundary after", i + 1, "instances:", specific_h)
        print("Generic boundary after", i + 1, "instances:", general_h)
        print("\n")

    # drop the rows of general_h that remained fully general
    indices = [i for i, val in enumerate(general_h) if val == ['?', '?', '?', '?', '?', '?']]
    for i in indices:
        general_h.remove(['?', '?', '?', '?', '?', '?'])
    return specific_h, general_h

s_final, g_final = learn(concepts, target)
print("Final specific hypothesis:", s_final, sep="\n")
print("Final general hypotheses:", g_final, sep="\n")
OUTPUT:
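(The run itself is omitted in the record. For the classic four-example EnjoySport data above, the expected final boundaries are S = ['sunny', 'warm', '?', 'strong', '?', '?'] and G = [['sunny', '?', '?', '?', '?', '?'], ['?', 'warm', '?', '?', '?', '?']].)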

Program 3:

Write a program to demonstrate the working of the decision tree based ID3
algorithm. Use an appropriate data set for building the decision tree and apply
this knowledge to classify a new sample.

DATASET:
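(The dataset image is missing here. From the code and the printed tree, playtennis.csv appears to be a Play Tennis table with a header like outlook,temperature,humidity,wind,PlayTennis and lowercase values such as sunny/overcast/rainy, hot/mild/cool, strong/weak, and yes/no labels.)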
CODE:
import pandas as pd
import numpy as np

dataset = pd.read_csv('./playtennis.csv')
print(dataset)

def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = np.sum([(-counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts))
                      for i in range(len(elements))])
    return entropy

def InfoGain(data, split_attribute_name, target_name="PlayTennis"):
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    Weighted_Entropy = np.sum([(counts[i] / np.sum(counts)) *
                               entropy(data.where(data[split_attribute_name] == vals[i]).dropna()[target_name])
                               for i in range(len(vals))])
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

def ID3(data, originaldata, features, target_attribute_name="PlayTennis", parent_node_class=None):
    # all remaining examples share one label: return that label as a leaf
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    # no examples left: fall back to the majority label of the original data
    elif len(data) == 0:
        return np.unique(originaldata[target_attribute_name])[
            np.argmax(np.unique(originaldata[target_attribute_name], return_counts=True)[1])]
    # no features left to split on: return the parent's majority label
    elif len(features) == 0:
        return parent_node_class
    else:
        parent_node_class = np.unique(data[target_attribute_name])[
            np.argmax(np.unique(data[target_attribute_name], return_counts=True)[1])]
        # pick the feature with the highest information gain
        item_values = [InfoGain(data, feature, target_attribute_name) for feature in features]
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        tree = {best_feature: {}}
        features = [i for i in features if i != best_feature]
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = ID3(sub_data, dataset, features, target_attribute_name, parent_node_class)
            tree[best_feature][value] = subtree
        return tree

tree = ID3(dataset, dataset, dataset.columns[:-1])
print('\nDisplay Tree\n', tree)
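The prompt also asks to classify a new sample, which the submission does not do. A minimal sketch (not part of the original code) that walks the returned nested-dict tree could look like this; the sample values and the 'yes' default are hypothetical:

def classify(instance, tree, default='yes'):
    # leaves are plain labels; internal nodes are {attribute: {value: subtree}}
    if not isinstance(tree, dict):
        return tree
    attribute = next(iter(tree))
    value = instance.get(attribute)
    if value not in tree[attribute]:
        return default  # unseen attribute value: fall back to a default label
    return classify(instance, tree[attribute][value], default)

sample = {'outlook': 'sunny', 'temperature': 'cool', 'humidity': 'high', 'wind': 'strong'}
print('Classified as:', classify(sample, tree))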

OUTPUT:

Display Tree {'outlook': {'overcast': 'yes', 'rainy': {'wind': {'strong': 'no', 'weak': 'yes'}}, 'sunny':
{'temperature': {'cool': 'yes', 'hot': 'no', 'mild': {'wind': {'strong': 'yes', 'weak': 'no'}}}}}}
Program 4:

Build an Artificial Neural Network by implementing the Backpropagation
algorithm and test the same using appropriate data sets.

CODE:
import numpy as np

X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
X = X / np.amax(X, axis=0)  # normalise each input column by its maximum
y = y / 100                 # scale targets into [0, 1]

# Sigmoid activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Derivative of the sigmoid; x here is already sigmoid(z), so s'(z) = x * (1 - x)
def derivatives_sigmoid(x):
    return x * (1 - x)

# Variable initialisation
epoch = 5  # number of training iterations
lr = 0.1   # learning rate
inputlayer_neurons = 2   # number of features in the data set
hiddenlayer_neurons = 3  # number of hidden-layer neurons
output_neurons = 1       # number of neurons at the output layer

# weight and bias initialisation: uniform random values of dimension x*y
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    # Forward propagation
    hinp1 = np.dot(X, wh)
    hinp = hinp1 + bh
    hlayer_act = sigmoid(hinp)
    outinp1 = np.dot(hlayer_act, wout)
    outinp = outinp1 + bout
    output = sigmoid(outinp)

    # Backpropagation
    EO = y - output
    outgrad = derivatives_sigmoid(output)
    d_output = EO * outgrad
    EH = d_output.dot(wout.T)
    hiddengrad = derivatives_sigmoid(hlayer_act)  # how much the hidden-layer weights contributed to the error
    d_hiddenlayer = EH * hiddengrad

    wout += hlayer_act.T.dot(d_output) * lr  # dot product of next-layer error and current-layer output
    wh += X.T.dot(d_hiddenlayer) * lr
    # bias updates (these were missing in the original submission)
    bout += np.sum(d_output, axis=0, keepdims=True) * lr
    bh += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr

    print("-----------Epoch-", i + 1, "Starts----------")
    print("Input:\n" + str(X))
    print("Actual Output:\n" + str(y))
    print("Predicted Output:\n", output)
    print("-----------Epoch-", i + 1, "Ends----------\n")

print("Input:\n" + str(X))
print("Actual Output:\n" + str(y))
print("Predicted Output:\n", output)

OUTPUT:
Program 5:

Write a program to implement the naïve Bayesian classifier for a sample
training data set stored as a .CSV file. Compute the accuracy of the classifier,
considering a few test data sets.

DATASET:

…………..
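(The dataset listing is elided above. Judging from the filename, pima_indian.csv is the Pima Indians Diabetes data: 768 rows of 8 numeric attributes followed by a 0/1 diabetes label, with no header row, which is why every column is converted to float.)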

CODE:
import csv
import random
import math

def loadcsv(filename):
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    for i in range(len(dataset)):
        # converting strings into numbers for processing
        dataset[i] = [x.replace(' ', '') for x in dataset[i]]
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

def splitdataset(dataset, splitratio):
    # e.g. 67% training size
    trainsize = int(len(dataset) * splitratio)
    trainset = []
    copy = list(dataset)
    while len(trainset) < trainsize:
        # generate random indices to pick elements for the training data
        index = random.randrange(len(copy))
        trainset.append(copy.pop(index))
    return [trainset, copy]

def separatebyclass(dataset):
    # dictionary of classes 1 and 0, where the values are
    # the instances belonging to each class
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if vector[-1] not in separated:
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

def mean(numbers):
    return sum(numbers) / float(len(numbers))

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

def summarize(dataset):
    # (mean, stdev) for every attribute column
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]  # exclude the class label column
    return summaries

def summarizebyclass(dataset):
    separated = separatebyclass(dataset)
    summaries = {}
    for classvalue, instances in separated.items():
        # summaries is a dict of (mean, std) tuples for each class value
        summaries[classvalue] = summarize(instances)
    return summaries

def calculateprobability(x, mean, stdev):
    # Gaussian probability density function
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

def calculateclassprobabilities(summaries, inputvector):
    # probabilities of every class for one test vector
    probabilities = {}
    for classvalue, classsummaries in summaries.items():
        probabilities[classvalue] = 1
        for i in range(len(classsummaries)):
            # mean and sd of every attribute, for class 0 and 1 separately
            mean, stdev = classsummaries[i]
            x = inputvector[i]
            probabilities[classvalue] *= calculateprobability(x, mean, stdev)  # normal distribution
    return probabilities

def predict(summaries, inputvector):
    probabilities = calculateclassprobabilities(summaries, inputvector)
    bestLabel, bestProb = None, -1
    for classvalue, probability in probabilities.items():
        # assign the class that has the highest probability
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classvalue
    return bestLabel

def getpredictions(summaries, testset):
    predictions = []
    for i in range(len(testset)):
        result = predict(summaries, testset[i])
        predictions.append(result)
    return predictions

def getaccuracy(testset, predictions):
    correct = 0
    for i in range(len(testset)):
        if testset[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(testset))) * 100.0

def main():
    filename = 'pima_indian.csv'
    splitratio = 0.67
    dataset = loadcsv(filename)
    trainingset, testset = splitdataset(dataset, splitratio)
    print('Split {0} rows into train={1} and test={2} rows'.format(
        len(dataset), len(trainingset), len(testset)))
    # prepare the model
    summaries = summarizebyclass(trainingset)
    # test the model: predict the test data from the training summaries
    predictions = getpredictions(summaries, testset)
    accuracy = getaccuracy(testset, predictions)
    print('Accuracy of the classifier is : {0}%'.format(accuracy))

main()

OUTPUT:
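(The run is omitted in the record. With the full 768-row Pima data and splitratio = 0.67, the first line would read 'Split 768 rows into train=514 and test=254 rows', and the reported accuracy for this classifier usually lands in the 70-80% range.)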

Program 6:

Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same
data set for clustering using k-Means algorithm. Compare the results of these
two algorithms and comment on the quality of clustering. You can add
Java/Python ML library classes/API in the program.

DATASET:
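(No CSV is needed here: the code loads the Iris data directly through sklearn.datasets.load_iris(), 150 flowers with four measurements and three species labels.)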
CODE:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm
import pandas as pd
import numpy as np

iris = datasets.load_iris()

X = pd.DataFrame(iris.data)
X.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']

y = pd.DataFrame(iris.target)
y.columns = ['Targets']

model = KMeans(n_clusters=3)
model.fit(X)

plt.figure(figsize=(14, 7))
colormap = np.array(['red', 'lime', 'black'])

# Plot the original classifications
plt.subplot(1, 3, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Real Classification')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

# Plot the k-means classifications
plt.subplot(1, 3, 2)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[model.labels_], s=40)
plt.title('K-Means Classification')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
print('The accuracy score of K-Means: ', sm.accuracy_score(y, model.labels_))
print('The confusion matrix of K-Means: ', sm.confusion_matrix(y, model.labels_))

# Standardise the features before fitting the Gaussian mixture
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns=X.columns)

from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)
y_gmm = gmm.predict(xs)

# Plot the EM (GMM) classifications
plt.subplot(1, 3, 3)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y_gmm], s=40)
plt.title('GMM Classification')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

print('The accuracy score of EM: ', sm.accuracy_score(y, y_gmm))
print('The confusion matrix of EM: ', sm.confusion_matrix(y, y_gmm))
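One caveat worth noting: k-means and GMM assign arbitrary cluster numbers, so accuracy_score only reads sensibly here when the learned cluster indices happen to line up with the true class indices; the confusion matrices are the more reliable way to compare the quality of the two clusterings.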

OUTPUT:
Program 7:

Write a program to implement the k-Nearest Neighbour algorithm to classify the
Iris data set.

DATASET:
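(The dataset image is missing here. iris.data is the UCI Iris file: 150 comma-separated rows with no header, e.g.

5.1,3.5,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
6.3,3.3,6.0,2.5,Iris-virginica)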

CODE:
# k-nearest neighbors on the Iris Flowers dataset
# columns: sepal_length, sepal_width, petal_length, petal_width, species
from random import seed
from random import randrange
from csv import reader
from math import sqrt

# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

dataset = load_csv('iris.data')
print(dataset, end="\n\n")

# Convert a string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Convert a string column to integer class labels
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Find the min and max values for each column
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross-validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

# Calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1) - 1):
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)

# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    distances = list()
    for train_row in train:
        dist = euclidean_distance(test_row, train_row)
        distances.append((train_row, dist))
    distances.sort(key=lambda tup: tup[1])
    neighbors = list()
    for i in range(num_neighbors):
        neighbors.append(distances[i][0])
    return neighbors

# Make a prediction with neighbors
def predict_classification(train, test_row, num_neighbors):
    neighbors = get_neighbors(train, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction

# kNN algorithm
def k_nearest_neighbors(train, test, num_neighbors):
    predictions = list()
    for row in test:
        output = predict_classification(train, row, num_neighbors)
        predictions.append(output)
    return predictions

# Test the kNN on the Iris Flowers dataset
seed(1)
for i in range(len(dataset[0]) - 1):
    str_column_to_float(dataset, i)
# convert the class column to integers
str_column_to_int(dataset, len(dataset[0]) - 1)
# evaluate the algorithm
n_folds = 5
num_neighbors = 5
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))), end="\n\n")
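Note that dataset_minmax and normalize_dataset are defined but never called. If you want the features rescaled to [0, 1] before evaluation (a reasonable choice for a distance-based method, though not required for Iris), one sketch would be to add, just before evaluate_algorithm is called:

minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)

(This rescales every column, including the integer class column; the equality comparisons in accuracy_metric and predict_classification still work because the labels stay distinct.)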

OUTPUT:
Program 8:
Implement the non-parametric Locally Weighted Regression algorithm in order
to fit data points. Select an appropriate data set for your experiment and draw
graphs.
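The idea, as implemented below: for each query point x, every training point x_j receives a weight w_j = exp(-(x - x_j)(x - x_j)^T / (2k^2)), and a weighted least-squares fit W = (X^T w X)^(-1) X^T w y is solved locally, so the fitted curve bends to follow nearby data. The bandwidth k controls how local the fit is.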

DATASET:
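(The dataset image is missing here. From the column names used in the code, 10-dataset.csv is the restaurant tips data with at least total_bill and tip columns.)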

CODE:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def kernel(point, xmat, k):
    # diagonal weight matrix: points near the query get weight close to 1
    m, n = np.shape(xmat)
    weights = np.mat(np.eye(m))
    for j in range(m):
        diff = point - xmat[j]
        weights[j, j] = np.exp(diff * diff.T / (-2.0 * k ** 2))
    return weights

def localWeight(point, xmat, ymat, k):
    # weighted least-squares solution W = (X^T w X)^-1 X^T w y
    wei = kernel(point, xmat, k)
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W

def localWeightRegression(xmat, ymat, k):
    m, n = np.shape(xmat)
    ypred = np.zeros(m)
    for i in range(m):
        ypred[i] = xmat[i] * localWeight(xmat[i], xmat, ymat, k)
    return ypred

# load the data points
data = pd.read_csv('10-dataset.csv')
bill = np.array(data.total_bill)
tip = np.array(data.tip)

# prepare the design matrix: prepend a column of ones to the bill values
mbill = np.mat(bill)
mtip = np.mat(tip)
m = np.shape(mbill)[1]
one = np.mat(np.ones(m))
X = np.hstack((one.T, mbill.T))

# set the bandwidth k here
ypred = localWeightRegression(X, mtip, 0.5)
SortIndex = X[:, 1].argsort(0)
xsort = X[SortIndex][:, 0]

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(bill, tip, color='green')
ax.plot(xsort[:, 1], ypred[SortIndex], color='red', linewidth=5)
plt.xlabel('Total bill')
plt.ylabel('Tip')
plt.show()

OUTPUT:
