1. Implement and demonstrate the FIND-S algorithm for finding the most specific
hypothesis based on a given set of training data samples. Read the training data from a
.CSV file.
Program:
import csv

# read every training example from the CSV file
with open('tennis.csv', 'r') as f:
    reader = csv.reader(f)
    your_list = list(reader)

# start with the most specific hypothesis: '0' for every attribute
h = [['0'] * (len(your_list[0]) - 1)]

for i in your_list:
    print(i)
    if i[-1] == "True":              # use only the positive examples
        j = 0
        for x in i:
            if x != "True":          # skip the class label
                if x != h[0][j] and h[0][j] == '0':
                    h[0][j] = x      # first positive example fixes the value
                elif x != h[0][j] and h[0][j] != '0':
                    h[0][j] = '?'    # conflicting values generalise to '?'
                else:
                    pass
            j = j + 1

print("Most specific hypothesis is")
print(h)
Output:
2. For a given set of training data examples stored in a .CSV file, implement and
demonstrate the Candidate-Elimination algorithm to output a description of the set of all
hypotheses consistent with the training examples.
Program:
import csv

def get_domains(examples):
    # the set of values each attribute takes across all examples
    d = [set() for i in examples[0]]
    for x in examples:
        for i, xi in enumerate(x):
            d[i].add(xi)
    return [list(sorted(x)) for x in d]
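# The helper routines called below are not reproduced above; the following is a
# minimal sketch consistent with how they are used (the bodies, and the names
# min_generalizations and min_specializations, are assumptions).
def more_general(h1, h2):
    # True when hypothesis h1 is more general than, or equal to, h2
    parts = []
    for x, y in zip(h1, h2):
        parts.append(x == "?" or (x != "0" and (x == y or y == "0")))
    return all(parts)

def fulfills(example, hypothesis):
    # a hypothesis covers an example iff it is at least as general as it
    return more_general(hypothesis, example)

def min_generalizations(h, x):
    # minimally generalize h so that it covers the positive example x
    h_new = list(h)
    for i in range(len(h)):
        if not fulfills(x[i:i + 1], h[i:i + 1]):
            h_new[i] = '?' if h[i] != '0' else x[i]
    return [tuple(h_new)]

def min_specializations(h, domains, x):
    # minimal specializations of h that reject the negative example x
    results = []
    for i in range(len(h)):
        if h[i] == "?":
            for val in domains[i]:
                if x[i] != val:
                    results.append(h[:i] + (val,) + h[i + 1:])
        elif h[i] != "0":
            results.append(h[:i] + ('0',) + h[i + 1:])
    return results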
def generalize_S(x, G, S):
    S_prev = list(S)
    for s in S_prev:
        if s not in S:
            continue
        if not fulfills(x, s):           # s fails to cover the positive example
            S.remove(s)
            Splus = min_generalizations(s, x)
            # keep only generalizations that some member of G still covers
            S.update([h for h in Splus if
                      any([more_general(g, h)
                           for g in G])])
            # drop hypotheses more general than another member of S
            S.difference_update([h for h in S if
                                 any([more_general(h, h1)
                                      for h1 in S if h != h1])])
    return S

def specialize_G(x, domains, G, S):
    G_prev = list(G)
    for g in G_prev:
        if g not in G:
            continue
        if fulfills(x, g):               # g wrongly covers the negative example
            G.remove(g)
            Gminus = min_specializations(g, domains, x)
            # keep only specializations that still cover some member of S
            G.update([h for h in Gminus if
                      any([more_general(h, s)
                           for s in S])])
            # drop hypotheses more specific than another member of G
            G.difference_update([h for h in G if
                                 any([more_general(g1, h)
                                      for g1 in G if h != g1])])
    return G
def candidate_elimination(examples):
    domains = get_domains(examples)[:-1]
    n = len(domains)
    G = set([("?",) * n])     # most general boundary
    S = set([("0",) * n])     # most specific boundary
    i = 0
    print("\nS[0]:", str(S), "\nG[0]:", str(G))
    for xcx in examples:
        i = i + 1
        x, cx = xcx[:-1], xcx[-1]
        if cx == 'Y':                                  # positive example
            G = {g for g in G if fulfills(x, g)}
            S = generalize_S(x, G, S)
        else:                                          # negative example
            S = {s for s in S if not fulfills(x, s)}
            G = specialize_G(x, domains, G, S)
        print("\nS[{0}]:".format(i), S)
        print("G[{0}]:".format(i), G)
    return
# load the training examples (file name assumed) and run the algorithm
with open('trainingexamples.csv') as csvFile:
    examples = [tuple(line) for line in csv.reader(csvFile)]

candidate_elimination(examples)
Output:
3. Write a program to demonstrate the working of the decision tree based ID3 algorithm.
Use an appropriate data set for building the decision tree and apply this knowledge to
classify a new sample.
Program:
import math
import csv
def load_csv(filename):
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    headers = dataset.pop(0)
    return dataset, headers

class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""

def entropy(S):
    attr = list(set(S))
    if len(attr) == 1:   # if all are +ve/-ve then entropy = 0
        return 0
    counts = [0, 0]      # only two values possible, 'yes' or 'no'
    for i in range(2):
        counts[i] = sum([1 for x in S if attr[i] == x]) / (len(S) * 1.0)
    sums = 0
    for cnt in counts:
        sums += -1 * cnt * math.log(cnt, 2)
    return sums
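# The information-gain and tree-building routines are not reproduced above; a
# minimal sketch consistent with the Node class and the classify routine
# (the names compute_gain and build_tree are assumptions) is:
def compute_gain(data, col):
    # information gain of splitting the rows in 'data' on attribute index 'col'
    total_entropy = entropy([row[-1] for row in data])
    remainder = 0
    for value in set(row[col] for row in data):
        subset = [row[-1] for row in data if row[col] == value]
        remainder += (len(subset) / len(data)) * entropy(subset)
    return total_entropy - remainder

def build_tree(data, features):
    labels = [row[-1] for row in data]
    if len(set(labels)) == 1:            # pure node: make a leaf
        node = Node("")
        node.answer = labels[0]
        return node
    if len(data[0]) == 1:                # no attributes left: majority leaf
        node = Node("")
        node.answer = max(set(labels), key=labels.count)
        return node
    gains = [compute_gain(data, col) for col in range(len(data[0]) - 1)]
    split = gains.index(max(gains))      # attribute with the highest gain
    node = Node(features[split])
    remaining = features[:split] + features[split + 1:]
    for value in sorted(set(row[split] for row in data)):
        # rows with this value, with the chosen column removed
        rows = [row[:split] + row[split + 1:] for row in data if row[split] == value]
        node.children.append((value, build_tree(rows, remaining)))
    return node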
def classify(node, x_test, features):
    if node.answer != "":
        print(node.answer)
        return
    pos = features.index(node.attribute)
    for value, n in node.children:
        if x_test[pos] == value:
            classify(n, x_test, features)
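# A short driver for the routines above (the file names are assumptions):
dataset, features = load_csv("id3_train.csv")
node = build_tree(dataset, features)
print("Decision tree built; root attribute:", node.attribute)

testdata, test_headers = load_csv("id3_test.csv")
for x_test in testdata:
    print("Predicted label for", x_test, ":", end=" ")
    classify(node, x_test, features)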
Output:
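4. Build an Artificial Neural Network by implementing the Backpropagation algorithm and test the same using appropriate data sets.
Program:
# Assumed preamble for the listing below: the numpy import, a small training
# set (values here are illustrative only), input/output normalisation, and the
# sigmoid activation used together with sigmoid_grad.
import numpy as np

X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)   # example inputs (assumed)
y = np.array(([92], [86], [89]), dtype=float)         # example targets (assumed)
X = X / np.amax(X, axis=0)   # normalise each input column to [0, 1]
y = y / 100                  # scale the targets to [0, 1]

def sigmoid(x):
    return 1 / (1 + np.exp(-x))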
def sigmoid_grad(x):
    return x * (1 - x)          # derivative of the sigmoid, given its output

epoch = 1000                    # number of training iterations
eta = 0.2                       # learning rate
input_neurons = 2
hidden_neurons = 3
output_neurons = 1

# weights and biases initialised with random values
wh = np.random.uniform(size=(input_neurons, hidden_neurons))
bh = np.random.uniform(size=(1, hidden_neurons))
wout = np.random.uniform(size=(hidden_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    # forward pass
    h_ip = np.dot(X, wh) + bh
    h_act = sigmoid(h_ip)
    o_ip = np.dot(h_act, wout) + bout
    output = sigmoid(o_ip)
    # backpropagation of the error
    Eo = y - output
    outgrad = sigmoid_grad(output)
    d_output = Eo * outgrad
    Eh = d_output.dot(wout.T)
    hiddengrad = sigmoid_grad(h_act)
    d_hidden = Eh * hiddengrad
    # weight updates
    wout += h_act.T.dot(d_output) * eta
    wh += X.T.dot(d_hidden) * eta

print("Normalized Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n", output)
Output:
5. Write a program to implement the naïve Bayesian classifier for a sample training data
set stored as a .CSV file. Compute the accuracy of the classifier, considering a few test data
sets.
Program:
import csv
import math
import statistics as st

def loadCsv(filename):
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

def separateByClass(dataset):
    # map each class value to the list of instances belonging to it
    separated = {}
    for i in range(len(dataset)):
        x = dataset[i]
        if (x[-1] not in separated):
            separated[x[-1]] = []
        separated[x[-1]].append(x)
    return separated

def compute_mean_std(dataset):
    mean_std = [(st.mean(attribute), st.stdev(attribute))
                for attribute in zip(*dataset)]
    del mean_std[-1]          # drop the statistics of the class column
    return mean_std

def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summary = {}
    for classValue, instances in separated.items():
        summary[classValue] = compute_mean_std(instances)
    return summary
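# The Gaussian likelihood helper called below is not reproduced above; the body
# given here is the usual normal density and is an assumption.
def estimateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent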
def calculateClassProbabilities(summaries, testVector):   # function name assumed
    # product of the per-attribute Gaussian likelihoods for each class
    p = {}
    for classValue, classSummaries in summaries.items():
        p[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = testVector[i]
            p[classValue] *= estimateProbability(x, mean, stdev)
    return p
dataset = loadCsv('data51.csv')
print('Pima Indian Diabetes Dataset loaded...')
print('Total instances available :', len(dataset))
print('Total attributes present :', len(dataset[0]) - 1)
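# A sketch of the remaining driver (the split ratio and the helper name
# 'predict' are assumptions): label each test vector with its most probable
# class and report the classifier's accuracy.
def predict(summaries, testVector):
    probabilities = calculateClassProbabilities(summaries, testVector)
    return max(probabilities, key=probabilities.get)

splitRatio = 0.7                                    # assumed train/test split
trainSize = int(len(dataset) * splitRatio)
trainingSet, testSet = dataset[:trainSize], dataset[trainSize:]
summaries = summarizeByClass(trainingSet)

correct = sum(1 for v in testSet if predict(summaries, v) == v[-1])
print('Accuracy of the classifier:', 100.0 * correct / len(testSet))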
Output:
6. Assuming a set of documents that need to be classified, use the naïve Bayesian Classifier
model to perform this task. Built-in Java classes/API can be used to write the program.
Calculate the accuracy, precision, and recall for your data set.
Program:
import pandas as pd

msg = pd.read_csv('data6.csv', names=['message', 'label'])
print('Total instances in the dataset:', msg.shape[0])

msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
Y = msg.labelnum

print('\nThe message and its label of first 5 instances are listed below')
X5, Y5 = X[0:5], msg.label[0:5]
for x, y in zip(X5, Y5):
    print(x, ',', y)
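# The remainder of the program is not reproduced above; a minimal sketch using
# scikit-learn library classes is:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

xtrain, xtest, ytrain, ytest = train_test_split(X, Y)

count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)   # document-term matrix (train)
xtest_dtm = count_vect.transform(xtest)         # document-term matrix (test)

clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)

print('Accuracy :', metrics.accuracy_score(ytest, predicted))
print('Precision:', metrics.precision_score(ytest, predicted))
print('Recall   :', metrics.recall_score(ytest, predicted))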
Output:
8. Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same data set
for clustering using k-Means algorithm. Compare the results of these two algorithms and
comment on the quality of clustering. You can add Java/Python ML library classes/API in
the program.
Program:
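# Assumed preamble for the listing below: imports and loading of the Iris data
# into the DataFrames X and y used by the plotting code (the sepal column names
# are assumptions; the petal names match the code below).
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

iris = load_iris()
X = pd.DataFrame(iris.data,
                 columns=['Sepal_Length', 'Sepal_Width',
                          'Petal_Length', 'Petal_Width'])
y = pd.DataFrame(iris.target, columns=['Targets'])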
model = KMeans(n_clusters=3)
model.fit(X)

plt.figure(figsize=(14, 14))
colormap = np.array(['red', 'lime', 'black'])

plt.subplot(2, 2, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Real Clusters')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

plt.subplot(2, 2, 2)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[model.labels_], s=40)
plt.title('K-Means Clustering')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
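# The EM part of the comparison is not reproduced above; a sketch using a
# Gaussian mixture model from scikit-learn is:
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=3)
gmm.fit(X)
gmm_y = gmm.predict(X)

plt.subplot(2, 2, 3)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[gmm_y], s=40)
plt.title('GMM (EM) Clustering')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
plt.show()

# A note for the comparison: EM fits full-covariance Gaussians, so its labels
# typically track the true Iris species more closely than the spherical
# clusters produced by k-Means.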
Output:
9. Write a program to implement k-Nearest Neighbour algorithm to classify the iris data
set. Print both correct and wrong predictions. Java/Python ML library classes can be used
for this problem.
Program:
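The original listing is not reproduced here; the following is a minimal sketch using scikit-learn's k-NN classifier on the Iris data (the split ratio and the value of k are assumptions).

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    test_size=0.3)

classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# print every prediction, marking it as correct or wrong
for sample, actual, predicted in zip(x_test, y_test, y_pred):
    result = 'Correct' if actual == predicted else 'Wrong'
    print(sample, 'actual:', iris.target_names[actual],
          'predicted:', iris.target_names[predicted], '->', result)

print('Accuracy of the k-NN classifier:', classifier.score(x_test, y_test))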
Output:
10. Implement the non-parametric Locally Weighted Regression algorithm in order to fit
data points. Select appropriate data set for your experiment and draw graphs.
Program:
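# Assumed preamble for the listing below: imports (matching the np1/pd/plt
# names used later) and the Gaussian weight kernel that localWeight calls.
import numpy as np1
import pandas as pd
import matplotlib.pyplot as plt

def kernel(point, xmat, k):
    # diagonal matrix of weights, one per training point, from a Gaussian
    # centred on the query point with bandwidth k
    m, n = np1.shape(xmat)
    weights = np1.mat(np1.eye(m))
    for j in range(m):
        diff = point - xmat[j]
        weights[j, j] = np1.exp(diff * diff.T / (-2.0 * k ** 2))
    return weights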
def localWeight(point, xmat, ymat, k):
    wei = kernel(point, xmat, k)
    # weighted least-squares solution for this query point
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W

def localWeightRegression(xmat, ymat, k):
    m, n = np1.shape(xmat)
    ypred = np1.zeros(m)
    for i in range(m):
        ypred[i] = xmat[i] * localWeight(xmat[i], xmat, ymat, k)
    return ypred

data = pd.read_csv('data10.csv')
bill = np1.array(data.total_bill)
tip = np1.array(data.tip)
mbill = np1.mat(bill)
mtip = np1.mat(tip)

m = np1.shape(mbill)[1]
one = np1.mat(np1.ones(m))
X = np1.hstack((one.T, mbill.T))     # design matrix: [1, total_bill]

# set k (the kernel bandwidth) here
ypred = localWeightRegression(X, mtip, 2)

SortIndex = X[:, 1].argsort(0)
xsort = X[SortIndex][:, 0]

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(bill, tip, color='green')
ax.plot(xsort[:, 1], ypred[SortIndex], color='red', linewidth=5)
plt.xlabel('Total bill')
plt.ylabel('Tip')
plt.show()
Output: