15CSL76 Students

Date: --/ -- /----
1. Implement and demonstrate the FIND-S algorithm for finding the most specific
hypothesis based on a given set of training data samples. Read the training data from a
.CSV file.
Program:
import csv
# FIND-S: start from the most specific hypothesis and generalise it just
# enough to cover every positive training example.
with open('tennis.csv', 'r', newline='') as f:
    # Blank lines are dropped so the row[-1] label lookup below cannot fail.
    your_list = [row for row in csv.reader(f) if row]

# One slot per attribute (every column except the trailing class label).
# '0' means "no value accepted yet"; '?' means "any value accepted".
num_attributes = len(your_list[0]) - 1 if your_list else 0
h = [['0'] * num_attributes]

for i in your_list:
    print(i)
    # FIND-S ignores negative examples; the label is the last column.
    if i[-1] == "True":
        # Walk the attribute columns by position so that an attribute whose
        # value happens to be the string "True" is not mistaken for the label.
        for j, x in enumerate(i[:-1]):
            if h[0][j] == '0':
                # First positive example: adopt its value.
                h[0][j] = x
            elif h[0][j] != x:
                # Conflicting values across positive examples: generalise.
                h[0][j] = '?'

print("Most specific hypothesis is")
print(h)
Output
'Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same',True

'Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same',True
'Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change',False
'Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change',True
Maximally Specific set

[['Sunny', 'Warm', '?', 'Strong', '?', '?']]
Page 1
Date: --/ -- /----
2. For a given set of training data examples stored in a .CSV file, implement and
demonstrate the Candidate-Elimination algorithm to output a description of the set of all
hypotheses consistent with the training examples.
Program:
import csv
def get_domains(examples):
    """Return the sorted distinct values observed in each column.

    Args:
        examples: sequence of equally long rows (tuples or lists).

    Returns:
        A list with one sorted list of distinct values per column of the
        first row; an empty list when ``examples`` is empty.
    """
    if not examples:
        return []
    d = [set() for _ in examples[0]]
    for x in examples:
        for i, xi in enumerate(x):
            d[i].add(xi)
    return [sorted(values) for values in d]
def more_general(h1, h2):
    """Return True when hypothesis ``h1`` is at least as general as ``h2``.

    The two sequences are compared position by position (up to the shorter
    length). A position of ``h1`` covers the matching position of ``h2``
    when it is the wildcard ``"?"``, or when it is not the empty marker
    ``"0"`` and either equals the other value or the other value is ``"0"``.
    """
    return all(
        x == "?" or (x != "0" and (x == y or y == "0"))
        for x, y in zip(h1, h2)
    )
def fulfills(example, hypothesis):
    """Return True when ``hypothesis`` covers ``example``.

    An example is treated as a fully specified hypothesis, so coverage is
    simply "the hypothesis is more general than the example".
    """
    return more_general(hypothesis, example)
def min_generalizations(h, x):
    """Return the minimal generalisation of ``h`` that covers example ``x``.

    Every position of ``h`` that does not cover the matching value of ``x``
    is relaxed: the empty marker ``'0'`` becomes the example's value, any
    other value becomes the wildcard ``'?'``.

    Returns:
        A one-element list holding the generalised hypothesis as a tuple.
    """
    h_new = list(h)
    for i, current in enumerate(h):
        # One-element slices let fulfills() judge a single attribute.
        if not fulfills(x[i:i + 1], h[i:i + 1]):
            h_new[i] = '?' if current != '0' else x[i]
    return [tuple(h_new)]
def min_specializations(h, domains, x):
    """Return the minimal specialisations of ``h`` that exclude example ``x``.

    Args:
        h: hypothesis (any sequence; it is converted to a tuple).
        domains: per-attribute lists of possible values.
        x: the example that must no longer be covered.

    Returns:
        A list of tuples. For each wildcard position there is one candidate
        per domain value different from the example's value; for each
        concrete (non-``'0'``) position there is one candidate with that
        position set to ``'0'``.
    """
    h = tuple(h)  # slicing/concatenation below needs a tuple
    results = []
    for i, current in enumerate(h):
        if current == "?":
            for val in domains[i]:
                if x[i] != val:
                    results.append(h[:i] + (val,) + h[i + 1:])
        elif current != "0":
            results.append(h[:i] + ('0',) + h[i + 1:])
    return results
def generalize_S(x, G, S):
    """Generalise the specific boundary ``S`` so it covers positive example ``x``.

    ``S`` is modified in place and also returned.

    Args:
        x: attribute values of a positive example.
        G: set of maximally general hypotheses (read only here).
        S: set of maximally specific hypotheses (mutated).
    """
    # Iterate over a snapshot because S is mutated inside the loop.
    S_prev = list(S)
    for s in S_prev:
        if s not in S:
            continue
        if not fulfills(x, s):
            S.remove(s)
            Splus = min_generalizations(s, x)
            # Keep only generalisations still bounded by some member of G.
            S.update([h for h in Splus
                      if any(more_general(g, h) for g in G)])
            # Drop hypotheses that are more general than another member of S.
            # The list is built fully before S is modified.
            S.difference_update([h for h in S
                                 if any(more_general(h, h1)
                                        for h1 in S if h != h1)])
    return S
def specialize_G(x, domains, G, S):
    """Specialise the general boundary ``G`` so it excludes negative example ``x``.

    ``G`` is modified in place and also returned.

    Args:
        x: attribute values of a negative example.
        domains: per-attribute lists of possible values.
        G: set of maximally general hypotheses (mutated).
        S: set of maximally specific hypotheses (read only here).
    """
    # Iterate over a snapshot because G is mutated inside the loop.
    G_prev = list(G)
    for g in G_prev:
        if g not in G:
            continue
        if fulfills(x, g):
            G.remove(g)
            Gminus = min_specializations(g, domains, x)
            # Keep only specialisations that still cover some member of S.
            G.update([h for h in Gminus
                      if any(more_general(h, s) for s in S)])
            # Drop hypotheses that are covered by another member of G.
            # The list is built fully before G is modified.
            G.difference_update([h for h in G
                                 if any(more_general(g1, h)
                                        for g1 in G if h != g1)])
    return G
def candidate_elimination(examples, positive_label='Y'):
    """Run Candidate-Elimination over ``examples`` and print each step.

    Args:
        examples: rows whose last element is the class label and whose
            other elements are attribute values.
        positive_label: label value that marks a positive example; every
            other label is treated as negative.

    Returns:
        The final ``(S, G)`` boundary sets.
    """
    # The last column is the label, so it has no attribute domain.
    domains = get_domains(examples)[:-1]
    n = len(domains)
    G = {("?",) * n}
    S = {("0",) * n}
    print("Maximally specific hypotheses - S ")
    print("Maximally general hypotheses - G ")
    print("\nS[0]:", str(S), "\nG[0]:", str(G))
    for i, xcx in enumerate(examples, start=1):
        x, cx = xcx[:-1], xcx[-1]
        if cx == positive_label:
            # Positive example: drop inconsistent members of G, generalise S.
            G = {g for g in G if fulfills(x, g)}
            S = generalize_S(x, G, S)
        else:
            # Negative example: drop inconsistent members of S, specialise G.
            S = {s for s in S if not fulfills(x, s)}
            G = specialize_G(x, domains, G, S)
        print("\nS[{0}]:".format(i), S)
        print("G[{0}]:".format(i), G)
    return S, G
Page 3
Date: --/ -- /----
# Load the training examples (one tuple per non-blank CSV row) and run the
# algorithm on them.
with open('c1.csv', newline='') as csvFile:
    examples = [tuple(line) for line in csv.reader(csvFile) if line]
candidate_elimination(examples)
Output:
Page 4
Date: --/ -- /----
3. Write a program to demonstrate the working of the decision tree based ID3 algorithm.
Use an appropriate data set for building the decision tree and apply this knowledge to
classify a new sample.
Program:
import math
import csv
def load_csv(filename):
    """Read a CSV file and split it into data rows and the header row.

    Args:
        filename: path of the CSV file; its first row is the header.

    Returns:
        ``(dataset, headers)`` where ``dataset`` is a list of rows (lists of
        strings) and ``headers`` is the first row. Both are empty lists for
        an empty file.
    """
    # The context manager closes the file once it has been read.
    with open(filename, "r", newline="") as csv_file:
        dataset = list(csv.reader(csv_file))
    headers = dataset.pop(0) if dataset else []
    return dataset, headers
class Node:
    """A decision-tree node: an attribute test or, when ``answer`` is set, a leaf."""

    def __init__(self, attribute):
        # Name of the attribute tested at this node ("" for a leaf).
        self.attribute = attribute
        # (attribute value, child Node) pairs, one per branch.
        self.children = []
        # Class label for a leaf; "" marks an internal node.
        self.answer = ""
def subtables(data, col, delete):
    """Group the rows of ``data`` by their value in column ``col``.

    Args:
        data: list of rows.
        col: index of the column to group on.
        delete: when true, the grouped rows are copies with column ``col``
            removed; the rows in ``data`` are left untouched.

    Returns:
        ``(attr, dic)`` where ``attr`` lists the distinct column values in
        first-seen order and ``dic`` maps each value to its list of rows.
    """
    dic = {}
    for row in data:
        key = row[col]
        if delete:
            # Work on a copy so the caller's table is not destroyed.
            row = list(row)
            del row[col]
        dic.setdefault(key, []).append(row)
    # dict preserves insertion order, so the result is stable between runs.
    attr = list(dic)
    return attr, dic
def entropy(S):
    """Return the base-2 entropy of the label sequence ``S``.

    Works for any number of distinct labels. Returns 0 when ``S`` is empty
    or contains a single distinct label.
    """
    counts = {}
    for label in S:
        counts[label] = counts.get(label, 0) + 1
    if len(counts) <= 1:  # empty or pure: no uncertainty
        return 0
    total = len(S)
    sums = 0
    for cnt in counts.values():
        p = cnt / total
        sums -= p * math.log(p, 2)
    return sums
def compute_gain(data, col):
    """Return the information gain obtained by splitting ``data`` on column ``col``.

    The gain is the entropy of the labels (last column) minus the
    size-weighted entropy of the labels in each sub-table.
    """
    attValues, dic = subtables(data, col, delete=False)
    total_entropy = entropy([row[-1] for row in data])
    for value in attValues:
        subset = dic[value]
        ratio = len(subset) / (len(data) * 1.0)
        entro = entropy([row[-1] for row in subset])
        total_entropy -= ratio * entro
    return total_entropy
def build_tree(data, features):
    """Recursively build an ID3 decision tree.

    Args:
        data: list of rows; the last element of each row is the label.
        features: column names aligned with the columns of ``data``.

    Returns:
        The root ``Node`` of the (sub)tree.
    """
    lastcol = [row[-1] for row in data]
    # If all samples have the same label, return a leaf with that label.
    if len(set(lastcol)) == 1:
        node = Node("")
        node.answer = lastcol[0]
        return node

    n = len(data[0]) - 1
    if n == 0:
        # No attribute left to split on while labels still disagree:
        # fall back to the most frequent label instead of calling max([]).
        node = Node("")
        node.answer = max(lastcol, key=lastcol.count)
        return node

    gains = [compute_gain(data, col) for col in range(n)]
    split = gains.index(max(gains))  # index of the highest-gain attribute
    node = Node(features[split])     # 'node' stores the selected attribute
    # Children see the feature list without the attribute just used.
    fea = features[:split] + features[split + 1:]
    attr, dic = subtables(data, split, delete=True)
    for value in attr:
        child = build_tree(dic[value], fea)
        node.children.append((value, child))
    return node
def print_tree(node, level):
    """Print the tree rooted at ``node``, indented by ``level`` spaces.

    A leaf prints its answer. An internal node prints its attribute, then
    each branch value one level deeper and the branch's subtree two levels
    deeper.
    """
    if node.answer != "":
        print(" " * level, node.answer)  # leaf: class label
        return
    print(" " * level, node.attribute)   # internal node: attribute name
    for value, n in node.children:
        print(" " * (level + 1), value)
        print_tree(n, level + 2)
def classify(node, x_test, features):
    """Print and return the label the tree predicts for ``x_test``.

    Args:
        node: root of the (sub)tree to follow.
        x_test: attribute values of the instance.
        features: column names aligned with ``x_test``.

    Returns:
        The predicted label, or ``None`` (after printing ``Unknown``) when
        the instance has an attribute value with no matching branch.
    """
    if node.answer != "":
        print(node.answer)
        return node.answer
    pos = features.index(node.attribute)
    for value, n in node.children:
        if x_test[pos] == value:
            return classify(n, x_test, features)
    # No branch matches this value: still terminate the output line so the
    # caller's "predicted label" prompt is not left dangling.
    print("Unknown")
    return None
# Build the decision tree from the training file and display it.
dataset, features = load_csv("data3.csv")  # Read Tennis data
node = build_tree(dataset, features)  # Build decision tree
print("The decision tree for the dataset using ID3 algorithm is ")
print_tree(node, 0)

# Classify every instance of the test file; `features` is rebound to the
# test file's header so column positions are looked up in the test layout.
testdata, features = load_csv("data3_test.csv")
for xtest in testdata:
    print("The test instance : ", xtest)
    print("The predicted label : ", end="")
    classify(node, xtest, features)
Page 6
Date: --/ -- /----
Output:
Page 7
Date: --/ -- /----
4. Build an Artificial Neural Network by implementing the Backpropagation algorithm

and test the same using appropriate data sets.
Program:
import numpy as np
# Three training samples with two input features each. Every feature column
# is scaled by its own maximum so all inputs fall in (0, 1].
X = np.array([[2, 9], [1, 5], [3, 6]], dtype=float)
X = X / X.max(axis=0)
# One target per sample, divided by 100 so it fits the sigmoid's range.
y = np.array([[92], [86], [89]], dtype=float) / 100
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x`` (element-wise)."""
    return 1 / (1 + np.exp(-x))
def sigmoid_grad(x):
    """Return ``x * (1 - x)``, the sigmoid derivative in terms of its output.

    ``x`` must already be a sigmoid activation (the value ``sigmoid(z)``),
    not the pre-activation ``z``.
    """
    return x * (1 - x)
# Hyper-parameters and layer sizes.
epoch = 1000
eta = 0.2
input_neurons = 2
hidden_neurons = 3
output_neurons = 1

# Weights and biases start as uniform random values in [0, 1).
wh = np.random.uniform(size=(input_neurons, hidden_neurons))
bh = np.random.uniform(size=(1, hidden_neurons))
wout = np.random.uniform(size=(hidden_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    # Forward pass.
    h_ip = np.dot(X, wh) + bh
    h_act = sigmoid(h_ip)
    o_ip = np.dot(h_act, wout) + bout
    output = sigmoid(o_ip)

    # Backward pass: output error, then the error propagated to the hidden
    # layer through the (not yet updated) output weights.
    Eo = y - output
    outgrad = sigmoid_grad(output)
    d_output = Eo * outgrad
    Eh = d_output.dot(wout.T)
    hiddengrad = sigmoid_grad(h_act)
    d_hidden = Eh * hiddengrad

    # Update weights and biases. A bias receives the delta summed over all
    # samples; without these two updates the biases would stay at their
    # random initial values for the whole run.
    wout += h_act.T.dot(d_output) * eta
    bout += np.sum(d_output, axis=0, keepdims=True) * eta
    wh += X.T.dot(d_hidden) * eta
    bh += np.sum(d_hidden, axis=0, keepdims=True) * eta

print("Normalized Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n", output)
Output:
Page 8
Date: --/ -- /----
5. Write a program to implement the naïve Bayesian classifier for a sample training data
set stored as a .CSV file. Compute the accuracy of the classifier, considering few test data
sets.
Program:
import csv, random, math

import statistics as st
def loadCsv(filename):
    """Read a headerless, all-numeric CSV file.

    Args:
        filename: path of the CSV file.

    Returns:
        A list of rows, each a list of floats. Blank lines are skipped.

    Raises:
        ValueError: if a field cannot be converted to ``float``.
    """
    # The context manager closes the file once it has been read.
    with open(filename, "r", newline="") as csv_file:
        return [[float(x) for x in row] for row in csv.reader(csv_file) if row]
def splitDataset(dataset, splitRatio):
    """Randomly split ``dataset`` into training and test rows.

    Args:
        dataset: list of rows; it is not modified.
        splitRatio: fraction of the rows to move into the test set.

    Returns:
        ``[trainSet, testSet]``. The test set holds
        ``int(len(dataset) * splitRatio)`` randomly chosen rows, capped at
        the dataset size; the training set holds the rest.
    """
    trainSet = list(dataset)
    # Cap the size so a ratio above 1 cannot ask for more rows than exist.
    testSize = min(int(len(dataset) * splitRatio), len(trainSet))
    testSet = []
    while len(testSet) < testSize:
        index = random.randrange(len(trainSet))
        testSet.append(trainSet.pop(index))
    return [trainSet, testSet]
def separateByClass(dataset):
    """Group the rows of ``dataset`` by their last element (the class value).

    Returns:
        A dict mapping each class value to the list of rows carrying it,
        in their original order.
    """
    separated = {}
    for x in dataset:
        separated.setdefault(x[-1], []).append(x)
    return separated
def compute_mean_std(dataset):
    """Return ``(mean, sample stdev)`` for every attribute column.

    The last column (the class value) is excluded. An empty dataset yields
    an empty list.

    Raises:
        statistics.StatisticsError: if a column has fewer than two values,
            because the sample standard deviation is then undefined.
    """
    # Transpose rows into columns and drop the trailing class column.
    columns = list(zip(*dataset))[:-1]
    return [(st.mean(attribute), st.stdev(attribute)) for attribute in columns]
def summarizeByClass(dataset):
    """Return the per-class attribute statistics of ``dataset``.

    Returns:
        A dict mapping each class value to the list of ``(mean, stdev)``
        pairs computed from the rows of that class.
    """
    separated = separateByClass(dataset)
    return {
        classValue: compute_mean_std(instances)
        for classValue, instances in separated.items()
    }
def estimateProbability(x, mean, stdev):
    """Return the Gaussian density of ``x`` for the given mean and stdev.

    A zero standard deviation (a constant attribute) is treated as a point
    mass: 1.0 when ``x`` equals the mean, 0.0 otherwise. This avoids the
    division by zero the density formula would hit.
    """
    if stdev == 0:
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
def calculateClassProbabilities(summaries, testVector):
    """Return, per class, the product of the attribute densities of ``testVector``.

    Args:
        summaries: dict mapping class value to a list of ``(mean, stdev)``
            pairs, one per attribute.
        testVector: attribute values, in the same order as the summaries.
            Values beyond the summarised attributes are ignored.

    Returns:
        A dict mapping each class value to its score. No class prior is
        applied, so the scores are unnormalised.
    """
    p = {}
    for classValue, classSummaries in summaries.items():
        p[classValue] = 1
        for i, (mean, stdev) in enumerate(classSummaries):
            x = testVector[i]
            p[classValue] *= estimateProbability(x, mean, stdev)
    return p
def predict(summaries, testVector):
    """Return the class value with the highest score for ``testVector``.

    Ties go to the class encountered first; ``None`` is returned when
    ``summaries`` contains no classes.
    """
    all_p = calculateClassProbabilities(summaries, testVector)
    # max() keeps the first of several equal maxima, matching a manual
    # "replace only when strictly greater" scan.
    return max(all_p, key=all_p.get, default=None)
def perform_classification(summaries, testSet):
    """Return the predicted class value for every row of ``testSet``, in order."""
    return [predict(summaries, row) for row in testSet]
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose last element equals its prediction.

    Args:
        testSet: rows whose last element is the true class value.
        predictions: predicted class values, aligned with ``testSet``.

    Returns:
        Accuracy in percent (0.0 to 100.0); 0.0 for an empty test set.
    """
    if not testSet:
        return 0.0  # nothing to score; avoids dividing by zero
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(testSet))) * 100.0
# Load the data and show a short summary.
dataset = loadCsv('data51.csv')
print('Pima Indian Diabetes Dataset loaded...')
print('Total instances available :', len(dataset))
print('Total attributes present :', len(dataset[0]) - 1)
print("First Five instances of dataset:")
# Slicing keeps this safe when the file holds fewer than five rows.
for i, row in enumerate(dataset[:5]):
    print(i + 1, ':', row)

# Hold out a random 20% of the rows for testing.
splitRatio = 0.2
trainingSet, testSet = splitDataset(dataset, splitRatio)
print('\nDataset is split into training and testing set.')
print('Training examples = {0} \nTesting examples = {1}'.format(
    len(trainingSet), len(testSet)))

# Train on the training rows, then score the held-out rows.
summaries = summarizeByClass(trainingSet)
predictions = perform_classification(summaries, testSet)
accuracy = getAccuracy(testSet, predictions)
print('\nAccuracy of the Naive Baysian Classifier is :', accuracy)
Page 10
Date: --/ -- /----
Output:
Page 11
Date: --/ -- /----
6. Assuming a set of documents that need to be classified, use the naïve Bayesian Classifier
model to perform this task. Built-in Java classes/API can be used to write the program.
Calculate the accuracy, precision, and recall for your data set.
Program:
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the labelled messages and map the text labels to integers.
msg = pd.read_csv('data6.csv', names=['message', 'label'])
print('Total instances in the dataset:', msg.shape[0])
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
Y = msg.labelnum

print('\nThe message and its label of first 5 instances are listed below')
X5, Y5 = X[0:5], msg.label[0:5]
for x, y in zip(X5, Y5):
    print(x, ',', y)

# Split into training and testing samples.
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)
print('\nDataset is split into Training and Testing samples')
print('Total training instances :', xtrain.shape[0])
print('Total testing instances :', xtest.shape[0])

# Turn each message into a word-count vector. The vocabulary is learnt from
# the training messages only and then reused for the test messages.
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm = count_vect.transform(xtest)
print('\nTotal features extracted using CountVectorizer:', xtrain_dtm.shape[1])

print('\nFeatures for first 5 training instances are listed below')
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on old releases.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()
df = pd.DataFrame(xtrain_dtm.toarray(), columns=feature_names)
print(df[0:5])

# Train the multinomial naive Bayes classifier and classify the test set.
clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)
print('\nClassstification results of testing samples are given below')
for doc, p in zip(xtest, predicted):
    pred = 'pos' if p == 1 else 'neg'
    print('%s -> %s ' % (doc, pred))

print('\nAccuracy metrics')
print('Accuracy of the classifer is', metrics.accuracy_score(ytest, predicted))
print('Recall :', metrics.recall_score(ytest, predicted),
      '\nPrecison :', metrics.precision_score(ytest, predicted))
Page 12
Date: --/ -- /----
Output:
Page 13
Date: --/ -- /----
8. Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same data set
for clustering using k-Means algorithm. Compare the results of these two algorithms and
comment on the quality of clustering. You can add Java/Python ML library classes/API in
the program.
Program:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

# Load the iris measurements and their true class labels.
iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
X.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']
y = pd.DataFrame(iris.target)
y.columns = ['Targets']

# k-Means clustering on the raw measurements.
model = KMeans(n_clusters=3)
model.fit(X)

plt.figure(figsize=(14, 14))
colormap = np.array(['red', 'lime', 'black'])

# Panel 1: the true classes.
plt.subplot(2, 2, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Real Clusters')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

# Panel 2: k-Means assignments. Each result gets its own panel; drawing them
# all on panel 1 would overplot the points and overwrite the title.
plt.subplot(2, 2, 2)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[model.labels_], s=40)
plt.title('K-Means Clustering')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

# Gaussian mixture (fitted with EM) on the standardised measurements.
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns=X.columns)
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)
gmm_y = gmm.predict(xs)

# Panel 3: mixture-model assignments.
plt.subplot(2, 2, 3)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[gmm_y], s=40)
plt.title('GMM Clustering')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

print('Observation: The GMM using EM algorithm based clustering matched the true labels more closely than the Kmeans.')
# Display the figure when run as a plain script.
plt.show()
Page 14
Date: --/ -- /----
Output:
Page 15
Date: --/ -- /----
9. Write a program to implement k-Nearest Neighbour algorithm to classify the iris data
set. Print both correct and wrong predictions. Java/Python ML library classes can be used
for this problem.
Program:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = datasets.load_iris()
print("Iris Data set loaded...")

# Hold out 10% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.1)
print("Dataset is split into training and testing...")
print("Size of trainng data and its label", x_train.shape, y_train.shape)
# This line reports the TEST split, so it is labelled accordingly.
print("Size of testing data and its label", x_test.shape, y_test.shape)

for i in range(len(iris.target_names)):
    print("Label", i, "-", str(iris.target_names[i]))

# 1-nearest-neighbour classifier.
classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print("Results of Classification using K-nn with K=1 ")
for r in range(0, len(x_test)):
    print(" Sample:", str(x_test[r]), " Actual-label:", str(y_test[r]),
          " Predicted-label:", str(y_pred[r]))
print("Classification Accuracy :", classifier.score(x_test, y_test))
Output:
Page 16
Date: --/ -- /----
10. Implement the non-parametric Locally Weighted Regression algorithm in order to fit
data points. Select appropriate data set for your experiment and draw graphs.
Program:
from numpy import *

import operator
from os import listdir
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np1
import numpy.linalg as np
from scipy.stats.stats import pearsonr
def kernel(point, xmat, k):
    """Return the diagonal Gaussian weight matrix of ``point`` against ``xmat``.

    Args:
        point: query row (1 x n matrix).
        xmat: m x n matrix of training rows.
        k: bandwidth; larger values give distant rows more weight.

    Returns:
        An m x m matrix whose j-th diagonal entry is
        ``exp(-|point - xmat[j]|^2 / (2 * k**2))``; all other entries are 0.
    """
    xmat = np1.asmatrix(xmat)
    m, n = np1.shape(xmat)
    # asmatrix is used because the np.mat alias was removed in NumPy 2.0.
    weights = np1.asmatrix(np1.eye(m))
    for j in range(m):
        # Use the xmat argument, not a module-level X.
        diff = np1.asmatrix(point - xmat[j])
        # diff * diff.T is a 1x1 matrix; .item() extracts the scalar.
        weights[j, j] = np1.exp((diff * diff.T).item() / (-2.0 * k ** 2))
    return weights
def localWeight(point, xmat, ymat, k):
    """Return the locally weighted least-squares coefficients at ``point``.

    Computes ``(X^T W X)^-1 (X^T W y^T)`` with ``X = xmat`` and ``W`` the
    kernel weight matrix of ``point``.

    Args:
        point: query row (1 x n matrix).
        xmat: m x n design matrix (numpy matrix).
        ymat: 1 x m matrix of targets.
        k: kernel bandwidth.
    """
    wei = kernel(point, xmat, k)
    # Use the xmat argument, not a module-level X.
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W
def localWeightRegression(xmat, ymat, k):
    """Return the locally weighted regression prediction for every row of ``xmat``.

    Args:
        xmat: m x n design matrix (numpy matrix).
        ymat: 1 x m matrix of targets.
        k: kernel bandwidth.

    Returns:
        A length-m float array of predictions.
    """
    m, n = np1.shape(xmat)
    ypred = np1.zeros(m)
    for i in range(m):
        # The product is a 1x1 matrix; .item() extracts the scalar.
        ypred[i] = (xmat[i] * localWeight(xmat[i], xmat, ymat, k)).item()
    return ypred
# Load the data: total bill is the input, tip is the target.
data = pd.read_csv('data10.csv')
bill = np1.array(data.total_bill)
tip = np1.array(data.tip)

# Build the design matrix [1, bill] as numpy matrices.
# asmatrix is used because the np.mat alias was removed in NumPy 2.0.
mbill = np1.asmatrix(bill)
mtip = np1.asmatrix(tip)
m = np1.shape(mbill)[1]
one = np1.asmatrix(np1.ones(m))
X = np1.hstack((one.T, mbill.T))

# set k here
ypred = localWeightRegression(X, mtip, 2)

# Order the points by bill amount before plotting the fitted curve.
SortIndex = X[:,1].argsort(0)
xsort = X[SortIndex][:,0]

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(bill,tip, color='green')
ax.plot(xsort[:,1],ypred[SortIndex], color = 'red', linewidth=5)
plt.xlabel('Total bill')
plt.ylabel('Tip')
plt.show()
Page 17
Date: --/ -- /----
Output:
Page 18

15CSL76 Students

Uploaded by

Copyright:

Available Formats

15CSL76 Students

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

15CSL76 Students

Uploaded by

Copyright:

Available Formats

Date: --/ -- /----

h = [['0', '0', '0', '0', '0', '0']]

'Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same',True

Maximally Specific set

def more_general(h1, h2):

def fulfills(example, hypothesis):

def min_generalizations(h, x):

def min_specializations(h, domains, x):

def generalize_S(x, G, S):

S.update([h for h in Splus if

def specialize_G(x, domains, G, S):

G.update([h for h in Gminus if any([more_general(h, s)

print("Maximally specific hypotheses - S ")

with open('c1.csv') as csvFile:

def subtables(data, col, delete):

def compute_gain(data, col):

ratio = len(dic[attValues[x]]) / ( len(data) * 1.0)

def build_tree(data, features):

def print_tree(node, level):

dataset, features = load_csv("data3.csv") # Read Tennis data

4. Build an Artificial Neural Network by implementing the Backpropagation algorithm

import csv, random, math

def splitDataset(dataset, splitRatio):

def estimateProbability(x, mean, stdev):

def calculateClassProbabilities(summaries, testVector):

def predict(summaries, testVector):

def perform_classification(summaries, testSet):

def getAccuracy(testSet, predictions):

print("First Five instances of dataset:")

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB

from sklearn import metrics

import matplotlib.pyplot as plt

from sklearn import preprocessing

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test =

from numpy import *

def kernel(point,xmat, k):

You might also like