B.TECH Machine Learning-Lab
Practical file
of
Basics of Machine Learning
SUBMITTED TO :-
SUBMITTED BY :-
Program-1
Implement and demonstrate the FIND-S algorithm for finding the most specific hypothesis
based on a given set of training data samples. Read the training data from a .CSV file.
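The tennis.csv file itself is not reproduced in this file. From the code below it is read as plain comma-separated rows, attribute values first and a Boolean target ("True"/"False") last, so a file of roughly this shape is assumed (the rows here are illustrative only):

sunny,warm,normal,strong,warm,same,True
sunny,warm,high,strong,warm,same,True
rainy,cold,high,strong,warm,change,False
sunny,warm,high,strong,cool,change,True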
import csv

# read the training data from the CSV file
with open('tennis.csv', 'r') as f:
    reader = csv.reader(f)
    your_list = list(reader)

# start from the most specific hypothesis: every attribute constrained to '0'
h = ['0', '0', '0', '0', '0', '0']

for i in your_list:
    if i[-1] == "True":          # FIND-S only looks at positive examples
        j = 0
        for x in i:
            if x != "True":      # skip the target column itself
                if h[j] == '0':
                    h[j] = x     # first positive example fixes the value
                elif x != h[j]:
                    h[j] = '?'   # disagreement generalises this attribute
            else:
                pass
            j = j + 1

print("Most specific hypothesis is")
print(h)
Output
Program-2
For a given set of training data examples stored in a .CSV file, implement and demonstrate
the Candidate-Elimination algorithm to output a description of the set of all hypotheses
consistent with the training examples.
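The listing below expects the training examples as a Python list of (attribute-tuple, label) pairs and expects each attribute's possible values to be registered on the Holder object. That setup is not shown in this file, so a minimal sketch of the assumed shape is given here; the example values are inferred from the EnjoySport-style output printed at the end of this program and are not part of the original listing.

# assumed input format for the Candidate-Elimination listing below
dataset = [(('sunny','warm','normal','strong','warm','same'), 'Y'),   # 'Y' = positive example
           (('rainy','cold','high','strong','warm','change'), 'N')]   # 'N' = negative example
# each attribute is also registered with its possible values, e.g.
# f.add_values('Sky', ('sunny', 'rainy', 'cloudy'))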
class Holder:
    factors = {}      # maps each attribute name to the values it may take
    attributes = ()   # tuple of attribute names
    '''Holds the attribute names and the candidate values of every attribute.'''
    def __init__(self, attr):
        self.attributes = attr
        for i in attr:
            self.factors[i] = []
    def add_values(self, factor, values):
        self.factors[factor] = values
class CandidateElimination:
    Positive = {}  # Initialize positive empty dictionary
    Negative = {}  # Initialize negative empty dictionary

    def __init__(self, data, fact):
        self.num_factors = len(data[0][0])
        self.factors = fact.factors
        self.attr = fact.attributes
        self.dataset = data

    def run_algorithm(self):
        '''
        Initialize the specific and general boundaries, and loop the dataset against the
        algorithm
        '''
        G = self.initializeG()
        S = self.initializeS()
        count = 0
        for trial_set in self.dataset:
            if self.is_positive(trial_set):  # positive training example
                G = self.remove_inconsistent_G(G, trial_set[0])
                S_new = S[:]
                for s in S:
                    if not self.consistent(s, trial_set[0]):
                        S_new.remove(s)
                        generalization = self.generalize_inconsistent_S(s, trial_set[0])
                        generalization = self.get_general(generalization, G)
                        if generalization:
                            S_new.append(generalization)
                S = S_new[:]
                S = self.remove_more_general(S)
                print(S)
            else:  # if it is negative
                S = self.remove_inconsistent_S(S, trial_set[0])
                G_new = G[:]  # copy the general boundary (dataset can take any value)
                for g in G:
                    if self.consistent(g, trial_set[0]):
                        G_new.remove(g)
                        specializations = self.specialize_inconsistent_G(g, trial_set[0])
                        specializations = self.get_specific(specializations, S)
                        if specializations != []:
                            G_new += specializations
                G = G_new[:]
                G = self.remove_more_specific(G)
                print(G)
        print(S)
        print(G)
    def initializeS(self):
        '''Initialize the specific boundary with the most specific hypothesis'''
        S = tuple(['-' for factor in range(self.num_factors)])
        return [S]
    def initializeG(self):
        '''Initialize the general boundary with the most general hypothesis'''
        G = tuple(['?' for factor in range(self.num_factors)])
        return [G]
    def is_positive(self, trial_set):
        if trial_set[1] == 'Y':
            return True
        else:
            return False
    def match_factor(self, value1, value2):
        if value1 == '?' or value2 == '?':
            return True
        elif value1 == value2:
            return True
        return False
    def consistent(self, hypothesis, instance):
        for i, factor in enumerate(hypothesis):
            if not self.match_factor(factor, instance[i]):
                return False
        return True
    def remove_inconsistent_G(self, hypotheses, instance):
        '''For a positive trial_set, the hypotheses in G inconsistent with it are removed'''
        G_new = hypotheses[:]
        for g in hypotheses:
            if not self.consistent(g, instance):
                G_new.remove(g)
        return G_new
    def remove_inconsistent_S(self, hypotheses, instance):
        '''For a negative trial_set, the hypotheses in S consistent with it are removed'''
        S_new = hypotheses[:]
        for s in hypotheses:
            if self.consistent(s, instance):
                S_new.remove(s)
        return S_new
    def remove_more_general(self, hypotheses):
        S_new = hypotheses[:]
        for old in hypotheses:
            for new in S_new:
                if old != new and self.more_general(new, old):
                    S_new.remove(new)
        return S_new
    def remove_more_specific(self, hypotheses):
        G_new = hypotheses[:]
        for old in hypotheses:
            for new in G_new:
                if old != new and self.more_specific(new, old):
                    G_new.remove(new)
        return G_new
    def generalize_inconsistent_S(self, hypothesis, instance):
        '''When an inconsistent hypothesis for a positive trial_set is seen in the specific
        boundary S, it should be generalized to be consistent with the trial_set;
        we get back one hypothesis'''
        hypo = list(hypothesis)   # convert tuple to list for mutability
        for i, factor in enumerate(hypo):
            if factor == '-':
                hypo[i] = instance[i]
            elif not self.match_factor(factor, instance[i]):
                hypo[i] = '?'
        return tuple(hypo)        # convert back to an immutable tuple
    def specialize_inconsistent_G(self, hypothesis, instance):
        '''When an inconsistent hypothesis for a negative trial_set is seen in the general
        boundary G, it should be specialized to be consistent with the trial_set;
        we get back a set of hypotheses'''
        specializations = []
        hypo = list(hypothesis)   # convert tuple to list for mutability
        for i, factor in enumerate(hypo):
            if factor == '?':
                values = self.factors[self.attr[i]]
                for j in values:
                    if instance[i] != j:
                        hyp = hypo[:]
                        hyp[i] = j
                        specializations.append(tuple(hyp))
        return specializations
    def get_general(self, generalization, G):
        for g in G:
            if self.more_general(g, generalization):
                return generalization
        return None
    def get_specific(self, specializations, S):
        valid_specializations = []
        for hypo in specializations:
            for s in S:
                if self.more_specific(s, hypo) or s == self.initializeS()[0]:
                    valid_specializations.append(hypo)
        return valid_specializations
    def exists_general(self, hypothesis, G):
        '''Used to check if there exists a more general hypothesis in the
        general boundary of the version space'''
        for g in G:
            if self.more_general(g, hypothesis):
                return True
        return False
    def exists_specific(self, hypothesis, S):
        '''Used to check if there exists a more specific hypothesis in the
        specific boundary of the version space'''
        for s in S:
            if self.more_specific(s, hypothesis):
                return True
        return False
    def more_general(self, hyp1, hyp2):
        '''Check whether hyp1 is more general than hyp2'''
        hyp = zip(hyp1, hyp2)
        for i, j in hyp:
            if i == '?':
                continue
            elif j == '?':
                if i != '?':
                    return False
            elif i != j:
                return False
            else:
                continue
        return True
    def more_specific(self, hyp1, hyp2):
        '''hyp1 is more specific than hyp2 exactly when hyp2 is more general than hyp1'''
        return self.more_general(hyp2, hyp1)
attributes = ('Sky','Temp','Humidity','Wind','Water','Forecast')
f = Holder(attributes)
a = CandidateElimination(dataset, f)  # pass the dataset to the algorithm class and call the run algorithm method
a.run_algorithm()
Output
[('sunny', 'warm', 'normal', 'strong', 'warm', 'same')]
[('sunny', 'warm', 'normal', 'strong', 'warm','same')]
[('sunny', 'warm', '?', 'strong', 'warm', 'same')]
[('?', '?', '?', '?', '?', '?')]
[('sunny', '?', '?', '?', '?', '?'), ('?', 'warm', '?', '?', '?', '?'), ('?', '?', '?', '?', '?', 'same')]
[('sunny', 'warm', '?', 'strong', 'warm', 'same')]
[('sunny', 'warm', '?', 'strong', '?', '?')]
[('sunny', 'warm', '?', 'strong', '?', '?')]
[('sunny', '?', '?', '?', '?', '?'), ('?', 'warm', '?', '?', '?', '?')]
Program-3
Write a program to demonstrate the working of the decision tree based ID3
algorithm. Use an appropriate data set for building the decision tree and apply this
knowledge to classify a new sample.
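The heart of ID3 is picking the attribute with the greatest information gain at every node. As a quick standalone illustration (the labels and the split are made up for the example, the helper names are not from the listing below):

import math
from collections import Counter

def toy_entropy(labels):
    # H = -sum(p * log2(p)) over the class proportions
    total = len(labels)
    return -sum((c/total) * math.log2(c/total) for c in Counter(labels).values())

parent = ['yes']*9 + ['no']*5
branches = [['yes']*2 + ['no']*3, ['yes']*4, ['yes']*3 + ['no']*2]   # sunny, overcast, rain
gain = toy_entropy(parent) - sum(len(b)/len(parent) * toy_entropy(b) for b in branches)
print(round(gain, 3))   # information gain of splitting on "outlook" (about 0.247)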
import numpy as np
import math
class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""
def subtables(data, col, delete):
    # group the rows of data by each distinct value found in column col
    dict = {}
    items = np.unique(data[:, col])
    for x in range(items.shape[0]):
        rows = [data[y] for y in range(data.shape[0]) if data[y, col] == items[x]]
        dict[items[x]] = np.array(rows)
        if delete:
            dict[items[x]] = np.delete(dict[items[x]], col, 1)
    return items, dict
def entropy(S):
    items = np.unique(S)
    if items.size == 1:
        return 0
    # -sum(p * log2 p) over the proportions of the distinct class values in S
    counts = [np.sum(S == item) / S.size for item in items]
    return sum(-p * math.log(p, 2) for p in counts)
def gain_ratio(data, col):
    items, dict = subtables(data, col, delete=False)
    total_size = data.shape[0]
    total_entropy = entropy(data[:, -1])
    iv = 0
    for x in range(items.shape[0]):
        ratio = dict[items[x]].shape[0] / (total_size * 1.0)
        total_entropy -= ratio * entropy(dict[items[x]][:, -1])
        iv += -1 * ratio * math.log(ratio, 2)
    return total_entropy / iv
def create_node(data, metadata):
    # a leaf is returned when every remaining example carries the same label
    if (np.unique(data[:, -1])).shape[0] == 1:
        node = Node("")
        node.answer = np.unique(data[:, -1])[0]
        return node
    gains = np.zeros((data.shape[1] - 1, 1))
    for col in range(data.shape[1] - 1):
        gains[col] = gain_ratio(data, col)
    split = np.argmax(gains)
    node = Node(metadata[split])
    metadata = np.delete(metadata, split, 0)
    items, dict = subtables(data, split, delete=True)
    for x in range(items.shape[0]):
        child = create_node(dict[items[x]], metadata)
        node.children.append((items[x], child))
    return node
def empty(size):
    s = ""
    for x in range(size):
        s += "   "
    return s
def print_tree(node, level):
    if node.answer != "":
        print(empty(level), node.answer)
        return
    print(empty(level), node.attribute)
    for value, n in node.children:
        print(empty(level + 1), value)
        print_tree(n, level + 2)
Data_loader.py
import csv
def read_data(filename):
    with open(filename, 'r') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',')
        metadata = next(datareader)   # the first row holds the attribute names
        traindata = []
        for row in datareader:
            traindata.append(row)
    return (metadata, traindata)
outlook,temperature,humidity,wind,answer
sunny,hot,high,weak,no
sunny,hot,high,strong,no
overcast,hot,high,weak,yes
rain,mild,high,weak,yes
rain,cool,normal,weak,yes
rain,cool,normal,strong,no
overcast,cool,normal,strong,yes
sunny,mild,high,weak,no
sunny,cool,normal,weak,yes
rain,mild,normal,weak,yes
sunny,mild,normal,strong,yes
overcast,mild,high,strong,yes
overcast,hot,normal,weak,yes
rain,mild,high,strong,no
Output
outlook
   overcast
      b'yes'
   rain
      wind
         b'strong'
            b'no'
         b'weak'
            b'yes'
   sunny
      humidity
         b'high'
            b'no'
         b'normal'
            b'yes'
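The problem statement also asks that the learned tree classify a new sample, which the listing above stops short of. A hedged sketch of such a helper, written against the Node structure reconstructed above (node.children assumed to hold (attribute-value, subtree) pairs; the sample and column names are illustrative):

def classify(node, sample, metadata):
    # walk the tree until a leaf (a node carrying an answer) is reached
    if node.answer != "":
        return node.answer
    pos = list(metadata).index(node.attribute)   # which column this node tests
    for value, child in node.children:
        if sample[pos] == value:
            return classify(child, sample, metadata)
    return None   # unseen attribute value

# e.g. classify(root, ['sunny', 'cool', 'normal', 'strong'], ['outlook', 'temperature', 'humidity', 'wind'])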
Program-4
Build an Artificial Neural Network by implementing the Backpropagation algorithm and test
the same using appropriate data sets.
import numpy as np
X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
X = X/np.amax(X, axis=0)   # normalise the input features column-wise
y = y/100                  # scale the target marks into the 0-1 range
# Sigmoid function and its derivative (the derivative takes an already-activated value)
def sigmoid(x):
    return 1/(1 + np.exp(-x))
def derivatives_sigmoid(x):
    return x * (1 - x)

# Variable initialization
epoch = 5000   # number of training iterations (typical value; not shown in the original listing)
lr = 0.1       # learning rate (typical value; not shown in the original listing)
inputlayer_neurons = 2
hiddenlayer_neurons = 3
output_neurons = 1
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))
# Forward Propagation
hinp1 = np.dot(X, wh)
hinp = hinp1 + bh
hlayer_act = sigmoid(hinp)
outinp1 = np.dot(hlayer_act, wout)
outinp = outinp1 + bout
output = sigmoid(outinp)

# Backpropagation
EO = y - output                              # error at the output layer
outgrad = derivatives_sigmoid(output)
d_output = EO * outgrad
EH = d_output.dot(wout.T)                    # error propagated back to the hidden layer
hiddengrad = derivatives_sigmoid(hlayer_act)
d_hiddenlayer = EH * hiddengrad
wout += hlayer_act.T.dot(d_output) * lr      # gradient-descent weight updates
wh += X.T.dot(d_hiddenlayer) * lr
print("Input:\n" + str(X))
print("Actual Output:\n" + str(y))
print("Predicted Output:\n", output)
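The listing above shows a single forward and backward pass; in practice the same steps repeat for many iterations. A minimal sketch of the usual training loop around them, using the epoch and lr values initialised earlier (an assumption, since the loop is not in the original listing):

# sketch only: repeat the forward and backward pass for `epoch` iterations
for i in range(epoch):
    hlayer_act = sigmoid(np.dot(X, wh) + bh)                           # forward pass
    output = sigmoid(np.dot(hlayer_act, wout) + bout)
    d_output = (y - output) * derivatives_sigmoid(output)              # backward pass
    d_hidden = d_output.dot(wout.T) * derivatives_sigmoid(hlayer_act)
    wout += hlayer_act.T.dot(d_output) * lr                            # gradient-descent updates
    wh += X.T.dot(d_hidden) * lr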
Output
Input:
[[ 0.66666667  1.        ]
 [ 0.33333333  0.55555556]
 [ 1.          0.66666667]]
Actual Output:
[[ 0.92]
[ 0.86]
[ 0.89]]
Predicted Output:
[[ 0.89559591]
[ 0.88142069]
[ 0.8928407 ]]
Program-5
Write a program to implement the naïve Bayesian classifier for a sample training data set
stored as a .CSV file. Compute the accuracy of the classifier, considering a few test data
sets.
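The classifier below rests on the normal (Gaussian) density of each attribute value given a class; naive Bayes multiplies one such density per attribute, per candidate class, and predicts the class with the largest product. A quick standalone illustration (the function name and sample numbers are illustrative only):

import math

def gaussian_density(x, mu, sigma):
    # density of a normal distribution with mean mu and standard deviation sigma
    return (1.0 / (math.sqrt(2 * math.pi) * sigma)) * math.exp(-((x - mu) ** 2) / (2 * sigma ** 2))

print(gaussian_density(71.5, 73.0, 6.2))   # likelihood of observing 71.5 under this attribute model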
import csv, random, math
def loadCsv(filename):
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        # generate indices for the dataset list randomly to pick elements for training data
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]
def separateByClass(dataset):
    separated = {}
    # creates a dictionary of classes where the values are the instances belonging to each class
    for i in range(len(dataset)):
        vector = dataset[i]
        if vector[-1] not in separated:
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated
def mean(numbers):
    return sum(numbers)/float(len(numbers))
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers])/float(len(numbers) - 1)
    return math.sqrt(variance)
def summarize(dataset):
    # (mean, stdev) for every attribute column, dropping the class column
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries
def calculateProbability(x, mean, stdev):
    # normal (Gaussian) density of x for an attribute with the given mean and stdev
    exponent = math.exp(-(math.pow(x-mean, 2)/(2*math.pow(stdev, 2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            m, s = classSummaries[i]
            probabilities[classValue] *= calculateProbability(inputVector[i], m, s)  # normal dist
    return probabilities
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:   # keep the class with the highest prob
            bestProb = probability
            bestLabel = classValue
    return bestLabel
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        predictions.append(predict(summaries, testSet[i]))
    return predictions
def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0
def main():
    filename = '5data.csv'
    splitRatio = 0.67
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%'.format(accuracy))

main()
Output
confusion matrix is as follows
[[17 0 0]
[ 0 17 0]
[ 0 0 11]]
Accuracy metrics
Program-6
Assuming a set of documents that need to be classified, use the naïve Bayesian Classifier
model to perform this task. Built-in Java classes/API can be used to write the program.
Calculate the accuracy, precision, and recall for your data set.
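The naivetext1.csv file itself is not reproduced here. From the code below it is read without a header row, one labelled sentence per line with a pos/neg tag, so it presumably looks like this (the example rows are illustrative only, loosely matched to the vocabulary in the output):

I love this sandwich,pos
This is an awesome place,pos
I am tired of this stuff,neg
This is a horrible restaurant,neg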
import pandas as pd
msg = pd.read_csv('naivetext1.csv', names=['message', 'label'])
print('The dimensions of the dataset', msg.shape)
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
y = msg.labelnum
print(X)
print(y)

# split the data into training and test sets
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
print(xtest.shape)
print(xtrain.shape)
print(ytest.shape)
print(ytrain.shape)
# convert the text documents into a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm = count_vect.transform(xtest)
print(count_vect.get_feature_names())
df=pd.DataFrame(xtrain_dtm.toarray(),columns=count_vect.get_feature_names())
print(df)#tabular representation
print(xtrain_dtm) #sparse matrix representation
# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(xtrain_dtm,ytrain)
predicted = clf.predict(xtest_dtm)
from sklearn import metrics
print('Accuracy of the classifier is', metrics.accuracy_score(ytest, predicted))
print(metrics.confusion_matrix(ytest, predicted))
print('Recall and Precision')
print(metrics.recall_score(ytest, predicted))
print(metrics.precision_score(ytest, predicted))
'''docs_new = ['I like this place', 'My boss is not my saviour']
X_new_counts = count_vect.transform(docs_new)
predictednew = clf.predict(X_new_counts)'''
Output
['about', 'am', 'amazing', 'an', 'and', 'awesome', 'beers', 'best', 'boss', 'can', 'deal',
'do', 'enemy', 'feel', 'fun', 'good', 'have', 'horrible', 'house', 'is', 'like', 'love', 'my',
'not', 'of', 'place', 'restaurant', 'sandwich', 'sick', 'stuff', 'these', 'this', 'tired', 'to',
'today', 'tomorrow', 'very', 'view', 'we', 'went', 'what', 'will', 'with', 'work']
[document-term count matrix of the training messages, one row per document and one column per word above]
Program-7
Write a program to construct a Bayesian network considering medical data. Use this model
to demonstrate the diagnosis of heart patients using the standard Heart Disease Data Set.
You can use Java/Python ML library classes/API.
import numpy as np
import csv
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
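The listing stops after the imports, so a minimal sketch of how the model could be built and queried with these pgmpy classes follows. The file name heart.csv, the chosen edges, and the column names (age, sex, trestbps, fbs, chol, restecg, heartdisease) are assumptions patterned on the UCI heart-disease attributes, not taken from the original file.

# hedged sketch only: file name, edges and column names are assumed
heartDisease = pd.read_csv('heart.csv')
heartDisease = heartDisease.replace('?', np.nan)   # the UCI file marks missing values with '?'

# a small hand-chosen structure over a few of the attributes
model = BayesianModel([('age', 'trestbps'), ('sex', 'trestbps'),
                       ('trestbps', 'heartdisease'), ('fbs', 'heartdisease'),
                       ('heartdisease', 'restecg'), ('heartdisease', 'chol')])

# learn the conditional probability tables from the data
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)

# query the network for heart disease given some evidence
HeartDisease_infer = VariableElimination(model)
q = HeartDisease_infer.query(variables=['heartdisease'], evidence={'age': 37, 'sex': 0})
print(q)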
Output
Program-8
Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same data
set for clustering using the k-Means algorithm. Compare the results of these two
algorithms and comment on the quality of clustering. You can add Java/Python
ML library classes/API in the program.
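Before the plotting-heavy listing below, here is a compact sketch of the comparison the exercise asks for: clustering the same points with EM (GaussianMixture) and with k-Means and printing both label vectors. The file name driver_data.csv and the column names mirror the driver data listed at the end of this program and are assumptions.

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

df = pd.read_csv('driver_data.csv')   # assumed file name for the driver data below
data = df[['Distance_Feature', 'Speeding_Feature']].values
km_labels = KMeans(n_clusters=3, random_state=0).fit_predict(data)              # hard assignments
em_labels = GaussianMixture(n_components=3, random_state=0).fit_predict(data)   # EM-fitted mixture model
print(km_labels[:10])
print(em_labels[:10])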
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, y_true = make_blobs(n_samples=400, centers=4, cluster_std=0.60, random_state=0)
X = X[:, ::-1]   # flip axes for better plotting
gmm = GaussianMixture(n_components=4).fit(X)
probs = gmm.predict_proba(X)   # EM responsibilities: probability of each cluster for each point
print(probs[:5].round(3))
from matplotlib.patches import Ellipse
def draw_ellipse(position, covariance, ax=None, **kwargs):
    '''Draw an ellipse with a given position and covariance'''
    ax = ax or plt.gca()
    # Convert covariance to principal axes
    if covariance.shape == (2, 2):
        U, s, Vt = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        angle = 0
        width, height = 2 * np.sqrt(covariance)
    # draw the ellipse at a few spreads of sigma
    for nsig in range(1, 4):
        ax.add_patch(Ellipse(position, nsig * width, nsig * height, angle=angle, **kwargs))
def plot_gmm(gmm, X, label=True, ax=None):
    ax = ax or plt.gca()
    labels = gmm.fit(X).predict(X)
    if label:
        ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
    else:
        ax.scatter(X[:, 0], X[:, 1], s=40, zorder=2)
    # draw an ellipse for every mixture component, scaled by its weight
    w_factor = 0.2 / gmm.weights_.max()
    for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
        draw_ellipse(pos, covar, alpha=w * w_factor)

plot_gmm(gmm, X)
Output
[[1 ,0, 0, 0]
[0 ,0, 1, 0]
[1 ,0, 0, 0]
[1 ,0, 0, 0]
[1 ,0, 0, 0]]
K-means
from sklearn.cluster import KMeans
import pandas as pd

df1 = pd.read_csv('driver_data.csv')   # file name assumed; the data is listed after the code
print(df1)
f1 = df1['Distance_Feature'].values
f2 = df1['Speeding_Feature'].values
X = np.matrix(list(zip(f1, f2)))
# scatter plot of the raw data
plt.plot()
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.scatter(f1, f2)
plt.show()
# KMeans algorithm with K = 3
kmeans_model = KMeans(n_clusters=3).fit(X)
plt.plot()
colors = ['b', 'g', 'r']
markers = ['o', 'v', 's']
for i, l in enumerate(kmeans_model.labels_):
    plt.plot(f1[i], f2[i], color=colors[l], marker=markers[l], ls='None')
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.show()
Driver_ID,Distance_Feature,Speeding_Feature
3423311935,71.24,28
3423313212,52.53,25
3423313724,64.54,27
3423311373,55.69,22
3423310999,54.58,25
3423313857,41.91,10
3423312432,58.64,20
3423311434,52.02,8
3423311328,31.25,34
3423312488,44.31,19
3423311254,49.35,40
3423312943,58.07,45
3423312536,44.22,22
3423311542,55.73,19
3423312176,46.63,43
3423314176,52.97,32
3423314202,46.25,35
3423311346,51.55,27
3423310666,57.05,26
3423313527,58.45,30
3423312182,43.42,23
3423313590,55.68,37
3423312268,55.15,18
Program-9
Write a program to implement k-Nearest Neighbour algorithm to classify the iris
data set. Print both correct and wrong predictions. Java/Python ML library classes
can be used for this problem.
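The statement allows library classes, so alongside the from-scratch listing below, a minimal scikit-learn version of the same task (printing correct and wrong predictions) could look like this sketch:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
xtrain, xtest, ytrain, ytest = train_test_split(iris.data, iris.target, random_state=0)
knn = KNeighborsClassifier(n_neighbors=3).fit(xtrain, ytrain)
for features, actual, predicted in zip(xtest, ytest, knn.predict(xtest)):
    tag = "correct" if actual == predicted else "wrong"
    print(tag, features, "predicted:", iris.target_names[predicted], "actual:", iris.target_names[actual])
print("Accuracy:", knn.score(xtest, ytest))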
import csv, random, math, operator

def loadDataset(filename, split, trainingSet=[], testSet=[]):
    with open(filename, 'r') as csvfile:
        lines = csv.reader(csvfile)
        dataset = list(lines)
        for x in range(len(dataset) - 1):
            for y in range(4):
                dataset[x][y] = float(dataset[x][y])
            if random.random() < split:
                trainingSet.append(dataset[x])
            else:
                testSet.append(dataset[x])
def euclideanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)

def getNeighbors(trainingSet, testInstance, k):
    distances = []
    length = len(testInstance) - 1
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distances.append((trainingSet[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]
def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0
def main():
    # prepare data
    trainingSet = []
    testSet = []
    split = 0.67
    loadDataset('iris.csv', split, trainingSet, testSet)   # file name of the iris data is assumed
    # generate predictions
    predictions = []
    k = 3
    for x in range(len(testSet)):
        neighbors = getNeighbors(trainingSet, testSet[x], k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('> predicted=' + repr(result) + ', actual=' + repr(testSet[x][-1]))
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: ' + repr(accuracy) + '%')

main()
OUTPUT
Confusion matrix is as follows
[[11 0 0]
[0 9 1]
[0 1 8]]
Accuracy metrics
Program-10
Implement the non-parametric Locally Weighted Regression algorithm in order to fit data
points. Select an appropriate data set for your experiment and draw graphs.
import numpy as np1

def kernel(point, xmat, k):
    m, n = np1.shape(xmat)
    weights = np1.mat(np1.eye((m)))
    for j in range(m):
        diff = point - xmat[j]
        weights[j, j] = np1.exp(diff*diff.T/(-2.0*k**2))
    return weights
def localWeight(point, xmat, ymat, k):
    wei = kernel(point, xmat, k)
    W = (xmat.T*(wei*xmat)).I*(xmat.T*(wei*ymat.T))
    return W
def localWeightRegression(xmat,ymat,k):
m,n = np1.shape(xmat)
ypred = np1.zeros(m)
for i in range(m):
ypred[i] = xmat[i]*localWeight(xmat[i],xmat,ymat,k)
return ypred
import pandas as pd
data = pd.read_csv('data10.csv')
bill = np1.array(data.total_bill)
tip = np1.array(data.tip)
mbill = np1.mat(bill)            # convert the 1-D arrays to matrices
mtip = np1.mat(tip)
m = np1.shape(mbill)[1]
one = np1.mat(np1.ones(m))
X = np1.hstack((one.T, mbill.T))   # prepend the intercept column
#set k here
ypred = localWeightRegression(X,mtip,2)
SortIndex = X[:,1].argsort(0)
xsort = X[SortIndex][:,0]
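The listing ends before any graph is drawn; a minimal matplotlib sketch of the scatter-plus-fitted-curve plot it presumably produced follows, using the variables defined above (the colours, labels, and figure layout are assumptions).

import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(bill, tip, color='green')                                   # raw data points
ax.plot(xsort[:, 1], ypred[SortIndex], color='red', linewidth=2)       # locally weighted fit
plt.xlabel('Total bill')
plt.ylabel('Tip')
plt.show()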
Output