ML Lab
ML Lab
1. Implement and demonstrate the FIND-S algorithm for finding the most specific
hypothesis based on a given set of training data samples. Read the training data
from a .CSV file.
trainingdata.csv (in Excel sheet, CSV format)
Sky airTemp humidity wind water forecast enjoySport
Sunny Warm Normal Strong Warm Same Yes
Sunny Warm High Strong Warm Same Yes
Rainy Cold High Strong Warm Change No
Sunny Warm High Strong Cool Change Yes
INPUT
import csv
def find_s(positive_examples, on_step=None):
    """Return the maximally specific hypothesis covering the positive examples.

    Args:
        positive_examples: Sequence of rows. Each row holds the attribute
            values followed by the class label in its last column.
        on_step: Optional callable that receives a copy of the hypothesis
            after each example has been processed (used to print the trace).

    Returns:
        A list with one entry per attribute: the value shared by every
        example, or '?' where the examples disagree. An empty list is
        returned when there are no positive examples.
    """
    if not positive_examples:
        return []
    n_attributes = len(positive_examples[0]) - 1
    # Start from the first positive example, the most specific consistent guess.
    hypothesis = list(positive_examples[0][:n_attributes])
    for example in positive_examples:
        # Keep an attribute only while every example seen so far agrees on it.
        hypothesis = [
            current if current == value else '?'
            for current, value in zip(hypothesis, example[:n_attributes])
        ]
        if on_step is not None:
            on_step(list(hypothesis))
    return hypothesis


def main(path="trainingdata.csv"):
    """Read the training data from ``path`` and print the FIND-S trace."""
    with open(path, newline="") as csvfile:
        # Blank lines carry no label and would break the last-column lookup.
        rows = [row for row in csv.reader(csvfile) if row]

    data = []
    print("\nThe given training examples are:")
    for row in rows:
        print(row)
        # Only positive examples ("Yes" in the last column) drive FIND-S.
        if row[-1].upper() == "YES":
            data.append(row)

    print("\nThe positive examples are:")
    for x in data:
        print(x)
    print("\n")

    if not data:
        print("No positive examples found; no hypothesis can be learned.")
        return

    # '%' marks the initial hypothesis that accepts nothing.
    hypo = ['%'] * (len(data[0]) - 1)
    print("The steps of the Find-s algorithm are :\n", hypo)
    hypo = find_s(data, on_step=print)
    print("\nThe maximally specific Find-s hypothesis for the given training examples is :")
    print(hypo)


if __name__ == "__main__":
    main()
OUTPUT
<_csv.reader object at 0x0000001781862BA8>
2. For a given set of training data examples stored in a .CSV file, implement and
demonstrate the Candidate-Elimination algorithm to output a description of the
set of all hypotheses consistent with the training examples.
trainingdata.csv (in Excel sheet, CSV format)
Sky airTemp humidity wind water forecast enjoySport
Sunny Warm Normal Strong Warm Same Yes
Sunny Warm High Strong Warm Same Yes
Rainy Cold High Strong Warm Change No
Sunny Warm High Strong Cool Change Yes
INPUT
import numpy as np
import pandas as pd
# Read the training examples from disk.
data = pd.read_csv('trainingdata.csv')
print(data)
# Every column except the last one describes the concept.
concepts = np.array(data.iloc[:, :-1])
print(concepts)
# The final column carries the target label of each example.
target = np.array(data.iloc[:, -1])
print(target)
def learn(concepts, target):
    '''
    Run the (simplified) Candidate-Elimination learning step.

    Arguments:
        concepts - array of examples, one row of attribute values per example
        target   - array of labels ("Yes"/"No", case-insensitive), one per row

    Returns:
        (specific_h, general_h): specific_h is a copy of the first example
        generalised with '?' wherever a positive example disagrees;
        general_h is a list of hypotheses, each constraining one attribute,
        with the all-'?' rows removed.

    Raises:
        ValueError: if concepts is empty.

    NOTE(review): the initialisation and update loop were missing from this
    block and have been reconstructed; the result matches the sample output
    listed for the training data in this file - confirm against the original.
    '''
    if len(concepts) == 0:
        raise ValueError("learn() needs at least one training example")

    # Copy so that generalising the hypothesis never edits the caller's data.
    specific_h = concepts[0].copy()
    n_attributes = len(specific_h)
    # One candidate general hypothesis per attribute, initially unconstrained.
    general_h = [['?'] * n_attributes for _ in range(n_attributes)]

    for example, label in zip(concepts, target):
        verdict = str(label).strip().upper()
        if verdict == "YES":
            # Positive example: drop every constraint it contradicts.
            for x in range(n_attributes):
                if example[x] != specific_h[x]:
                    specific_h[x] = '?'
                    general_h[x][x] = '?'
        elif verdict == "NO":
            # Negative example: specialise on the attributes that exclude it.
            for x in range(n_attributes):
                if example[x] != specific_h[x]:
                    general_h[x][x] = specific_h[x]
                else:
                    general_h[x][x] = '?'

    # Rows that are still entirely '?' never constrained anything; drop them.
    general_h = [row for row in general_h if any(value != '?' for value in row)]
    # Return final values
    return specific_h, general_h
# Learn both boundary sets from the loaded examples and show them,
# each under its own heading.
s_final, g_final = learn(concepts, target)
print("\nFinal Specific_h:")
print(s_final)
print("\nFinal General_h:")
print(g_final)
OUTPUT
sky airTemp humidity wind water forecast enjoySport
0 Sunny Warm Normal Strong Warm Same Yes
1 Sunny Warm High Strong Warm Same Yes
2 Rainy Cold High Strong Warm Change No
3 Sunny Warm High Strong Cool Change Yes
Final Specific_h:
['Sunny' 'Warm' '?' 'Strong' '?' '?']
Final General_h:
[['Sunny', '?', '?', '?', '?', '?'], ['?', 'Warm', '?', '?', '?', '?']]
for x in range(items.shape[0]):
for y in range(data.shape[0]):
if data[y, col] == items[x]:
count[x] += 1
for x in range(items.shape[0]):
dict[items[x]] = np.empty((int(count[x]), data.shape[1]), dtype="|
S32")
pos = 0
for y in range(data.shape[0]):
if data[y, col] == items[x]:
dict[items[x]][pos] = data[y]
pos += 1
if delete:
dict[items[x]] = np.delete(dict[items[x]], col, 1)
if items.size == 1:
return 0
for x in range(items.shape[0]):
counts[x] = sum(S == items[x]) / (S.size * 1.0)
total_size = data.shape[0]
entropies = np.zeros((items.shape[0], 1))
intrinsic = np.zeros((items.shape[0], 1))
for x in range(items.shape[0]):
ratio = dict[items[x]].shape[0]/(total_size * 1.0)
entropies[x] = ratio * entropy(dict[items[x]][:, -1])
intrinsic[x] = ratio * math.log(ratio, 2)
total_entropy = entropy(data[:, -1])
iv = -1 * sum(intrinsic)
for x in range(entropies.shape[0]):
total_entropy -= entropies[x]
return total_entropy / iv
def create_node(data, metadata):
    """Recursively build a decision-tree node for ``data``.

    Args:
        data: 2-D array whose last column holds the class label.
        metadata: array of attribute names, indexed like the columns of
            ``data``; the chosen attribute is removed before recursing.

    Returns:
        A ``Node``. When every row carries the same label the node is a leaf
        with ``answer`` set to that label; otherwise it is named after the
        split attribute and holds ``(value, child)`` pairs in ``children``.
    """
    labels = np.unique(data[:, -1])
    if labels.shape[0] == 1:
        # Pure subset: nothing left to split on.
        node = Node("")
        node.answer = labels[0]
        return node
    # NOTE(review): `gains`, `items` and `dict` are not assigned anywhere in
    # the visible part of this function - the statements that compute the
    # per-attribute gains and partition `data` appear to be missing and must
    # be restored before this branch can run.
    split = np.argmax(gains)
    node = Node(metadata[split])
    metadata = np.delete(metadata, split, 0)
    for x in range(items.shape[0]):
        child = create_node(dict[items[x]], metadata)
        node.children.append((items[x], child))
    return node
def empty(size):
    """Return a string made of ``size`` space characters."""
    # Sequence repetition replaces the character-by-character concatenation.
    return " " * size
# scale units
# Divide X by its maximum taken along axis 0 (presumably one maximum per
# feature column - confirm against where X is built).
X = X/np.amax(X, axis=0) # maximum of X array
# Rescale the targets by a fixed factor of 100.
y = y/100 # max test score is 100
class Neural_Network(object):
    """Network with one hidden layer; holds layer sizes and random weights.

    NOTE(review): only the constructor is visible in this part of the file.
    """

    def __init__(self):
        """Set the layer sizes and draw both weight matrices from N(0, 1)."""
        # Parameters
        self.inputSize = 2
        self.outputSize = 1
        self.hiddenSize = 3
        # Weights
        # (2x3) weight matrix from input to hidden layer
        self.W1 = np.random.randn(self.inputSize, self.hiddenSize)
        # (3x1) weight matrix from hidden to output layer
        self.W2 = np.random.randn(self.hiddenSize, self.outputSize)
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.47212874]
[0.42728946]
[0.40891365]]
Loss:
0.20642371917499927
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.55398066]
[0.49831918]
[0.50254468]]
Loss:
0.13830159742519685
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.90738512]
[0.85762296]
[0.90442507]]
Loss:
0.0001242893939220947
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.90738861]
[0.85762488]
[0.90442029]]
Loss:
0.0001242110058786677
..
..
..
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.90939036]
[0.85857836]
[0.90083978]]
Loss:
7.736212176292079e-05
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.90939277]
[0.85857961]
[0.90083551]]
Loss:
7.731308968994962e-05
INPUT
# import necessary libraries
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
# Replace each categorical feature column with integer codes. A separate
# encoder is kept per column.
le_Temperature = LabelEncoder()
X["Temperature"] = le_Temperature.fit_transform(X["Temperature"])

le_Humidity = LabelEncoder()
X["Humidity"] = le_Humidity.fit_transform(X["Humidity"])

le_Windy = LabelEncoder()
X["Windy"] = le_Windy.fit_transform(X["Windy"])
6. Assuming a set of documents that need to be classified, use the naïve Bayesian
Classifier model to perform this task. Built-in Java classes/API can be used to write
the program. Calculate the accuracy, precision, and recall for your data set.
document.csv (in Excel sheet, CSV format)
INPUT
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the labelled documents and map the text labels to 1 (pos) / 0 (neg).
msg = pd.read_csv('document.csv', names=['message', 'label'])
print("Total Instances of Dataset: ", msg.shape[0])
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
y = msg.labelnum

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

# Build the vocabulary from the training split only, then reuse it for the
# test split so both document-term matrices share the same columns.
count_v = CountVectorizer()
Xtrain_dm = count_v.fit_transform(Xtrain)
Xtest_dm = count_v.transform(Xtest)
# get_feature_names() was removed from scikit-learn; use the replacement.
df = pd.DataFrame(Xtrain_dm.toarray(), columns=count_v.get_feature_names_out())
print(df[0:5])

clf = MultinomialNB()
clf.fit(Xtrain_dm, ytrain)
pred = clf.predict(Xtest_dm)
# The predictions were made for the test split, so pair them with the test
# documents (they were previously paired with the training documents).
for doc, p in zip(Xtest, pred):
    p = 'pos' if p == 1 else 'neg'
    print("%s -> %s" % (doc, p))
about am an and awesome bad beers best boss can ... tired to \
0 0 1 0 1 0 0 0 0 0 0 ... 1 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0
3 0 0 0 0 0 0 0 0 0 1 ... 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0
[5 rows x 49 columns]
Accuracy Metrics:
Accuracy: 0.6
Recall: 0.5
Precision: 1.0
Confusion Matrix:
[[1 0]
[2 2]]
import pandas as pd
# Load the heart-disease records and echo them.
data = pd.read_csv("heartdisease.csv")
heart_disease = pd.DataFrame(data)
print(heart_disease)

from pgmpy.models import BayesianModel

# Directed edges (parent, child) of the network. The edge
# ('diet', 'cholestrol') was listed twice; the duplicate is dropped.
model = BayesianModel([
    ('age', 'Lifestyle'),
    ('Gender', 'Lifestyle'),
    ('Family', 'heartdisease'),
    ('diet', 'cholestrol'),
    ('Lifestyle', 'diet'),
    ('cholestrol', 'heartdisease'),
])
# Ask for each evidence variable in turn (same order and prompts as before),
# then query the distribution of 'heartdisease' given that evidence.
prompts = (
    ('age', 'Enter age :'),
    ('Gender', 'Enter Gender :'),
    ('Family', 'Enter Family history :'),
    ('diet', 'Enter diet :'),
    ('Lifestyle', 'Enter Lifestyle :'),
    ('cholestrol', 'Enter cholestrol :'),
)
evidence = {name: int(input(prompt)) for name, prompt in prompts}
q = HeartDisease_infer.query(variables=['heartdisease'], evidence=evidence)
print(q['heartdisease'])
OUTPUT
age Gender Family diet Lifestyle cholestrol heartdisease
0 0 0 1 1 3 0 1
1 0 1 1 1 3 0 1
2 1 0 0 0 2 1 1
3 4 0 1 1 3 2 0
4 3 1 1 0 0 2 0
5 2 0 1 1 1 0 1
6 4 0 1 0 2 0 1
7 0 0 1 1 3 0 1
8 3 1 1 0 0 2 0
9 1 1 0 0 0 2 1
10 4 1 0 1 2 0 1
11 4 0 1 1 3 2 0
12 2 1 0 0 0 0 0
13 2 0 1 1 1 0 1
14 3 1 1 0 0 1 0
15 0 0 1 0 0 2 1
16 1 1 0 1 2 1 1
17 3 1 1 1 0 1 0
18 4 0 1 1 3 2 0
8. Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same
data set for clustering using k-Means algorithm. Compare the results of these two
algorithms and comment on the quality of clustering. You can add Java/Python ML
library classes/API in the program.
INPUT
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture
from sklearn.datasets import load_iris
import sklearn.metrics as sm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the iris measurements and labels into labelled data frames.
dataset = load_iris()
X = pd.DataFrame(
    dataset.data,
    columns=['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'],
)
y = pd.DataFrame(dataset.target, columns=['Targets'])

plt.figure(figsize=(14, 7))
colormap = np.array(['red', 'lime', 'black'])

# Panel 1: the true classes.
plt.subplot(1, 3, 1)
plt.scatter(X['Petal_Length'], X['Petal_Width'], c=colormap[y['Targets']], s=40)
plt.title('Real')

# Panel 2: k-means clusters on the raw features.
plt.subplot(1, 3, 2)
model = KMeans(n_clusters=3)
model.fit(X)
# The cluster labels are used as colour indices as-is.
predY = model.labels_.astype(np.int64)
plt.scatter(X['Petal_Length'], X['Petal_Width'], c=colormap[predY], s=40)
plt.title('KMeans')

# Panel 3: Gaussian mixture (EM) clusters on standardised features.
scaler = preprocessing.StandardScaler()
xsa = scaler.fit(X).transform(X)
xs = pd.DataFrame(xsa, columns=X.columns)
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)
y_cluster_gmm = gmm.predict(xs)
plt.subplot(1, 3, 3)
plt.scatter(X['Petal_Length'], X['Petal_Width'], c=colormap[y_cluster_gmm], s=40)
plt.title('GMM Classification')
OUTPUT
Text(0.5, 1.0, 'GMM Classification')
9. Write a program to implement k-Nearest Neighbour algorithm to classify the
iris data set. Print both correct and wrong predictions. Java/Python ML library
classes can be used for this problem.
INPUT
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
# Split the iris data, fit a 1-nearest-neighbour classifier and report each
# test sample's true and predicted class, followed by the overall accuracy.
dataset = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    dataset["data"], dataset["target"], random_state=0
)
kn = KNeighborsClassifier(n_neighbors=1)
kn.fit(X_train, y_train)
for x, target in zip(X_test, y_test):
    # predict() expects a 2-D array, so wrap the single sample.
    x_new = np.array([x])
    prediction = kn.predict(x_new)
    print(
        "TARGET=", target, dataset["target_names"][target],
        "PREDICTED=", prediction, dataset["target_names"][prediction],
    )
print(kn.score(X_test, y_test))
OUTPUT
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=1, p=2,
weights='uniform')
residuals = y - yest
s = np.median(np.abs(residuals))
delta = np.clip(residuals / (6.0 * s), -1, 1)
delta = (1 - delta ** 2) ** 2
return yest
import math
# Smoothing settings passed to lowess().
f = 0.25
iterations = 3

# Noisy sine wave sampled at n evenly spaced points on [0, 2*pi].
n = 100
x = np.linspace(0, 2 * math.pi, n)
y = 0.3 * np.random.randn(n) + np.sin(x)

yest = lowess(x, y, f, iterations)
OUTPUT
[<matplotlib.lines.Line2D at 0x37459696d8>]