Lab Programs Manual
1)Find-S algorithm
Code:
import csv
def loadCsv(filename):
    # read the CSV file into a list of rows
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    return dataset
attributes = ['Sky', 'Temp', 'Humidity', 'Wind', 'Water', 'Forecast']
print('Attributes =', attributes)
num_attributes = len(attributes)
filename = "ENJOYSPORT.csv"
dataset = loadCsv(filename)
print(dataset)
# start with the most specific hypothesis
hypothesis = ['0'] * num_attributes
print("Initial Hypothesis")
print(hypothesis)
print("The Hypotheses are")
for i in range(1, len(dataset)):       # skip the header row
    target = dataset[i][-1]
    if target == '1':                  # generalize only on positive examples
        for j in range(num_attributes):
            if hypothesis[j] == '0':
                hypothesis[j] = dataset[i][j]
            if hypothesis[j] != dataset[i][j]:
                hypothesis[j] = '?'
    print(i + 1, '=', hypothesis)
print("Final Hypothesis")
print(hypothesis)
Output:
[['Sky', 'AirTemp', 'Humidity', 'Wind', 'Water', 'Forecast',
'EnjoySport'], ['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same',
'1'], ['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same', '1'],
['Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change', '0'], ['Sunny',
'Warm', 'High', 'Strong', 'Cool', 'Change', '1']]
Initial Hypothesis
['0', '0', '0', '0', '0', '0']
The Hypotheses are
2 = ['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same']
3 = ['Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same']
4 = ['Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same']
5 = ['Sunny', 'Warm', '?', 'Strong', '?', '?']
Final Hypothesis
['Sunny', 'Warm', '?', 'Strong', '?', '?']
2)Candidate Elimination
Code:
import numpy as np
import pandas as pd

data = pd.read_csv('ENJOYSPORT.csv')
concepts = np.array(data.iloc[:, 0:-1])
print("\nInstances are:\n", concepts)
target = np.array(data.iloc[:, -1])
print("\nTarget Values are: ", target)

def learn(concepts, target):
    # S starts at the first positive instance; G starts maximally general
    specific_h = concepts[0].copy()
    general_h = [['?' for _ in range(len(specific_h))] for _ in range(len(specific_h))]
    for i, h in enumerate(concepts):
        if target[i] == 1:
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    specific_h[x] = '?'
                    general_h[x][x] = '?'
        if target[i] == 0:
            print("Instance is Negative ")
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    general_h[x][x] = specific_h[x]
                else:
                    general_h[x][x] = '?'
    # drop rows of G that stayed fully general
    general_h = [h for h in general_h if h != ['?'] * len(specific_h)]
    return specific_h, general_h

specific_h, general_h = learn(concepts, target)
print("Final Specific_h:\n", specific_h)
print("Final General_h:\n", general_h)
Output:
Final Specific_h:
['Sunny' 'Warm' '?' 'Strong' '?' '?']
Final General_h:
[['Sunny', '?', '?', '?', '?', '?'], ['?', 'Warm', '?', '?', '?', '?']]
3)Decision Tree
Code:
import pandas as pd
import numpy as np

dataset = pd.read_csv('play_tennis.csv', index_col='day')   # 'day' is only a row identifier
print(dataset)

def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = np.sum([(-counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts))
                      for i in range(len(elements))])
    return entropy

def InfoGain(data, split_attribute_name, target_name="play"):
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    Weighted_Entropy = np.sum([(counts[i] / np.sum(counts)) *
                               entropy(data.where(data[split_attribute_name] == vals[i]).dropna()[target_name])
                               for i in range(len(vals))])
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

def ID3(data, originaldata, features, target_attribute_name="play", parent_node_class=None):
    # all remaining samples share one class: return that class
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    # no samples left: return the majority class of the original data
    elif len(data) == 0:
        return np.unique(originaldata[target_attribute_name])[
            np.argmax(np.unique(originaldata[target_attribute_name], return_counts=True)[1])]
    # no features left: return the parent node's majority class
    elif len(features) == 0:
        return parent_node_class
    else:
        parent_node_class = np.unique(data[target_attribute_name])[
            np.argmax(np.unique(data[target_attribute_name], return_counts=True)[1])]
        # split on the feature with the highest information gain
        item_values = [InfoGain(data, feature, target_attribute_name) for feature in features]
        best_feature = features[np.argmax(item_values)]
        tree = {best_feature: {}}
        features = [i for i in features if i != best_feature]
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = ID3(sub_data, dataset, features, target_attribute_name, parent_node_class)
            tree[best_feature][value] = subtree
        return tree

tree = ID3(dataset, dataset, dataset.columns[:-1])
print('\nDisplay Tree\n', tree)
Output:
day outlook temp humidity wind play
D1 Sunny Hot High Weak No
D2 Sunny Hot High Strong No
D3 Overcast Hot High Weak Yes
D4 Rain Mild High Weak Yes
D5 Rain Cool Normal Weak Yes
D6 Rain Cool Normal Strong No
D7 Overcast Cool Normal Strong Yes
D8 Sunny Mild High Weak No
D9 Sunny Cool Normal Weak Yes
D10 Rain Mild Normal Weak Yes
D11 Sunny Mild Normal Strong Yes
D12 Overcast Mild High Strong Yes
D13 Overcast Hot Normal Weak Yes
D14 Rain Mild High Strong No
Display Tree
{'outlook': {'Overcast': 'Yes', 'Rain': {'wind': {'Strong': 'No',
'Weak': 'Yes'}}, 'Sunny': {'humidity': {'High': 'No', 'Normal':
'Yes'}}}}
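The nested dictionary can classify unseen samples by walking from the root feature down to a leaf. A minimal sketch of such a helper (the predict function and the sample query below are not part of the original listing):

def predict(query, tree, default='Yes'):
    # descend the nested dict until a leaf (class label) is reached
    for key in query:
        if key in tree:
            try:
                result = tree[key][query[key]]
            except KeyError:
                return default   # attribute value unseen during training
            if isinstance(result, dict):
                return predict(query, result, default)
            return result
    return default

sample = {'outlook': 'Sunny', 'temp': 'Cool', 'humidity': 'High', 'wind': 'Strong'}
print(predict(sample, tree))   # expected: 'No'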
4)Backpropagation algorithm
Code:
import numpy as np

X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)   # input dataset (X, Y)
y = np.array(([92], [86], [89]), dtype=float)         # target scores
X = X / np.amax(X, axis=0)   # normalize inputs to [0, 1]
y = y / 100                  # normalize targets to [0, 1]

#Sigmoid Function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

#Derivative of Sigmoid Function (x is already the sigmoid output)
def derivatives_sigmoid(x):
    return x * (1 - x)

#Variable initialization
epoch = 5   #Setting training iterations
lr = 0.1    #Setting learning rate
inputlayer_neurons = 2
hiddenlayer_neurons = 3
output_neurons = 1
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    #Forward propagation
    hlayer_act = sigmoid(np.dot(X, wh) + bh)
    output = sigmoid(np.dot(hlayer_act, wout) + bout)
    #Backpropagation
    EO = y - output
    outgrad = derivatives_sigmoid(output)
    d_output = EO * outgrad
    EH = d_output.dot(wout.T)
    hiddengrad = derivatives_sigmoid(hlayer_act)   #how much hidden layer wts contributed to error
    d_hiddenlayer = EH * hiddengrad
    #Weight and bias updates
    wout += hlayer_act.T.dot(d_output) * lr
    bout += np.sum(d_output, axis=0, keepdims=True) * lr
    wh += X.T.dot(d_hiddenlayer) * lr
    bh += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr
    print("----------Epoch-", i + 1, "Starts----------")
    print("Input:\n", X)
    print("Actual Output:\n", y)
    print("Predicted Output:\n", output)
    print("-----------Epoch-", i + 1, "Ends----------\n")

print("Input:\n", X)
print("Actual Output:\n", y)
print("Predicted Output:\n", output)
Input Dataset:
X Y class
2 9 92
1 5 86
3 6 89
Output:
----------Epoch- 1 Starts----------
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.81946901]
[0.80312503]
[0.82285168]]
-----------Epoch- 1 Ends----------
-----------Epoch- 2 Starts----------
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.82027619]
[0.80391667]
[0.82366284]]
-----------Epoch- 2 Ends----------
-----------Epoch- 3 Starts----------
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.82106961]
[0.80469506]
[0.82446007]]
-----------Epoch- 3 Ends----------
-----------Epoch- 4 Starts----------
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.82184962]
[0.80546054]
[0.82524371]]
-----------Epoch- 4 Ends----------
-----------Epoch- 5 Starts----------
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.82261656]
[0.80621342]
[0.8260141 ]]
-----------Epoch- 5 Ends----------
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.82261656]
[0.80621342]
[0.8260141 ]]
5)Bayes classification
Code:
import csv
import random
import math
import numpy as np

def read_data(filename):
    with open(filename, 'r') as csvfile:
        datareader = csv.reader(csvfile)
        metadata = next(datareader)
        traindata = []
        for row in datareader:
            traindata.append(row[1:len(row)])   # drop the 'day' column
    return (metadata, traindata)

def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    testset = list(dataset)
    i = 0
    while len(trainSet) < trainSize:
        trainSet.append(testset.pop(i))
    return [trainSet, testset]

def classify(data, test):
    total_size = data.shape[0]
    print("\n")
    print("training data size=", total_size)
    print("test data size=", test.shape[0])
    countYes = 0
    countNo = 0
    probYes = 0
    probNo = 0
    print("\n")
    print("target count probability")
    # class priors from the training data
    for x in range(data.shape[0]):
        if data[x, data.shape[1] - 1] == 'Yes':
            countYes += 1
        if data[x, data.shape[1] - 1] == 'No':
            countNo += 1
    probYes = countYes / total_size
    probNo = countNo / total_size
    print('YES', "\t", countYes, "\t", probYes)
    print('No', "\t", countNo, "\t", probNo)
    prob0 = np.zeros((test.shape[1] - 1))
    prob1 = np.zeros((test.shape[1] - 1))
    accuracy = 0
    print("\n")
    print("instance prediction target")
    for t in range(test.shape[0]):
        for k in range(test.shape[1] - 1):
            count1 = count0 = 0
            for j in range(data.shape[0]):
                # how many times the value appeared with 'No'
                if test[t, k] == data[j, k] and data[j, data.shape[1] - 1] == 'No':
                    count0 += 1
                # how many times the value appeared with 'Yes'
                if test[t, k] == data[j, k] and data[j, data.shape[1] - 1] == 'Yes':
                    count1 += 1
            prob0[k] = count0 / countNo
            prob1[k] = count1 / countYes
        probno = probNo
        probyes = probYes
        for i in range(test.shape[1] - 1):
            probno = probno * prob0[i]
            probyes = probyes * prob1[i]
        if probno > probyes:
            predict = 'No'
        else:
            predict = 'Yes'
        print(t + 1, "\t", predict, "\t ", test[t, test.shape[1] - 1])
        if predict == test[t, test.shape[1] - 1]:
            accuracy += 1
    final_accuracy = (accuracy / test.shape[0]) * 100
    print("accuracy", final_accuracy, "%")
    return

metadata, traindata = read_data("play_tennis.csv")
print(traindata)
print("the attribute names of training data are:", metadata)
splitRatio = 0.6
trainingset, testset = splitDataset(traindata, splitRatio)
training = np.array(trainingset)
print("\n the training data set are:")
for x in trainingset:
    print(x)
testing = np.array(testset)
print("\n the test data set are:")
for x in testing:
    print(x)
classify(training, testing)
Output:
[['Sunny', 'Hot', 'High', 'Weak', 'No'], ['Sunny', 'Hot', 'High', 'Strong', 'No'], ['Overcast',
'Hot', 'High', 'Weak', 'Yes'], ['Rain', 'Mild', 'High', 'Weak', 'Yes'], ['Rain', 'Cool', 'Normal',
'Weak', 'Yes'], ['Rain', 'Cool', 'Normal', 'Strong', 'No'], ['Overcast', 'Cool', 'Normal',
'Strong', 'Yes'], ['Sunny', 'Mild', 'High', 'Weak', 'No'], ['Sunny', 'Cool', 'Normal', 'Weak',
'Yes'], ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'], ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
['Overcast', 'Mild', 'High', 'Strong', 'Yes'], ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
['Rain', 'Mild', 'High', 'Strong', 'No']]
the attribute names of training data are: ['day', 'outlook', 'temp', 'humidity', 'wind', 'play']
6)Naive Bayes text classifier
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

msg = pd.read_csv('text_classification.csv', names=['message', 'label'])
print('the dimension of the dataset', msg.shape)
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})   # encode labels as 1/0 (assumes 'pos'/'neg' labels)
X = msg.message
y = msg.labelnum
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
print('\n the total number of training data:', ytrain.shape)
print('\n the total number of test data:', ytest.shape)

cv = CountVectorizer()
xtrain_dtm = cv.fit_transform(xtrain)   # document-term matrix of the training set
xtest_dtm = cv.transform(xtest)
print('\n the words or tokens in the text documents\n')
print(cv.get_feature_names())   # use get_feature_names_out() on newer scikit-learn
df = pd.DataFrame(xtrain_dtm.toarray(), columns=cv.get_feature_names())

clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)
print('\n Accuracy of the classifier is', metrics.accuracy_score(ytest, predicted))
print('\n confusion matrix')
print(metrics.confusion_matrix(ytest, predicted))
print('\n the value of precision', metrics.precision_score(ytest, predicted))
print('\n the value of recall', metrics.recall_score(ytest, predicted))
Output:
the dimension of the dataset (18, 2)
confusion matrix
[[2 1]
[1 1]]
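New messages can be classified by passing them through the same vectorizer before calling predict. A minimal sketch (the example sentences below are assumed, not taken from the dataset):

docs_new = ['I love this place', 'this is an awful view']
pred_new = clf.predict(cv.transform(docs_new))
print(pred_new)   # 1 = pos, 0 = neg under the label mapping above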
7)Bayesian network (heart disease)
Code:
import numpy as np
import pandas as pd

heartDisease = pd.read_csv('heart.csv')
heartDisease = heartDisease.replace('?', np.nan)   # mark missing values
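Only the data-loading lines of this program are preserved above. A minimal sketch of the usual continuation, assuming pgmpy (BayesianNetwork is BayesianModel in older versions) and Cleveland-style column names such as 'age', 'cp', 'restecg', 'chol', 'heartdisease':

from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

# assumed network structure over a few of the dataset's columns
model = BayesianNetwork([('age', 'heartdisease'), ('sex', 'heartdisease'),
                         ('cp', 'heartdisease'), ('heartdisease', 'restecg'),
                         ('heartdisease', 'chol')])
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)   # learn CPDs from the data
infer = VariableElimination(model)
q = infer.query(variables=['heartdisease'], evidence={'restecg': 1})
print(q)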
Output:
8)EM algorithm (GMM clustering)
Observation: the GMM clustering obtained with the EM algorithm matched the true labels more closely than k-means did.
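A minimal sketch of the comparison the observation describes, assuming the Iris data and scikit-learn's KMeans and GaussianMixture:

from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score

iris = datasets.load_iris()
X, y = iris.data, iris.target
km_labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
gmm_labels = GaussianMixture(n_components=3, random_state=0).fit(X).predict(X)
# adjusted Rand index: 1.0 means perfect agreement with the true labels
print("KMeans ARI:", adjusted_rand_score(y, km_labels))
print("GMM ARI:  ", adjusted_rand_score(y, gmm_labels))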
9)KNN algorithm
Code:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import datasets

#load dataset
iris = datasets.load_iris()
print("iris data set loaded...")
#split the data into train and test samples
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2)
print("Data set is split into training and testing..")
print("size of training data and its label", x_train.shape, y_train.shape)
print("size of testing data and its label", x_test.shape, y_test.shape)
#print label numbers and their names
for i in range(len(iris.target_names)):
    print("label", i, "-", str(iris.target_names[i]))

classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print("results of classification using K-NN with k=1")
for r in range(0, len(x_test)):
    print("sample:", str(x_test[r]), "Actual-label:", str(y_test[r]), "predicted-label:", str(y_pred[r]))
print("classification accuracy:", classifier.score(x_test, y_test))
print("confusion matrix\n", confusion_matrix(y_test, y_pred))
print("Accuracy Metrics\n", classification_report(y_test, y_pred))
Output:
iris data set loaded...
Data set is split into training and testing..
size of training data and its label (120, 4) (120,)
size of testing data and its label (30, 4) (30,)
label 0 - setosa
label 1 - versicolor
label 2 - virginica
results of classification using K-NN with k=1
sample: [5.8 2.7 4.1 1. ] Actual-label: 1 predicted-label: 1
sample: [5.1 3.8 1.9 0.4] Actual-label: 0 predicted-label: 0
sample: [6.4 3.2 4.5 1.5] Actual-label: 1 predicted-label: 1
sample: [6.1 2.8 4.7 1.2] Actual-label: 1 predicted-label: 1
sample: [5.2 3.4 1.4 0.2] Actual-label: 0 predicted-label: 0
sample: [5.8 2.7 5.1 1.9] Actual-label: 2 predicted-label: 2
sample: [6.3 2.8 5.1 1.5] Actual-label: 2 predicted-label: 1
sample: [7.1 3. 5.9 2.1] Actual-label: 2 predicted-label: 2
sample: [6.7 2.5 5.8 1.8] Actual-label: 2 predicted-label: 2
sample: [6.8 2.8 4.8 1.4] Actual-label: 1 predicted-label: 1
sample: [5.1 3.7 1.5 0.4] Actual-label: 0 predicted-label: 0
sample: [5. 3.6 1.4 0.2] Actual-label: 0 predicted-label: 0
sample: [6.5 3. 5.8 2.2] Actual-label: 2 predicted-label: 2
sample: [6. 2.7 5.1 1.6] Actual-label: 1 predicted-label: 2
sample: [5.1 3.3 1.7 0.5] Actual-label: 0 predicted-label: 0
sample: [6.8 3.2 5.9 2.3] Actual-label: 2 predicted-label: 2
sample: [5.1 3.8 1.5 0.3] Actual-label: 0 predicted-label: 0
sample: [5.3 3.7 1.5 0.2] Actual-label: 0 predicted-label: 0
sample: [5.4 3.9 1.7 0.4] Actual-label: 0 predicted-label: 0
sample: [6.9 3.1 4.9 1.5] Actual-label: 1 predicted-label: 1
sample: [4.9 3.1 1.5 0.1] Actual-label: 0 predicted-label: 0
sample: [4.4 2.9 1.4 0.2] Actual-label: 0 predicted-label: 0
sample: [7.6 3. 6.6 2.1] Actual-label: 2 predicted-label: 2
sample: [6.3 3.4 5.6 2.4] Actual-label: 2 predicted-label: 2
sample: [5. 2. 3.5 1. ] Actual-label: 1 predicted-label: 1
sample: [6.5 3. 5.5 1.8] Actual-label: 2 predicted-label: 2
sample: [5.6 2.9 3.6 1.3] Actual-label: 1 predicted-label: 1
sample: [6.8 3. 5.5 2.1] Actual-label: 2 predicted-label: 2
sample: [6. 2.9 4.5 1.5] Actual-label: 1 predicted-label: 1
sample: [5.5 2.4 3.8 1.1] Actual-label: 1 predicted-label: 1
classification accuracy: 0.9333333333333333
confusion matrix
[[10 0 0]
[ 0 9 1]
[ 0 1 9]]
Accuracy Metrics
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.90      0.90      0.90        10
           2       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30
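k = 1 memorizes the training set and is sensitive to noisy neighbors. A short sketch (not part of the original program) for comparing other values of k by cross-validation:

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
for k in (1, 3, 5, 7):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), iris.data, iris.target, cv=5)
    print("k =", k, "mean accuracy =", scores.mean().round(3))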
10)LWR algorithm
Code:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def kernel(point, xmat, k):
    m, n = np.shape(xmat)
    weights = np.mat(np.eye(m))   # diagonal weight matrix
    for j in range(m):
        diff = point - xmat[j]
        weights[j, j] = np.exp(diff * diff.T / (-2.0 * k**2))
    return weights

def localWeight(point, xmat, ymat, k):
    wei = kernel(point, xmat, k)
    # weighted least-squares solution at this query point
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W

def localWeightRegression(xmat, ymat, k):
    m, n = np.shape(xmat)
    ypred = np.zeros(m)
    for i in range(m):
        ypred[i] = xmat[i] * localWeight(xmat[i], xmat, ymat, k)
    return ypred

#import data
data = pd.read_csv('10-dataset.csv')
colA = np.array(data.total_bill)   # assumed column names (tips data)
colB = np.array(data.tip)
mcolA = np.mat(colA)
mcolB = np.mat(colB)
m = np.shape(mcolB)[1]
one = np.ones((1, m), dtype=int)
# horizontal stacking: prepend a bias column of ones
X = np.mat(np.hstack((one.T, mcolA.T)))
print(X.shape)

ypred = localWeightRegression(X, mcolB, 0.5)   # 0.5 is the kernel bandwidth k
SortIndex = np.array(X[:, 1].argsort(0)).flatten()
plt.scatter(colA, colB, color='green')
plt.plot(colA[SortIndex], ypred[SortIndex], color='red', linewidth=2)
plt.xlabel('total_bill')
plt.ylabel('tip')
plt.show()
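The per-point solve above is standard weighted least squares with a Gaussian kernel; in matrix form (notation added here, not stated in the original):

W(x)_{jj} = \exp\!\left(-\frac{(x - x_j)^2}{2k^2}\right), \qquad
\hat{\beta}(x) = (X^{T} W(x) X)^{-1} X^{T} W(x) y, \qquad
\hat{y}(x) = [1,\ x]\,\hat{\beta}(x)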
Output:
(The program displays a scatter plot of the data points with the fitted LWR curve overlaid.)