ML Lab Programs
#lab_1_single_instance
# Find-S: learn the maximally specific hypothesis and classify a single new instance
import csv
import numpy as np

with open('training-data-1.csv', 'r') as file:
    reader = csv.reader(file)
    x1 = [row for row in reader]
x = np.array(x1)
r, c = x.shape
print("The no. of rows & cols are:", r, "and", c)
print("The dataset is:\n", x)

# Start with the most specific hypothesis (all attributes = phi)
s = np.empty(c - 1, dtype=object)
for i in range(c - 1):
    s[i] = "\u03A6"
print("Initial specific hypothesis:\n", s)

# Initialise the hypothesis with the first training example
for i in range(c - 1):
    s[i] = x[0, i]
print("Hypothesis after processing the first training example:\n", s)

# Generalise the hypothesis over the remaining positive examples
for i in range(1, r):
    if x[i, c - 1] == 'NO':
        continue
    else:
        for j in range(c - 1):
            if x[i, j] != s[j]:
                s[j] = '?'
    print("The intermediate hypothesis:\n", s)
print("The final maximally specific hypothesis:\n", s)

print("Enter new instance value")
new = np.empty(c - 1, dtype=object)
print("Enter attribute values for sky, airtemp, humidity, wind, water, and forecast")
for i in range(c - 1):
    new[i] = input("Enter attribute value: ")
print("\nThe given instance is:", new)

flag = 1
for i in range(c - 1):
    if s[i] != '?' and s[i] != new[i]:
        flag = 0
        break
print("\nThe assigned label for Enjoying Water Sport is")
if flag == 1:
    print("Yes - the person enjoys water sport")
else:
    print("No - the person will not enjoy water sport")
#lab_1b_Multiple_Instance
# Find-S, then label multiple new instances read from a CSV file
import csv
import numpy as np

with open('training-data-1.csv', 'r') as file:
    reader = csv.reader(file)
    x1 = [row for row in reader]
x = np.array(x1)
r, c = x.shape
print("The no. of rows & cols are:", r, "and", c)
print("The dataset is:\n", x)

s = np.empty(c - 1, dtype=object)
for i in range(c - 1):
    s[i] = "\u03A6"
print("Initial specific hypothesis:\n", s)

for i in range(c - 1):
    s[i] = x[0, i]
print("Hypothesis after processing the first training example:\n", s)

for i in range(1, r):
    if x[i, c - 1] == 'NO':
        continue
    else:
        for j in range(c - 1):
            if x[i, j] != s[j]:
                s[j] = '?'
    print("The intermediate hypothesis:\n", s)
print("The final maximally specific hypothesis:\n", s)

print("Read the set of instances from a CSV file to assign labels:")
with open('new-training-data.csv', 'r') as file1:
    reader = csv.reader(file1)
    x2 = [row for row in reader]
new = np.array(x2)
r1, c1 = new.shape                      # shape of the new samples, not the training data
print("The size of the new sample set is", r1, "rows and", c1, "cols")
print("New data:", new)
print("\n\n")

new1 = new.tolist()
for i in range(r1):
    flag = 1
    for j in range(c - 1):              # compare every attribute against the hypothesis
        if s[j] != '?' and s[j] != new1[i][j]:
            flag = 0
            break
    if flag == 1:
        new1[i].append("Yes")
    else:
        new1[i].append("No")
print("The assigned labels for the set of instances read from the CSV file are:")
print("\n", new1)
#lab_2
# Candidate Elimination: maintain the specific hypothesis S and the general hypotheses G
import csv
import numpy as np

# The data loading and the S/G initialisation were missing from this listing; they are
# restored below on the assumption that they mirror Lab 1.
with open('training-data-1.csv', 'r') as file:
    reader = csv.reader(file)
    x1 = [row for row in reader]
x = np.array(x1)
r, c = x.shape
print("The dataset is:\n", x)

s = np.empty(c - 1, dtype=object)
for i in range(c - 1):
    s[i] = x[0, i]                      # S starts from the first (positive) example
g = np.full((1, c - 1), '?', dtype=object)   # G seeded with the maximally general hypothesis

for i in range(1, r):
    if x[i, c - 1] != 'NO':
        # Positive example: generalise S where it disagrees
        for j in range(c - 1):
            if x[i, j] != s[j]:
                s[j] = '?'
    else:
        # Negative example: specialise G on every attribute where S is specific and differs
        for j in range(c - 1):
            g1 = np.empty(c - 1, dtype=object)
            if s[j] == '?':
                continue
            elif x[i, j] != s[j]:
                for l in range(c - 1):
                    g1[l] = "?"
                g1[j] = s[j]
                g = np.vstack([g, g1])
g = np.delete(g, (0), axis=0)           # drop the all-'?' seed row
print("\nIntermediate general hypothesis\n", g)
nr, nc = g.shape
print("Enter new instance value")
new=[]
new = np.empty(c-1, dtype=object)
print("ENTER ATTRIBUTE VALUES for SKY,AIRTEMP,HUMIDITY,WIND,WATER, and FORECAST")
for i in range(c-1):
new[i]=input("enter attribute values")
print("\nthe given instance is",new)
flag=1
for i in range(c-1):
if s[i]!='?' and s[i]!=new[i]:
flag=0
break
for i in range(nr):
flag1=1
for j in range(nc-1):
if g[i][j]!='?' and g[i][j]!=new[j]:
flag1=0
break
if flag1==1:
break
print("\n The assigned label for ENJOYING WATER SPORT is")
if flag==1 or flag1==1:
print("YES")
else:
print("NO")
#lab_3_naive_bayesian
import csv
print('CONSTRUCTION OF NAIVE BAYESIAN MODEL (Calculation of Probabilities)')
with open('playtennis.txt') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = yes_count = 0
    sunny_yes = sunny_no = 0
    rainy_yes = rainy_no = 0
    overcast_yes = overcast_no = 0
    hot_yes = hot_no = 0
    cool_yes = cool_no = 0
    mild_yes = mild_no = 0
    high_yes = high_no = 0
    normal_yes = normal_no = 0
    weak_yes = weak_no = 0
    strong_yes = strong_no = 0
    for row in csv_reader:
        line_count = line_count + 1
        if row[5] == "yes":
            yes_count = yes_count + 1
        if row[1] == 'Sunny' and row[5] == "yes":
            sunny_yes = sunny_yes + 1
        if row[1] == 'Sunny' and row[5] == "no":
            sunny_no = sunny_no + 1
        if row[1] == 'Rainy' and row[5] == "yes":
            rainy_yes = rainy_yes + 1
        if row[1] == 'Rainy' and row[5] == "no":
            rainy_no = rainy_no + 1
        if row[1] == 'Overcast' and row[5] == "yes":
            overcast_yes = overcast_yes + 1
        if row[1] == 'Overcast' and row[5] == "no":
            overcast_no = overcast_no + 1
        if row[2] == 'Hot' and row[5] == "yes":
            hot_yes = hot_yes + 1
        if row[2] == 'Hot' and row[5] == "no":
            hot_no = hot_no + 1
        if row[2] == 'Cool' and row[5] == "yes":
            cool_yes = cool_yes + 1
        if row[2] == 'Cool' and row[5] == "no":
            cool_no = cool_no + 1
        if row[2] == 'Mild' and row[5] == "yes":
            mild_yes = mild_yes + 1
        if row[2] == 'Mild' and row[5] == "no":
            mild_no = mild_no + 1
        if row[3] == 'High' and row[5] == "yes":
            high_yes = high_yes + 1
        if row[3] == 'High' and row[5] == "no":
            high_no = high_no + 1
        if row[3] == 'Normal' and row[5] == "yes":
            normal_yes = normal_yes + 1
        if row[3] == 'Normal' and row[5] == "no":
            normal_no = normal_no + 1
        if row[4] == 'Weak' and row[5] == "yes":
            weak_yes = weak_yes + 1
        if row[4] == 'Weak' and row[5] == "no":
            weak_no = weak_no + 1
        if row[4] == 'Strong' and row[5] == "yes":
            strong_yes = strong_yes + 1
        if row[4] == 'Strong' and row[5] == "no":
            strong_no = strong_no + 1
# Prior and conditional probabilities (frequency estimates)
no_count = line_count - yes_count
p_playtennis_yes = yes_count / line_count
p_playtennis_no = no_count / line_count
p_sunny_yes = sunny_yes / yes_count
p_sunny_no = sunny_no / no_count
p_rainy_yes = rainy_yes / yes_count
p_rainy_no = rainy_no / no_count
p_overcast_yes = overcast_yes / yes_count
p_overcast_no = overcast_no / no_count
p_hot_yes = hot_yes / yes_count
p_hot_no = hot_no / no_count
p_cool_yes = cool_yes / yes_count
p_cool_no = cool_no / no_count
p_mild_yes = mild_yes / yes_count
p_mild_no = mild_no / no_count
p_high_yes = high_yes / yes_count
p_high_no = high_no / no_count
p_normal_yes = normal_yes / yes_count
p_normal_no = normal_no / no_count
p_weak_yes = weak_yes / yes_count
p_weak_no = weak_no / no_count
p_strong_yes = strong_yes / yes_count
p_strong_no = strong_no / no_count
print()
print("Probability of PlayTennis with label YES:", p_playtennis_yes)
print("Probability of PlayTennis with label NO:", p_playtennis_no)
print()
print('Probabilities for OUTLOOK attribute')
print("Probability of outlook is Sunny with label YES:", p_sunny_yes)
print("Probability of outlook is Sunny with label NO:", p_sunny_no)
print("Probability of outlook is Rainy with label YES:", p_rainy_yes)
print("Probability of outlook is Rainy with label NO:", p_rainy_no)
print("Probability of outlook is Overcast with label YES:", p_overcast_yes)
print("Probability of outlook is Overcast with label NO:", p_overcast_no)
print()
print('Probabilities for TEMPERATURE attribute')
print("Probability of temperature is Hot with label YES:", p_hot_yes)
print("Probability of temperature is Hot with label NO:", p_hot_no)
print("Probability of temperature is Cool with label YES:", p_cool_yes)
print("Probability of temperature is Cool with label NO:", p_cool_no)
print("Probability of temperature is Mild with label YES:", p_mild_yes)
print("Probability of temperature is Mild with label NO:", p_mild_no)
print()
print('Probabilities for HUMIDITY attribute')
print("Probability of humidity is High with label YES:", p_high_yes)
print("Probability of humidity is High with label NO:", p_high_no)
print("Probability of humidity is Normal with label YES:", p_normal_yes)
print("Probability of humidity is Normal with label NO:", p_normal_no)
print()
print('Probabilities for WIND attribute')
print("Probability of wind is Weak with label YES:", p_weak_yes)
print("Probability of wind is Weak with label NO:", p_weak_no)
print("Probability of wind is Strong with label YES:", p_strong_yes)
print("Probability of wind is Strong with label NO:", p_strong_no)
print()
print('Assigning a label based on the Naive Bayesian classifier')
rec = input("Enter OUTLOOK(x1), TEMPERATURE(x2), HUMIDITY(x3), WIND(x4) in this sequence, separated by commas: ")
reclist = rec.split(",")
with open('playtennis.txt') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = yes_count = 0
    x1_yes = x1_no = 0
    x2_yes = x2_no = 0
    x3_yes = x3_no = 0
    x4_yes = x4_no = 0
    for row in csv_reader:
        line_count = line_count + 1
        if row[5] == "yes":
            yes_count = yes_count + 1
        if row[1] == reclist[0] and row[5] == "yes":
            x1_yes = x1_yes + 1
        if row[1] == reclist[0] and row[5] == "no":
            x1_no = x1_no + 1
        if row[2] == reclist[1] and row[5] == "yes":
            x2_yes = x2_yes + 1
        if row[2] == reclist[1] and row[5] == "no":
            x2_no = x2_no + 1
        if row[3] == reclist[2] and row[5] == "yes":
            x3_yes = x3_yes + 1
        if row[3] == reclist[2] and row[5] == "no":
            x3_no = x3_no + 1
        if row[4] == reclist[3] and row[5] == "yes":
            x4_yes = x4_yes + 1
        if row[4] == reclist[3] and row[5] == "no":
            x4_no = x4_no + 1
no_count = line_count - yes_count
# Conditional probabilities of each entered attribute value given YES and given NO
x1_yes = x1_yes / yes_count
x1_no = x1_no / no_count
x2_yes = x2_yes / yes_count
x2_no = x2_no / no_count
x3_yes = x3_yes / yes_count
x3_no = x3_no / no_count
x4_yes = x4_yes / yes_count
x4_no = x4_no / no_count
# Posterior scores P(x|yes)P(yes) and P(x|no)P(no)
yes_x = x1_yes * x2_yes * x3_yes * x4_yes * yes_count / line_count
no_x = x1_no * x2_no * x3_no * x4_no * no_count / line_count
if yes_x > no_x:
    print(f'Playing tennis on {reclist[0]}, {reclist[1]}, {reclist[2]}, {reclist[3]} is allowed, so the assigned label is YES')
else:
    print(f'Playing tennis on {reclist[0]}, {reclist[1]}, {reclist[2]}, {reclist[3]} is not possible, so the assigned label is NO')
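If an entered attribute value never occurs together with one of the labels, the raw frequency above becomes zero and wipes out the whole product. A common fix is Laplace (add-one) smoothing; a minimal sketch using the same counts as the lab program (the helper name is illustrative, not part of the original code):
def smoothed(count, class_count, n_values):
    # Hypothetical add-one smoothing; n_values = number of distinct values of the attribute
    return (count + 1) / (class_count + n_values)
# e.g. P(outlook = Sunny | yes) with 3 possible outlook values:
# p_sunny_yes = smoothed(sunny_yes, yes_count, 3)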
#lab_4_naive_bayesian_sk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

data = pd.read_csv("play_tennis.csv")
print("The given data set is:")
print(data)
predictors = data.iloc[:, 0:4]
target = data.iloc[:, 4]
predictors_train, predictors_test, target_train, target_test = train_test_split(
    predictors, target, test_size=0.3, random_state=123)
gnb = GaussianNB()
model = gnb.fit(predictors_train, target_train)
prediction = model.predict(predictors_test)
print("Accuracy of classifier:", accuracy_score(target_test, prediction, normalize=True))
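GaussianNB requires numeric features, so this program assumes play_tennis.csv already stores the four attributes as numbers. If the file holds the categorical strings instead, they need encoding first; a minimal sketch using sklearn's OrdinalEncoder (the column layout is an assumption about the file):
from sklearn.preprocessing import OrdinalEncoder
# Hypothetical preprocessing step if the attributes are strings such as 'Sunny', 'Hot', ...
encoder = OrdinalEncoder()
predictors_encoded = encoder.fit_transform(predictors)   # maps each category to an integer code
# predictors_encoded can then be passed to train_test_split / GaussianNB in place of predictors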
#lab_5_dtc
import pandas as pd
import numpy as np
from pprint import pprint

dataset = pd.read_csv('play_tennisDTC1.csv',
                      names=['Outlook', 'Temperature', 'humidity', 'Wind', 'PlayTennis'])

#Construction of the decision tree
def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = np.sum([(-counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts))
                      for i in range(len(elements))])
    return entropy
def InfoGain(data, split_attribute_name, target_name="PlayTennis"):
    #Information gain = entropy of the parent minus the weighted entropy of the splits
    #(the body of this function was missing from the listing; a standard computation is used)
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    weighted_entropy = np.sum([(counts[i] / np.sum(counts)) *
                               entropy(data.where(data[split_attribute_name] == vals[i]).dropna()[target_name])
                               for i in range(len(vals))])
    return total_entropy - weighted_entropy

def ID3(data, originaldata, features, target_attribute_name="PlayTennis", parent_node_class=None):
    #Stopping cases
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    elif len(data) == 0:
        return np.unique(originaldata[target_attribute_name])[
            np.argmax(np.unique(originaldata[target_attribute_name], return_counts=True)[1])]
    elif len(features) == 0:
        return parent_node_class
    else:
        #Set the default value for this node --> the mode target feature value of the current node
        parent_node_class = np.unique(data[target_attribute_name])[
            np.argmax(np.unique(data[target_attribute_name], return_counts=True)[1])]
        #Select the feature with the maximum information gain (restored from the surrounding code)
        item_values = [InfoGain(data, feature, target_attribute_name) for feature in features]
        best_feature = features[np.argmax(item_values)]
        #Create the tree structure. The root gets the name of the feature (best_feature)
        #with the maximum information gain in the first run
        tree = {best_feature: {}}
        #Remove the feature with the best information gain from the feature space
        features = [i for i in features if i != best_feature]
        #Grow a branch under the root node for each possible value of the root node feature
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = ID3(sub_data, dataset, features, target_attribute_name, parent_node_class)
            #Add the subtree, grown from the sub_data, to the tree under the root node
            tree[best_feature][value] = subtree
        return tree
def predict(query, tree, default=1):
    for key in list(query.keys()):
        if key in list(tree.keys()):
            try:
                result = tree[key][query[key]]
            except:
                return default
            result = tree[key][query[key]]
            if isinstance(result, dict):
                return predict(query, result)
            else:
                return result

def train_test_split(dataset):
    #We reset (relabel) the index starting from 0 so that we do not run into errors
    #regarding the row labels / indexes
    training_data = dataset.iloc[:14].reset_index(drop=True)
    testing_data = dataset.iloc[14:].reset_index(drop=True)
    return training_data, testing_data

training_data, testing_data = train_test_split(dataset)

def test(data, tree):
    #Create new query instances by removing the target feature column from the original
    #dataset and converting it to a dictionary
    queries = data.iloc[:, :-1].to_dict(orient="records")
    predicted = pd.DataFrame(columns=["predicted"])
    for i in range(len(data)):
        predicted.loc[i, "predicted"] = predict(queries[i], tree, 1.0)
    print('\nThe prediction accuracy is: ',
          (np.sum(predicted["predicted"] == data["PlayTennis"]) / len(data)) * 100, '%')

tree = ID3(training_data, training_data, training_data.columns[:-1])
print("The final resultant decision tree")
pprint(tree)
test(testing_data, tree)

#Assigning a label
dataset1 = pd.read_csv('newsamples.csv',
                       names=['Outlook', 'Temperature', 'humidity', 'Wind', 'PlayTennis'])
print()
print()

def assign(data1, tree):
    queries = data1.iloc[:, :-1].to_dict(orient="records")
    print('The given samples to assign a label are')
    print(queries)
    print()
    predicted = pd.DataFrame(columns=["predicted"])
    print('Predicted labels for each new sample are')
    for i in range(len(data1)):
        predicted.loc[i, "predicted"] = predict(queries[i], tree, 1.0)
    print(predicted)

assign(dataset1, tree)
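For a single new example, predict can also be called directly with a query dictionary; a small usage sketch (the attribute values below are arbitrary examples, not from the lab data):
sample_query = {'Outlook': 'Sunny', 'Temperature': 'Mild', 'humidity': 'Normal', 'Wind': 'Weak'}
print('Predicted label for the sample query:', predict(sample_query, tree, 1.0))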
#lab_7_knn
# K-Nearest neighbour implementation
import math

# The distance and neighbour-selection helpers were missing from this listing; a standard
# Euclidean-distance, majority-vote version is assumed below.
def euclidean_distance(row, p):
    # Distance over the attribute values only (the last element of a stored row is the class label)
    return math.sqrt(sum((row[i] - p[i]) ** 2 for i in range(len(p))))

def get_neighbors(dataset, p, k):
    # Return the k rows of the dataset closest to point p
    distances = [(row, euclidean_distance(row, p)) for row in dataset]
    distances.sort(key=lambda pair: pair[1])
    neighbors = [pair[0] for pair in distances[:k]]
    return neighbors

def KNN_classification(dataset, p, k):
    # Majority vote over the class labels (last element) of the k nearest neighbours
    neighbors = get_neighbors(dataset, p, k)
    labels = [row[-1] for row in neighbors]
    return max(set(labels), key=labels.count)

dataset = []
n = int(input('Enter no. of data points: '))
n1 = int(input('Enter no. of dimensions in the dataset, including the class label attribute: '))
print('attr1, attr2, ..., attrn (class label)')
for i in range(n):
    print('Enter attribute values for point', i)
    dataset.append([])
    counter = 0
    while counter < n1:
        b = float(input('Enter value: '))
        counter += 1
        dataset[i].append(b)
print('Given points in the data set are')
print(dataset)

print('Enter a new data point to assign a label')
p = []
i = 0
n = n1 - 1
while i < n:
    k = float(input('Enter data point element: '))
    p.append(k)
    i = i + 1
print('Enter K, the no. of neighbours')
k = int(input())
prediction = KNN_classification(dataset, p, k)
print()
print('The assigned label for the given data point is', prediction)
#lab_8a_knn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
data=pd.read_csv('iris_dataset_ann.csv')
predictors=data.iloc[:,0:4]
print('Features of training dataset\n', predictors)
print('\n')
target=data.iloc[:,4]
print('Class of IRIS flower \n', target)
print('\n')
predictors_train,predictors_test,target_train,target_test =
train_test_split(predictors,target,test_size=0.3,random_state=None)
knn=KNeighborsClassifier(n_neighbors=5)
model=knn.fit(predictors_train,target_train)
accuracy=knn.score(predictors_test,target_test)
print('Accuracy of KNN classifier on IRIS data set',accuracy)
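The choice n_neighbors=5 above is fixed; if a tuned value is wanted, k can be picked by cross-validation. A minimal sketch using sklearn's cross_val_score (the candidate range 1-15 is an arbitrary assumption):
from sklearn.model_selection import cross_val_score
# Hypothetical k-selection loop: score each candidate k by 5-fold cross-validation
best_k, best_score = 1, 0.0
for cand_k in range(1, 16):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=cand_k), predictors, target, cv=5)
    if scores.mean() > best_score:
        best_k, best_score = cand_k, scores.mean()
print('Best k by cross-validation:', best_k, 'with mean accuracy', best_score)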
#lab_8b_svm
#SVM Classifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm

data = pd.read_csv('iris_dataset_ann.csv')
print('Features supplied for classification')
predictors = data.iloc[:, 0:4]
print('\n', predictors)
print('Target class labels')
target = data.iloc[:, 4]
print('\n', target)
predictors_train, predictors_test, target_train, target_test = train_test_split(
    predictors, target, test_size=0.3, random_state=123)
clf = svm.SVC(kernel='linear')          # named clf to avoid shadowing the svm module
model = clf.fit(predictors_train, target_train)
prediction = clf.predict(predictors_test)
print('Accuracy of SVM classifier:',
      accuracy_score(target_test, prediction, normalize=True))
#lab_9_Kmeans_clustering
import math
import sys

l = []
for i in range(m):
    k = input("Enter a data point with " + str(n) + " dimensions: ").split()
    k = [int(i) for i in k]
    if len(k) == n:
        l = l + [k]
    else:
        print("Data point with incorrect dimensions...!")
        sys.exit()
print('\n\nThe given data points are', l)

for i in range(m):
    for j in range(c):
        t = 0
        for k in range(n):
            t = t + (l[i][k] - g[j][k]) * (l[i][k] - g[j][k])
        t = math.sqrt(t)
        dist[i][j] = t
print("\nDistance matrix after 1st iteration: ")
print(dist)

for j in range(c):
    for k in range(n):
        g[j][k] /= t[j]
print("\nCluster membership matrix after 1st iteration: ")
print(grp)
print("\nUpdated centroid matrix after 1st iteration:")
print(g)

while grp1 != grp:
    for i in range(m):
        for j in range(c):
            t = 0
            for k in range(n):
                t = t + (l[i][k] - g[j][k]) * (l[i][k] - g[j][k])
            t = math.sqrt(t)
            dist[i][j] = t
    print("\nDistance matrix in next iteration: ")
    print(dist)
    for j in range(c):
        for k in range(n):
            g[j][k] /= t[j]
    print("\nCluster membership after next iteration: ")
    print(grp)
    print("\n\n\nUpdated centroid matrix after next iteration:")
    print(g)
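The fragment above omits the initialisation of m, n, c, the centroid matrix g, and the membership bookkeeping (grp, grp1, dist), so it does not run as printed. A self-contained minimal K-means sketch in the same plain-Python style, with made-up example points and the first c points taken as initial centroids (both choices are assumptions, not taken from the lab):
import math
# Hypothetical example data and settings (not from the original lab)
points = [[1, 1], [1, 2], [6, 6], [7, 7], [6, 7]]
c = 2                                          # number of clusters
centroids = [list(p) for p in points[:c]]      # first c points as initial centroids
membership = [-1] * len(points)
while True:
    # Assignment step: each point goes to its nearest centroid
    new_membership = []
    for p in points:
        dists = [math.dist(p, cen) for cen in centroids]
        new_membership.append(dists.index(min(dists)))
    if new_membership == membership:           # converged: memberships stopped changing
        break
    membership = new_membership
    # Update step: each centroid becomes the mean of its assigned points
    for j in range(c):
        assigned = [points[i] for i in range(len(points)) if membership[i] == j]
        if assigned:
            centroids[j] = [sum(col) / len(assigned) for col in zip(*assigned)]
print("Final centroids:", centroids)
print("Cluster membership:", membership)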
#lab_10_Linear_regression
#Linear Regression implementation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def estimate_coef(x, y):
    # Least-squares estimates of intercept b0 and slope b1
    # (the function header and the final formulas were missing from the listing and are
    #  restored from the surrounding computation)
    n = len(x)
    # mean of x and y
    mean_x = np.mean(x)
    mean_y = np.mean(y)
    print('\nMean of salary is', mean_x)
    print('\nMean of loan amount is', mean_y)
    print('\n')
    # covariance term: sum of (x - mean_x)(y - mean_y)
    cov = 0
    for i in range(n):
        x_d = x[i] - mean_x
        y_d = y[i] - mean_y
        cov = cov + x_d * y_d
    # variance term: sum of (x - mean_x)^2
    var = 0
    for i in range(n):
        x_d = x[i] - mean_x
        var = var + x_d * x_d
    b1 = cov / var
    b0 = mean_y - b1 * mean_x
    return (b0, b1)

# putting labels on the plot
plt.xlabel('Salary in lakhs')
plt.ylabel('Loan amount in lakhs')

b = estimate_coef(sal, loanamount)
print("Estimated coefficients:\n b0 = {} \n b1 = {}".format(b[0], b[1]))
print('\n')
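The part of the listing that loads sal and loanamount (and plots the fitted line) is missing, so the call above will not run on its own. A hedged usage sketch with made-up example values (the real program presumably reads these from a file):
# Hypothetical input data; replace with the actual salary / loan-amount source
sal = [2.5, 3.0, 4.2, 5.0, 6.5, 8.0]
loanamount = [1.0, 1.4, 2.0, 2.3, 3.1, 3.9]
b0, b1 = estimate_coef(sal, loanamount)
plt.scatter(sal, loanamount, label='data points')
plt.plot(sal, [b0 + b1 * xi for xi in sal], label='fitted line')
plt.legend()
plt.show()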