New ML Lab Manual
#PROGRAM-1:Implement and demonstrate the FIND-S algorithm for finding the most
specific hypothesis based on a given set of training data samples. Read the
training data from a .CSV file.
program1.py
import csv

# read all rows of the training data from the CSV file
with open('program1.csv', 'r') as f:
    reader = csv.reader(f)
    your_list = list(reader)

# start from the most specific hypothesis: every attribute is '0'
h = [['0', '0', '0', '0', '0', '0']]
for i in your_list:
    print(i)
    if i[-1] == "True":              # consider only positive examples
        j = 0
        for x in i:
            if x != "True":          # skip the class label itself
                if x != h[0][j] and h[0][j] == '0':
                    h[0][j] = x      # first positive example fixes the value
                elif x != h[0][j] and h[0][j] != '0':
                    h[0][j] = '?'    # conflicting values generalise to '?'
                j = j + 1

print("most specific hypothesis is")
print(h)
program1.csv
Sky,AirTemp,Humidity,Wind,Water,Forecast,EnjoySport
Sunny,Warm,Normal,Strong,Warm,Same,True
Sunny,Warm,High,Strong,Warm,Same,True
Rainy,Cold,High,Strong,Warm,Change,False
Sunny,Warm,High,Strong,Cool,Change,True
#OUTPUT FOR PROGRAM-1
['Sky', 'AirTemp', 'Humidity', 'Wind', 'Water', 'Forecast', 'EnjoySport']
['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same', 'True']
['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same', 'True']
['Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change', 'False']
['Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change', 'True']
most specific hypothesis is
[['Sunny', 'Warm', '?', 'Strong', '?', '?']]
#PROGRAM-2:For a given set of training data examples stored in a .CSV file,
implement and demonstrate the Candidate-Elimination algorithm to output a
description of the set of all hypotheses consistent with the training examples.
program2.py
import pandas as pd

df = pd.read_csv("program2.csv")
# split the examples into positive (YES) and negative (NO) rows
spe_df = df.loc[df["enjoysport"].str.upper() == "YES"]
gene_df = df.loc[df["enjoysport"].str.upper() == "NO"]
spe_df = spe_df.iloc[:, :-1]    # drop the class column
gene_df = gene_df.iloc[:, :-1]

# specific boundary: generalise attributes that differ across positive examples
base = spe_df.iloc[0]
for x in range(1, len(spe_df)):
    base = base.where(spe_df.iloc[x] == base, other="???")
print("Specific:-\n", base.values)

# general boundary: keep only attributes that rule out the negative examples
for x in range(len(gene_df)):
    base = base.where(base != gene_df.iloc[x], other="???")
print("General")
for i, x in enumerate(base):
    if x != "???":
        l = ["???"] * len(base)
        l[i] = x
        print(l)
program2.csv
sky,airtemp,humidity,wind,water,forecast,enjoysport
sunny,warm,normal,strong,warm,same,YES
sunny,warm,high,strong,warm,same,YES
rainy,cold,high,strong,warm,change,NO
sunny,warm,high,strong,cool,change,YES
#OUTPUT FOR PROGRAM-2
Specific:-
['sunny' 'warm' '???' 'strong' '???' '???']
General
['sunny', '???', '???', '???', '???', '???']
['???', 'warm', '???', '???', '???', '???']
#PROGRAM-3:Write a program to demonstrate the working of the decision tree
based ID3 algorithm. Use an appropriate data set for building the decision tree
and apply this knowledge to classify a new sample.
program3.py
import pandas as pd
import numpy as np

# header=0 skips the CSV's own header row while renaming the columns
dataset = pd.read_csv('playtennis.csv', header=0,
                      names=['outlook', 'temperature', 'humidity', 'wind', 'class'])

def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = np.sum([(-counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts))
                      for i in range(len(elements))])
    return entropy

def InfoGain(data, split_attribute_name, target_name="class"):
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    Weighted_entropy = np.sum([(counts[i] / np.sum(counts)) *
                               entropy(data.where(data[split_attribute_name] == vals[i]).dropna()[target_name])
                               for i in range(len(vals))])
    return total_entropy - Weighted_entropy

def ID3(data, originaldata, features, target_attribute_name="class", parent_node_class=None):
    # all remaining examples share one class: return that class as a leaf
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    # no examples left: return the majority class of the original data
    elif len(data) == 0:
        return np.unique(originaldata[target_attribute_name])[
            np.argmax(np.unique(originaldata[target_attribute_name], return_counts=True)[1])]
    # no features left to split on: return the parent's majority class
    elif len(features) == 0:
        return parent_node_class
    else:
        parent_node_class = np.unique(data[target_attribute_name])[
            np.argmax(np.unique(data[target_attribute_name], return_counts=True)[1])]
        # split on the feature with the highest information gain
        item_values = [InfoGain(data, feature, target_attribute_name) for feature in features]
        best_feature = features[np.argmax(item_values)]
        tree = {best_feature: {}}
        features = [i for i in features if i != best_feature]
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = ID3(sub_data, dataset, features, target_attribute_name, parent_node_class)
            tree[best_feature][value] = subtree
        return tree

def predict(query, tree, default=1):
    for key in list(query.keys()):
        if key in list(tree.keys()):
            try:
                result = tree[key][query[key]]
            except KeyError:
                return default
            if isinstance(result, dict):
                return predict(query, result)
            else:
                return result

def train_test_split(dataset):
    # all 14 rows are used as training data in this demonstration
    training_data = dataset.iloc[:14].reset_index(drop=True)
    return training_data

def test(data, tree):
    queries = data.iloc[:, :-1].to_dict(orient="records")
    predicted = pd.DataFrame(columns=["predicted"])
    for i in range(len(data)):
        predicted.loc[i, "predicted"] = predict(queries[i], tree, 1.0)
    accuracy = (np.sum(predicted["predicted"] == data["class"]) / len(data)) * 100
    print('The predicted accuracy is:', accuracy, '%')

training_data = train_test_split(dataset)
tree = ID3(training_data, training_data, training_data.columns[:-1])
print('\nDisplay Tree\n', tree)
print('len=', len(training_data))
test(training_data, tree)
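The program statement also asks that the learned tree classify a new sample. A
minimal, hypothetical query (the keys must match the lowercase column names and
the values must use the spellings from the CSV) could be:

# hypothetical new sample, for illustration only
query = {'outlook': 'Sunny', 'temperature': 'Cool', 'humidity': 'High', 'wind': 'Strong'}
print('Prediction for the new sample:', predict(query, tree))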
playtennis.csv
Outlook,Temperature,Humidity,Windy,PlayTennis
Sunny,Hot,High,Weak,No
Sunny,Hot,High,Strong,No
Overcast,Hot,High,Weak,Yes
Rain,Mild,High,Weak,Yes
Rain,Cool,Normal,Weak,Yes
Rain,Cool,Normal,Strong,No
Overcast,Cool,Normal,Strong,Yes
Sunny,Mild,High,Weak,No
Sunny,Cool,Normal,Weak,Yes
Rain,Mild,Normal,Weak,Yes
Sunny,Mild,Normal,Strong,Yes
Overcast,Mild,High,Strong,Yes
Overcast,Hot,Normal,Weak,Yes
Rain,Mild,High,Strong,No
#OUTPUT FOR PROGRAM-3
Display Tree
{'outlook': {'Overcast': 'Yes', 'Rain': {'wind': {'Strong': 'No', 'Weak':
'Yes'}}, 'Sunny': {'humidity': {'High': 'No', 'Normal': 'Yes'}}}}
len= 14
The predicted accuracy is: 100.0 %
#PROGRAM-4:Build an Artificial Neural Network by implementing the
Backpropagation algorithm and test the same using appropriate data sets.
program4.py
import numpy as np

x = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
x = x / np.amax(x, axis=0)   # normalise each input feature to [0, 1]
y = y / 100                  # scale targets to [0, 1]

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def derivatives_sigmoid(x):
    return x * (1 - x)

epoch = 5000
lr = 0.1
inputlayer_neurons = 2
hiddenlayer_neurons = 3
output_neurons = 1

# random initial weights and biases
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    # forward pass
    hinp = np.dot(x, wh) + bh
    hlayer_act = sigmoid(hinp)
    outinp = np.dot(hlayer_act, wout) + bout
    output = sigmoid(outinp)
    # backpropagation of the error
    E0 = y - output
    outgrad = derivatives_sigmoid(output)
    d_output = E0 * outgrad
    EH = d_output.dot(wout.T)
    hiddengrad = derivatives_sigmoid(hlayer_act)
    d_hiddenlayer = EH * hiddengrad
    # gradient-descent weight updates
    wout += hlayer_act.T.dot(d_output) * lr
    wh += x.T.dot(d_hiddenlayer) * lr

print("Input:\n" + str(x))
print("Actual Output: \n" + str(y))
print("PredictedOutput: \n", output)
#OUTPUT FOR PROGRAM-4
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
PredictedOutput:
[[0.86284816]
[0.85062962]
[0.86425164]]
#PROGRAM-5:Write a program to implement the naïve Bayesian classifier for a
sample training data set stored as a .CSV file. Compute the accuracy of the
classifier, considering few test data sets.
program5.py
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data = pd.read_csv('playtennis.csv')
print("The first 5 values of data is :\n", data.head())
x = data.iloc[:, :-1]
print("\nThe first 5 values of train data is\n", x.head())
y = data.iloc[:, -1]
print("\nThe first 5 values of train output is\n", y.head())

# encode every categorical column as integers
le_Outlook = LabelEncoder()
x.Outlook = le_Outlook.fit_transform(x.Outlook)
le_Temperature = LabelEncoder()
x.Temperature = le_Temperature.fit_transform(x.Temperature)
le_Humidity = LabelEncoder()
x.Humidity = le_Humidity.fit_transform(x.Humidity)
le_Windy = LabelEncoder()
x.Windy = le_Windy.fit_transform(x.Windy)
le_PlayTennis = LabelEncoder()
y = le_PlayTennis.fit_transform(y)
print("\nNow the train output is \n", y)

# hold out part of the data to measure accuracy, then fit the classifier
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)
classifier = GaussianNB()
classifier.fit(x_train, y_train)
print("Accuracy is:", accuracy_score(classifier.predict(x_test), y_test))
#DATASET PROGRAM-5
playtennis.csv
Outlook,Temperature,Humidity,Windy,PlayTennis
Sunny,Hot,High,Weak,No
Sunny,Hot,High,Strong,No
Overcast,Hot,High,Weak,Yes
Rain,Mild,High,Weak,Yes
Rain,Cool,Normal,Weak,Yes
Rain,Cool,Normal,Strong,No
Overcast,Cool,Normal,Strong,Yes
Sunny,Mild,High,Weak,No
Sunny,Cool,Normal,Weak,Yes
Rain,Mild,Normal,Weak,Yes
Sunny,Mild,Normal,Strong,Yes
Overcast,Mild,High,Strong,Yes
Overcast,Hot,Normal,Weak,Yes
Rain,Mild,High,Strong,No
#PROGRAM-6:Assuming a set of documents that need to be classified, use the naïve
Bayesian classifier model to perform this task. Calculate the accuracy,
precision, and recall for your data set.
program6.py
import pandas as pd
msg=pd.read_csv('prog6.csv',names=['message','label'])
print('The dimensions of the dataset',msg.shape)
msg['labelnum']=msg.label.map({'pos':1,'neg':0})
x=msg.message
y=msg.labelnum
print(x)
print(y)
prog6.csv
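The script above stops after loading and labelling the data (the contents of
prog6.csv are not reproduced here). A minimal continuation to actually train and
score a classifier, assuming the usual bag-of-words pipeline for this exercise,
might look like:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

xtrain, xtest, ytrain, ytest = train_test_split(x, y)
cv = CountVectorizer()
xtrain_dtm = cv.fit_transform(xtrain)   # document-term matrix of the training messages
xtest_dtm = cv.transform(xtest)
clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)
print('Accuracy:', metrics.accuracy_score(ytest, predicted))
print('Precision:', metrics.precision_score(ytest, predicted))
print('Recall:', metrics.recall_score(ytest, predicted))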
#PROGRAM-7:Write a program to construct a Bayesian network considering medical
data. Use this model to demonstrate the diagnosis of heart patients using a
standard Heart Disease data set.
program7.py
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

data = pd.read_csv("program7.csv")
heart_disease = pd.DataFrame(data)
print(heart_disease)

# Hand-specified network structure over the dataset's columns. This is one
# plausible DAG; the exact edge list is an assumption, adjust as needed.
model = BayesianModel([
    ('age', 'Lifestyle'),
    ('Gender', 'Lifestyle'),
    ('Family', 'heartdisease'),
    ('diet', 'cholestrol'),
    ('Lifestyle', 'diet'),
    ('cholestrol', 'heartdisease'),
    ('diet', 'heartdisease')
])
# learn the conditional probability tables from the data
model.fit(heart_disease, estimator=MaximumLikelihoodEstimator)
HeartDisease_infer = VariableElimination(model)

q = HeartDisease_infer.query(variables=['heartdisease'], evidence={
    'age': int(input('Enter age:')),
    'Gender': int(input('Enter Gender:')),
    'Family': int(input('Enter Family History:')),
    'diet': int(input('Enter Diet:')),
    'Lifestyle': int(input('Enter Lifestyle:')),
    'cholestrol': int(input('Enter cholestrol:'))
})
print(q)
#DATASET OF PROGRAM-7:
program7.csv
age,Gender,Family,diet,Lifestyle,cholestrol,heartdisease
0,0,1,1,3,0,1
0,1,1,1,3,0,1
1,0,0,0,2,1,1
4,0,1,1,3,2,0
3,1,1,0,0,2,0
2,0,1,1,1,0,1
4,0,1,0,2,0,1
0,0,1,1,3,0,1
3,1,1,0,0,2,0
1,1,0,0,0,2,1
4,1,0,1,2,0,1
4,0,1,1,3,2,0
2,1,0,0,0,0,0
2,0,1,1,1,0,1
3,1,1,0,0,1,0
0,0,1,0,0,2,1
1,1,0,1,2,1,1
3,1,1,1,0,1,0
4,0,1,1,3,2,0
#PROGRAM-8:Apply the EM algorithm to cluster a set of data stored in a .CSV
file. Use the same data set for clustering using the k-Means algorithm. Compare
the results of these two algorithms and comment on the quality of clustering.
program8.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

X = pd.read_csv("program8.csv")
x1 = X['V1'].values
x2 = X['V2'].values
X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)

plt.plot()
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.title('Dataset')
plt.scatter(x1, x2)
plt.show()

# EM clustering via a Gaussian mixture model
gmm = GaussianMixture(n_components=3)
gmm.fit(X)
em_predictions = gmm.predict(X)
print("\nEM predictions")
print(em_predictions)
print("mean:\n", gmm.means_)
print('\n')
print("Covariances\n", gmm.covariances_)
print(X)
plt.title('Expectation Maximization')
plt.scatter(X[:, 0], X[:, 1], c=em_predictions, s=50)
plt.show()

# k-Means clustering on the same data, for comparison with EM
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
print("\nk-Means cluster centres:\n", kmeans.cluster_centers_)
plt.title('KMEANS')
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, s=50)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], color='black')
plt.show()
program8.csv
V1,V2
2.072345,-3.24169
17.93671,15.78481
1.083576,7.319176
11.12067,14.40678
23.71155,2.557729
24.16993,32.02478
21.66578,4.892855
4.693684,12.34217
19.21191,-1.12137
4.230391,-4.44154
9.12713,23.60572
0.407503,15.29705
7.314846,3.309312
-3.4384,-12.0253
17.63935,-3.21235
4.415292,22.81555
11.94122,8.122487
0.725853,1.806819
8.815273,28.1326
-5.77359,1.0248
18.76943,24.16946
#OUTPUT FOR PROGRAM-8:
#PROGRAM-9:Write a program to implement k-Nearest Neighbour algorithm to
classify the iris data set. Print both correct and wrong predictions.
Java/Python ML library classes can be used for this problem.
program9.py
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# load the iris data set and hold out a quarter of it for testing
iris = datasets.load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25)

classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print('Confusion matrix is as follows')
print(confusion_matrix(y_test, y_pred))
print('Accuracy Metrics')
print(classification_report(y_test, y_pred))
#OUTPUT FOR PROGRAM-9 (excerpt of the classification report):
accuracy 0.95 38
macro avg 0.94 0.94 0.94 38
weighted avg 0.95 0.95 0.95 38
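The program statement also asks that both correct and wrong predictions be
printed; one simple way to do this, given the split above, is:

for i in range(len(y_test)):
    status = 'Correct' if y_pred[i] == y_test[i] else 'Wrong'
    print(status, '- predicted:', iris.target_names[y_pred[i]],
          'actual:', iris.target_names[y_test[i]])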
#PROGRAM-10:Implement the non-parametric Locally Weighted Regression algorithm
in order to fit data points. Select an appropriate data set for your experiment
and draw graphs.
program10.py
import math
import numpy as np

def lowess(x, y, f, iterations):
    # non-parametric locally weighted linear regression with robustifying passes
    n = len(x)
    r = int(np.ceil(f * n))
    # bandwidth per point: distance to the r-th nearest neighbour
    h = [np.sort(np.abs(x - x[i]))[r] for i in range(n)]
    w = np.clip(np.abs((x[:, None] - x[None, :]) / h), 0.0, 1.0)
    w = (1 - w ** 3) ** 3          # tricube kernel weights
    yest = np.zeros(n)
    delta = np.ones(n)
    for iteration in range(iterations):
        for i in range(n):
            # weighted least-squares fit of a line around point i
            weights = delta * w[:, i]
            b = np.array([np.sum(weights * y), np.sum(weights * y * x)])
            A = np.array([[np.sum(weights), np.sum(weights * x)],
                          [np.sum(weights * x), np.sum(weights * x * x)]])
            beta = np.linalg.solve(A, b)
            yest[i] = beta[0] + beta[1] * x[i]
        # robustifying step: down-weight points with large residuals
        residuals = y - yest
        s = np.median(np.abs(residuals))
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta ** 2) ** 2
    return yest

n = 100
x = np.linspace(0, 2 * math.pi, n)
y = np.sin(x) + 0.3 * np.random.randn(n)
f = 0.25
iterations = 3
yest = lowess(x, y, f, iterations)
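The program statement asks for graphs; a minimal plotting step (assuming
matplotlib is available) to show the noisy data against the smoothed fit:

import matplotlib.pyplot as plt
plt.plot(x, y, "r.", label="noisy data")     # the scattered observations
plt.plot(x, yest, "b-", label="LOWESS fit")  # the locally weighted estimate
plt.legend()
plt.show()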