ML Lab
Experiment-1
AIM: Implement and demonstrate the FIND-S algorithm for finding the most specific hypothesis based on a given set of training data samples. Read the training data from a .CSV file.
Data set:
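The original data-set table is not reproduced in this copy, but it is fully recoverable from the OUTPUT below. The program reads enjoyysports.csv without a header row, so the file contains just the four training instances:
sunny,warm,normal,strong,warm,same,yes
sunny,warm,high,strong,warm,same,yes
rainy,cold,high,strong,warm,change,no
sunny,warm,high,strong,cool,change,yes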
PROGRAM:
import csv

num_attributes = 6
a = []

print("\n The Given Training Data Set \n")
with open('enjoyysports.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        a.append(row)
        print(row)

print("\n The initial value of hypothesis: ")
hypothesis = ['0'] * num_attributes
print(hypothesis)

# Initialize the hypothesis with the attribute values of the first instance
for j in range(0, num_attributes):
    hypothesis[j] = a[0][j]

print("\n Find S: Finding a Maximally Specific Hypothesis\n")
for i in range(0, len(a)):
    # Only positive examples (label 'yes') can generalize the hypothesis
    if a[i][num_attributes] == 'yes':
        for j in range(0, num_attributes):
            if a[i][j] != hypothesis[j]:
                hypothesis[j] = '?'  # generalize the mismatching attribute
            else:
                hypothesis[j] = a[i][j]
    print(" For Training instance No:{0} the hypothesis is".format(i), hypothesis)

print("\n The Maximally Specific Hypothesis for a given Training Examples :\n")
print(hypothesis)
OUTPUT:
The Given Training Data Set

['sunny', 'warm', 'normal', 'strong', 'warm', 'same', 'yes']
['sunny', 'warm', 'high', 'strong', 'warm', 'same', 'yes']
['rainy', 'cold', 'high', 'strong', 'warm', 'change', 'no']
['sunny', 'warm', 'high', 'strong', 'cool', 'change', 'yes']

The initial value of hypothesis:
['0', '0', '0', '0', '0', '0']

Find S: Finding a Maximally Specific Hypothesis

For Training instance No: 0 the hypothesis is ['sunny', 'warm', 'normal', 'strong', 'warm', 'same']
For Training instance No: 1 the hypothesis is ['sunny', 'warm', '?', 'strong', 'warm', 'same']
For Training instance No: 2 the hypothesis is ['sunny', 'warm', '?', 'strong', 'warm', 'same']
For Training instance No: 3 the hypothesis is ['sunny', 'warm', '?', 'strong', '?', '?']

The Maximally Specific Hypothesis for a given Training Examples:
['sunny', 'warm', '?', 'strong', '?', '?']
Experiment-2
AIM: For a given set of training data examples stored in a .CSV file, implement and demonstrate the
Candidate-Elimination algorithm to output a description of the set of all hypotheses consistent with the training
examples.
Data set:
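The same EnjoySport training data as Experiment-1 is used. Note that pandas.read_csv treats the first row as column headers, so here enjoyysports.csv needs a header row; the attribute names below are illustrative assumptions:
sky,airtemp,humidity,wind,water,forecast,enjoysport
sunny,warm,normal,strong,warm,same,yes
sunny,warm,high,strong,warm,same,yes
rainy,cold,high,strong,warm,change,no
sunny,warm,high,strong,cool,change,yes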
PROGRAM:
import numpy as np
import pandas as pd

# Load the training examples: all columns but the last are attributes,
# the last column is the target concept
data = pd.read_csv('enjoyysports.csv')
concepts = np.array(data.iloc[:, 0:-1])
print(concepts)
target = np.array(data.iloc[:, -1])
print(target)

def learn(concepts, target):
    # The specific boundary starts at the first training instance
    specific_h = concepts[0].copy()
    print("Initialization of specific_h and general_h")
    print(specific_h)
    # The general boundary starts maximally general: all '?'
    general_h = [["?" for i in range(len(specific_h))] for i in range(len(specific_h))]
    print(general_h)
    for i, h in enumerate(concepts):
        if target[i] == "yes":
            # Positive example: generalize specific_h where it disagrees
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    specific_h[x] = '?'
                    general_h[x][x] = '?'
            print("Specific Hypothesis after step", i + 1)
            print(specific_h)
            print("General Hypothesis after step", i + 1)
            print(general_h)
        if target[i] == "no":
            # Negative example: specialize general_h on the attributes
            # where the example differs from specific_h
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    general_h[x][x] = specific_h[x]
                else:
                    general_h[x][x] = '?'
    # Drop hypotheses that remained fully general
    indices = [i for i, val in enumerate(general_h) if val == ['?', '?', '?', '?', '?', '?']]
    for i in indices:
        general_h.remove(['?', '?', '?', '?', '?', '?'])
    return specific_h, general_h

s_final, g_final = learn(concepts, target)
print("Final Specific_h:", s_final, sep="\n")
print("Final General_h:", g_final, sep="\n")
OUTPUT:
[['sunny' 'warm' 'normal' 'strong' 'warm' 'same']
['sunny' 'warm' 'high' 'strong' 'warm' 'same']
['rainy' 'cold' 'high' 'strong' 'warm' 'change']
['sunny' 'warm' 'high' 'strong' 'cool' 'change']]
['yes' 'yes' 'no' 'yes']
Initialization of specific_h and general_h
['sunny' 'warm' 'normal' 'strong' 'warm' 'same']
[['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'],
['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]
Specific Hypothesis after step 1
['sunny' 'warm' 'normal' 'strong' 'warm' 'same']
General Hypothesis after step 1
[['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'],
['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]
Specific Hypothesis after step 2
['sunny' 'warm' '?' 'strong' 'warm' 'same']
General Hypothesis after step 2
[['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'],
['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]
Specific Hypothesis after step 4
['sunny' 'warm' '?' 'strong' '?' '?']
General Hypothesis after step 4
[['sunny', '?', '?', '?', '?', '?'], ['?', 'warm', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'],
['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]
Final Specific_h:
['sunny' 'warm' '?' 'strong' '?' '?']
Final General_h:
[['sunny', '?', '?', '?', '?', '?'], ['?', 'warm', '?', '?', '?', '?']]
Experiment-3
AIM: Write a program to demonstrate the working of the decision tree based ID3 algorithm. Use an appropriate data set for building the decision tree and apply this knowledge to classify a new sample.
Data set:
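The original data-set listing is not included in this copy. The tree in the OUTPUT is exactly what ID3 produces on the classic PlayTennis data (Mitchell, Table 3.2), so ID3.csv is presumably that table with a header row ('outlook', 'humidity' and 'windy' appear in the tree; the 'temperature' and 'play' column names are assumptions):
outlook,temperature,humidity,windy,play
sunny,hot,high,weak,no
sunny,hot,high,strong,no
overcast,hot,high,weak,yes
rainy,mild,high,weak,yes
rainy,cool,normal,weak,yes
rainy,cool,normal,strong,no
overcast,cool,normal,strong,yes
sunny,mild,high,weak,no
sunny,cool,normal,weak,yes
rainy,mild,normal,weak,yes
sunny,mild,normal,strong,yes
overcast,mild,high,strong,yes
overcast,hot,normal,weak,yes
rainy,mild,normal,strong,no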
PROGRAM:
import numpy as np
import math
import csv

def read_data(filename):
    # First row of the CSV holds the attribute names; the rest are instances
    with open(filename, 'r') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',')
        headers = next(datareader)
        metadata = []
        traindata = []
        for name in headers:
            metadata.append(name)
        for row in datareader:
            traindata.append(row)
    return (metadata, traindata)

class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""

    def __str__(self):
        return self.attribute

def subtables(data, col, delete):
    # Partition the rows of data by the distinct values of the given column
    dict = {}
    items = np.unique(data[:, col])
    count = np.zeros((items.shape[0], 1), dtype=np.int32)
    for x in range(items.shape[0]):
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                count[x] += 1
    for x in range(items.shape[0]):
        dict[items[x]] = np.empty((int(count[x]), data.shape[1]), dtype="|S32")
        pos = 0
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                dict[items[x]][pos] = data[y]
                pos += 1
        if delete:
            dict[items[x]] = np.delete(dict[items[x]], col, 1)
    return items, dict

def entropy(S):
    # Shannon entropy of the label column S
    items = np.unique(S)
    if items.size == 1:
        return 0
    counts = np.zeros((items.shape[0], 1))
    sums = 0
    for x in range(items.shape[0]):
        counts[x] = sum(S == items[x]) / (S.size * 1.0)
    for count in counts:
        sums += -1 * count * math.log(count, 2)
    return sums

def gain_ratio(data, col):
    # Information gain of splitting on col, normalized by the intrinsic value
    items, dict = subtables(data, col, delete=False)
    total_size = data.shape[0]
    entropies = np.zeros((items.shape[0], 1))
    intrinsic = np.zeros((items.shape[0], 1))
    for x in range(items.shape[0]):
        ratio = dict[items[x]].shape[0] / (total_size * 1.0)
        entropies[x] = ratio * entropy(dict[items[x]][:, -1])
        intrinsic[x] = ratio * math.log(ratio, 2)
    total_entropy = entropy(data[:, -1])
    iv = -1 * sum(intrinsic)
    for x in range(entropies.shape[0]):
        total_entropy -= entropies[x]
    return total_entropy / iv

def create_node(data, metadata):
    # If all examples carry the same label, return a leaf with that answer
    if (np.unique(data[:, -1])).shape[0] == 1:
        node = Node("")
        node.answer = np.unique(data[:, -1])[0]
        return node
    # Otherwise split on the attribute with the highest gain ratio
    gains = np.zeros((data.shape[1] - 1, 1))
    for col in range(data.shape[1] - 1):
        gains[col] = gain_ratio(data, col)
    split = np.argmax(gains)
    node = Node(metadata[split])
    metadata = np.delete(metadata, split, 0)
    items, dict = subtables(data, split, delete=True)
    for x in range(items.shape[0]):
        child = create_node(dict[items[x]], metadata)
        node.children.append((items[x], child))
    return node

def empty(size):
    # Indentation string used when pretty-printing the tree
    s = ""
    for x in range(size):
        s += " "
    return s

def print_tree(node, level):
    if node.answer != "":
        print(empty(level), node.answer)
        return
    print(empty(level), node.attribute)
    for value, n in node.children:
        print(empty(level + 1), value)
        print_tree(n, level + 2)

metadata, traindata = read_data("ID3.csv")
data = np.array(traindata)
node = create_node(data, metadata)
print_tree(node, 0)
OUTPUT:
outlook
  overcast
    b'yes'
  rainy
    windy
      b'strong'
        b'no'
      b'weak'
        b'yes'
  sunny
    humidity
      b'high'
        b'no'
      b'normal'
        b'yes'
Experiment-4
AIM: Exercises to solve real-world problems using the following machine learning methods:
a) Linear Regression
b) Logistic Regression
c) Binary Classifier
a) Linear Regression:
Linear regression is probably one of the most important and widely used regression techniques. It’s among the
simplest regression methods. One of its main advantages is the ease of interpreting results.
PROGRAM:
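A minimal sketch of simple linear regression with scikit-learn (the original listing is not reproduced here); the hours-studied vs. exam-score data is an illustrative assumption, not the lab's original data set:

import numpy as np
from sklearn.linear_model import LinearRegression

# Assumed toy data: hours studied (feature) vs. exam score (target)
x = np.array([1, 2, 3, 4, 5, 6, 7, 8]).reshape(-1, 1)
y = np.array([35, 40, 50, 55, 62, 68, 75, 81])

# Fit an ordinary least-squares line y = b0 + b1*x
model = LinearRegression()
model.fit(x, y)

print("Coefficient :", model.coef_[0])
print("Intercept   :", model.intercept_)
print("R^2 score   :", model.score(x, y))
print("Prediction for 9 hours :", model.predict([[9]])[0])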
b) Logistic Regression:
Logistic regression is a classification algorithm used when the target variable is categorical. Its main objective is to model the relationship between the features and the probability of a particular outcome.
Data Set:
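The program reads User_Data.csv. From the code and the plot labels, columns 2 and 3 are Age and EstimatedSalary (the features) and column 4 is Purchased (the binary target); the first two columns (e.g. User ID and Gender) are unused, and their names here are assumptions.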
PROGRAM:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Age and EstimatedSalary are the features; Purchased is the target
dataset = pd.read_csv("User_Data.csv")
x = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Split into training and test sets
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=0)

# Standardize the features
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
xtrain = sc_x.fit_transform(xtrain)
xtest = sc_x.transform(xtest)
print(xtrain[0:10, :])

# Fit the logistic regression classifier and predict on the test set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(xtrain, ytrain)
y_pred = classifier.predict(xtest)

# Evaluate with a confusion matrix and accuracy
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, y_pred)
print("Confusion Matrix : \n", cm)
from sklearn.metrics import accuracy_score
print("Accuracy : ", accuracy_score(ytest, y_pred))

# Visualize the decision regions over the test set
from matplotlib.colors import ListedColormap
X_set, y_set = xtest, ytest
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Classifier (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
OUTPUT:
1.02601437]
[-0.28717375 0.70708966]
[-1.26182405 0.4720925 ]
[-0.40900504 -0.49727077]
[-0.28717375 -0.0566511 ]
[ 0.32198269 -1.23163688]
[ 0.68747655 0.14897141]
[ 0.32198269 2.6458162 ]
[ 1.90578942 -0.99663973]
[-0.40900504 -0.23289897]]
Confusion Matrix :
[[4 0]
[0 1]]
Accuracy : 1.0
c) Binary Classifier:
In machine learning, binary classification is a supervised learning task that categorizes new observations into one of two classes. Consider a medical test as an example. If the model correctly predicts a diseased patient as positive, the case is called a True Positive (TP); if it correctly predicts a healthy patient as negative, a True Negative (TN). The classifier may also misdiagnose some patients: if a diseased patient is classified as healthy by a negative test result, the error is called a False Negative (FN); similarly, if a healthy patient is classified as diseased by a positive test result, the error is called a False Positive (FP).
PROGRAM:
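A minimal sketch that reproduces the shape of the OUTPUT below (the original listing is not reproduced here), generating a synthetic two-class data set with scikit-learn's make_blobs; the parameters, in particular random_state=1, are assumptions:

from collections import Counter
from sklearn.datasets import make_blobs

# Generate a synthetic binary data set: 1000 samples, 2 features,
# 2 equally sized classes
X, y = make_blobs(n_samples=1000, centers=2, random_state=1)
print(X.shape, y.shape)

# Class distribution: 500 examples per class
print(Counter(y))

# First ten examples with their class labels
for i in range(10):
    print(X[i], y[i])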
OUTPUT:
(1000, 2) (1000,)
Counter({0: 500, 1: 500})
[-3.05837272 4.48825769] 0
[-8.60973869 -3.72714879] 1
[1.37129721 5.23107449] 0
[-9.33917563 -2.9544469 ] 1
[-11.57178593 -3.85275513] 1
[-11.42257341 -4.85679127] 1
[-10.44518578 -3.76476563] 1
[-10.44603561 -3.26065964] 1
[-0.61947075 3.48804983] 0
[-10.91115591 -4.5772537] 1