Machine Learning practical file
1. Implement and demonstrate the FIND-S algorithm for finding the most specific hypothesis based on a given set of training data samples. Read the training data from a .CSV file.
Program:
import csv
num_attributes = 6
a = []
print("\n The Given Training Data Set \n")
with open('enjoysport.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        a.append(row)
        print(row)
print("\n The initial value of hypothesis: ")
hypothesis = ['0'] * num_attributes
print(hypothesis)
for j in range(0, num_attributes):
    hypothesis[j] = a[0][j]
print("\n Find S: Finding a Maximally Specific Hypothesis\n")
for i in range(0, len(a)):
    if a[i][num_attributes] == 'yes':
        for j in range(0, num_attributes):
            if a[i][j] != hypothesis[j]:
                hypothesis[j] = '?'
            else:
                hypothesis[j] = a[i][j]
    print(" For Training instance No:{0} the hypothesis is ".format(i), hypothesis)
print("\n The Maximally Specific Hypothesis for the given Training Examples:\n")
print(hypothesis)
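For reference, the program assumes enjoysport.csv has six attribute columns followed by a yes/no label and no header row; the rows below are illustrative (taken from the classic EnjoySport example, not shipped with this file):
sunny,warm,normal,strong,warm,same,yes
sunny,warm,high,strong,warm,same,yes
rainy,cold,high,strong,warm,change,no
sunny,warm,high,strong,cool,change,yes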
2. For a given set of training data examples stored in a .CSV file, implement and demonstrate the Candidate Elimination algorithm to output a description of the set of all hypotheses consistent with the training examples.
import numpy as np
import pandas as pd
data=pd.DataFrame(data=pd.read_csv('trainingdata.csv'))
print(data)
concepts=np.array(data.iloc[:,0:-1])
print(concepts)
target=np.array(data.iloc[:,-1])
print(target)
def learn(concepts, target):
    '''
    learn() implements the learning method of the Candidate Elimination algorithm.
    Arguments:
        concepts - an array with all the feature rows
        target - an array with the corresponding output values
    '''
    specific_h = concepts[0].copy()
    general_h = [['?' for _ in range(len(specific_h))] for _ in range(len(specific_h))]
    for i, h in enumerate(concepts):
        if target[i] == "yes":
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    specific_h[x] = '?'
                    general_h[x][x] = '?'
        if target[i] == "no":
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    general_h[x][x] = specific_h[x]
                else:
                    general_h[x][x] = '?'
    # find indices where we have rows that are still all '?', meaning those that are unchanged
    indices = [i for i, val in enumerate(general_h) if val == ['?', '?', '?', '?', '?', '?']]
    for i in indices:
        # remove those rows from general_h
        general_h.remove(['?', '?', '?', '?', '?', '?'])
    # Return final values
    return specific_h, general_h
s_final, g_final = learn(concepts, target)
print("\nFinal Specific_h:", s_final, sep="\n")
print("\nFinal General_h:", g_final, sep="\n")
3. Write a program to demonstrate the working of the decision tree based ID3 algorithm. Use an appropriate data set for building the decision tree and apply this knowledge to classify a new sample.
#Import libraries and read data using read_csv() function. Remove the
target from the data and store attributes in the features variable.
import pandas as pd
import math
import numpy as np
data = pd.read_csv("Dataset/4-dataset.csv")
features = [feat for feat in data]
features.remove("answer")
#Create a class named Node with four members: children, value, isLeaf and pred.
class Node:
    def __init__(self):
        self.children = []
        self.value = ""
        self.isLeaf = False
        self.pred = ""
#Define a function called entropy to find the entropy of the dataset.
def entropy(examples):
    pos = 0.0
    neg = 0.0
    for _, row in examples.iterrows():
        if row["answer"] == "yes":
            pos += 1
        else:
            neg += 1
    if pos == 0.0 or neg == 0.0:
        return 0.0
    else:
        p = pos / (pos + neg)
        n = neg / (pos + neg)
        return -(p * math.log(p, 2) + n * math.log(n, 2))
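The ID3 routine below relies on an info_gain helper that is not shown; a minimal sketch, assuming it measures the reduction in entropy obtained by splitting on a single feature (not part of the original listing):
# Hedged sketch: information gain of splitting examples on attr, computed as the
# parent entropy minus the size-weighted entropy of each subset.
def info_gain(examples, attr):
    uniq = np.unique(examples[attr])
    gain = entropy(examples)
    for u in uniq:
        subdata = examples[examples[attr] == u]
        gain -= (float(len(subdata)) / float(len(examples))) * entropy(subdata)
    return gain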
#Define a function named ID3 to build the decision tree for the given dataset.
def ID3(examples, attrs):
    root = Node()
    max_gain = 0
    max_feat = ""
    for feature in attrs:
        #print("\n", examples)
        gain = info_gain(examples, feature)
        if gain > max_gain:
            max_gain = gain
            max_feat = feature
    root.value = max_feat
    #print("\nMax feature attr", max_feat)
    uniq = np.unique(examples[max_feat])
    #print("\n", uniq)
    for u in uniq:
        #print("\n", u)
        subdata = examples[examples[max_feat] == u]
        #print("\n", subdata)
        if entropy(subdata) == 0.0:
            newNode = Node()
            newNode.isLeaf = True
            newNode.value = u
            newNode.pred = np.unique(subdata["answer"])
            root.children.append(newNode)
        else:
            dummyNode = Node()
            dummyNode.value = u
            new_attrs = attrs.copy()
            new_attrs.remove(max_feat)
            child = ID3(subdata, new_attrs)
            dummyNode.children.append(child)
            root.children.append(dummyNode)
    return root
#Define a function named printTree to draw the decision tree
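The printTree routine itself does not appear in this listing; a minimal recursive sketch consistent with the Node class above (an assumption, not the original code):
# Hedged sketch: indent by depth and recurse through the children, showing the
# predicted class at each leaf node.
def printTree(root, depth=0):
    print("\t" * depth, root.value, end="")
    if root.isLeaf:
        print(" ->", root.pred, end="")
    print()
    for child in root.children:
        printTree(child, depth + 1)
# Illustrative usage (hedged): build the tree on the loaded data and print it.
root = ID3(data, features)
printTree(root)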
4. Build an Artificial Neural Network by implementing the Backpropagation algorithm and test the same using appropriate data sets.
import random
from math import exp
from random import seed
# Calculate neuron activation for an input
def activate(weights, inputs):
    activation = weights[-1]
    for i in range(len(weights) - 1):
        activation += weights[i] * inputs[i]
    return activation
# Backpropagate the error and store the deltas in the neurons
def backward_propagate_error(network, expected):
    for i in reversed(range(len(network))):
        layer = network[i]
        errors = list()
        if i != len(network) - 1:
            for j in range(len(layer)):
                error = 0.0
                for neuron in network[i + 1]:
                    error += (neuron['weights'][j] * neuron['delta'])
                errors.append(error)
        else:
            for j in range(len(layer)):
                neuron = layer[j]
                errors.append(expected[j] - neuron['output'])
        for j in range(len(layer)):
            neuron = layer[j]
            neuron['delta'] = errors[j] * transfer_derivative(neuron['output'])
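The setup below calls initialize_network and transfer_derivative and uses n_inputs and n_outputs, none of which are defined in this listing; a minimal sketch under the usual single-hidden-layer assumption (the example sizes are assumptions, not from the original):
# Hedged sketch: build the network as a list of layers, each layer a list of
# neurons, each neuron a dict holding one random weight per input plus a bias.
def initialize_network(n_inputs, n_hidden, n_outputs):
    network = list()
    hidden_layer = [{'weights': [random.random() for _ in range(n_inputs + 1)]}
                    for _ in range(n_hidden)]
    network.append(hidden_layer)
    output_layer = [{'weights': [random.random() for _ in range(n_hidden + 1)]}
                    for _ in range(n_outputs)]
    network.append(output_layer)
    return network
# Hedged sketch: derivative of the sigmoid transfer function, used above.
def transfer_derivative(output):
    return output * (1.0 - output)
# Illustrative sizes, since the training dataset is not shown in this listing.
seed(1)
n_inputs, n_outputs = 2, 2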
# Network Initialization
network = initialize_network(n_inputs, 2, n_outputs)
i = 1
for layer in network:
    j = 1
    for sub in layer:
        print("\n Layer[%d] Node[%d]:\n" % (i, j), sub)
        j = j + 1
    i = i + 1
5. Write a program to implement the naïve Bayesian classifier for a sample training data set stored as a .CSV file. Compute the accuracy of the classifier, considering a few test data sets.
import csv
import random
import math
# 1.Data Handling
# 1.1 Loading the Data from csv file of Pima indians diabetes dataset.
def loadcsv(filename):
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    for i in range(len(dataset)):
        # converting the attributes from string to floating point numbers
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset
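The main program below also calls splitDataset, separateByClass, mean and stdev, which are not shown in this listing; minimal sketches following the usual naive Bayes pipeline for this dataset (assumptions, not the original code):
# Hedged sketch: randomly move rows into the training set until it holds splitRatio of the data.
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return trainSet, copy
# Hedged sketch: map each class value (last column) to the list of rows belonging to it.
def separateByClass(dataset):
    separated = {}
    for row in dataset:
        separated.setdefault(row[-1], []).append(row)
    return separated
# Hedged sketches: sample mean and standard deviation of a list of numbers.
def mean(numbers):
    return sum(numbers) / float(len(numbers))
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)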
# 2. Summarize Data
# The naive Bayes model is comprised of a summary of the data in the training dataset.
# This summary is then used when making predictions.
# It involves the mean and the standard deviation for each attribute, by class value.
def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries
# 3. Make Prediction
# 3.1 Calculate Probability Density Function
def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
# 3.2 Calculate Class Probabilities
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities
# 3.3 Prediction: look for the largest probability and return the associated class
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel
# 4. Make Predictions
# Function which returns a prediction for each instance in the test set
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions
# Main Function
def main():
    filename = 'C:\\Users\\Dr.Thyagaraju\\Desktop\\Data\\pima-indians-diabetes.csv'
    splitRatio = 0.67
    dataset = loadcsv(filename)
    print("\n The Data Set Splitting into Training and Testing \n")
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    # prepare model
    summaries = summarizeByClass(trainingSet)
    print("\n Model Summaries:\n", summaries)
    # test model
    predictions = getPredictions(summaries, testSet)
    print("\nPredictions:\n", predictions)
main()
6. Assuming a set of documents that need to be classified, use the naïve Bayesian classifier model to perform this task. Calculate the accuracy, precision, and recall for your data set.
import pandas as pd
msg = pd.read_csv('document.csv', names=['message', 'label'])
print("Total Instances of Dataset: ", msg.shape[0])
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
y = msg.labelnum
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
from sklearn.feature_extraction.text import CountVectorizer
count_v = CountVectorizer()
Xtrain_dm = count_v.fit_transform(Xtrain)
Xtest_dm = count_v.transform(Xtest)
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(Xtrain_dm, ytrain)
pred = clf.predict(Xtest_dm)
for doc, p in zip(Xtest, pred):
    p = 'pos' if p == 1 else 'neg'
    print("%s -> %s" % (doc, p))
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
print('Accuracy Metrics: \n')
print('Accuracy: ', accuracy_score(ytest, pred))
print('Recall: ', recall_score(ytest, pred))
print('Precision: ', precision_score(ytest, pred))
print('Confusion Matrix: \n', confusion_matrix(ytest, pred))
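For reference, document.csv is assumed to hold one message and its pos/neg label per row with no header; the rows below are illustrative, not from the source:
I love this sandwich,pos
This is an amazing place,pos
I am tired of this stuff,neg
He is my sworn enemy,neg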
7. Write a program to construct a Bayesian network considering medical data. Use this model to demonstrate the diagnosis of heart patients using a standard heart disease data set. You can use a Python ML library API.
import pandas as pd
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel
from pgmpy.inference import VariableElimination
data = pd.read_csv("ds4.csv")
heart_disease = pd.DataFrame(data)
print(heart_disease)
model = BayesianModel([
    ('age', 'Lifestyle'),
    ('Gender', 'Lifestyle'),
    ('Family', 'heartdisease'),
    ('diet', 'cholestrol'),
    ('Lifestyle', 'diet'),
    ('cholestrol', 'heartdisease')
])
model.fit(heart_disease, estimator=MaximumLikelihoodEstimator)
HeartDisease_infer = VariableElimination(model)
q = HeartDisease_infer.query(variables=['heartdisease'], evidence={
'age': int(input('Enter Age: ')),
'Gender': int(input('Enter Gender: ')),
'Family': int(input('Enter Family History: ')),
'diet': int(input('Enter Diet: ')),
'Lifestyle': int(input('Enter Lifestyle: ')),
'cholestrol': int(input('Enter Cholestrol: '))
})
print(q)
8. Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same data set for clustering using the k-Means algorithm. Compare the results of these two algorithms and comment on the quality of clustering. You can add Java/Python ML library classes/API in the program.
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture
from sklearn.datasets import load_iris
import sklearn.metrics as sm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset = load_iris()
# print(dataset)
X = pd.DataFrame(dataset.data)
X.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']
y = pd.DataFrame(dataset.target)
y.columns = ['Targets']
# print(X)
plt.figure(figsize=(14, 7))
colormap = np.array(['red', 'lime', 'black'])
# REAL PLOT
plt.subplot(1, 3, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Real')
# K-PLOT
plt.subplot(1, 3, 2)
model = KMeans(n_clusters=3)
model.fit(X)
predY = np.choose(model.labels_, [0, 1, 2]).astype(np.int64)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[predY], s=40)
plt.title('KMeans')
# GMM PLOT
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns=X.columns)
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)
y_cluster_gmm = gmm.predict(xs)
plt.subplot(1, 3, 3)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y_cluster_gmm], s=40)
plt.title('GMM Classification')
Output: a figure with three scatter plots of Petal_Length vs Petal_Width, titled 'Real', 'KMeans' and 'GMM Classification'.
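The exercise asks for a comparison of clustering quality; one way to quantify it, added here as a hedged sketch and not part of the original listing, is the adjusted Rand index from sklearn.metrics, which is invariant to how cluster labels are numbered:
# Hedged addition: score how well each clustering recovers the true species labels.
from sklearn.metrics import adjusted_rand_score
print("k-Means ARI:", adjusted_rand_score(y.Targets, model.labels_))
print("GMM ARI    :", adjusted_rand_score(y.Targets, y_cluster_gmm))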
9. Write a program to implement the k-Nearest Neighbour algorithm to classify the iris data set. Print both correct and incorrect predictions. Python ML library classes can be used for this problem.
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
dataset = load_iris()
# print(dataset)
X_train, X_test, y_train, y_test = train_test_split(dataset["data"], dataset["target"], random_state=0)
kn = KNeighborsClassifier(n_neighbors=1)
kn.fit(X_train, y_train)
for i in range(len(X_test)):
    x = X_test[i]
    x_new = np.array([x])
    prediction = kn.predict(x_new)
    print("TARGET=", y_test[i], dataset["target_names"][y_test[i]], "PREDICTED=", prediction, dataset["target_names"][prediction])
print(kn.score(X_test, y_test))
Output:
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 1 versicolor PREDICTED= [2] ['virginica']
0.9736842105263158
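The loop above prints every test instance; a small hedged addition (not in the original listing) that explicitly flags each prediction as correct or wrong:
# Hedged addition: label each test prediction as Correct or Wrong.
pred_all = kn.predict(X_test)
for target, p in zip(y_test, pred_all):
    status = "Correct" if target == p else "Wrong"
    print(status, "-> target:", dataset["target_names"][target], "predicted:", dataset["target_names"][p])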
10. Implement the non-parametric Locally Weighted Regression algorithm in order to fit data points. Select an appropriate data set for your experiment and draw graphs.
from math import ceil
import numpy as np
from scipy import linalg
def lowess(x, y, f, iterations):
    n = len(x)
    r = int(ceil(f * n))
    h = [np.sort(np.abs(x - x[i]))[r] for i in range(n)]
    w = np.clip(np.abs((x[:, None] - x[None, :]) / h), 0.0, 1.0)
    w = (1 - w ** 3) ** 3
    yest = np.zeros(n)
    delta = np.ones(n)
    for iteration in range(iterations):
        for i in range(n):
            weights = delta * w[:, i]
            b = np.array([np.sum(weights * y), np.sum(weights * y * x)])
            A = np.array([[np.sum(weights), np.sum(weights * x)],
                          [np.sum(weights * x), np.sum(weights * x * x)]])
            beta = linalg.solve(A, b)
            yest[i] = beta[0] + beta[1] * x[i]
        residuals = y - yest
        s = np.median(np.abs(residuals))
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta ** 2) ** 2
    return yest
import math
n = 100
x = np.linspace(0, 2 * math.pi, n)
y = np.sin(x) + 0.3 * np.random.randn(n)
f = 0.25
iterations = 3
yest = lowess(x, y, f, iterations)
import matplotlib.pyplot as plt
plt.plot(x, y, "r.")
plt.plot(x, yest, "b-")
Output: a plot of the noisy sine data (red points) with the fitted LOWESS curve (blue line).
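As a hedged follow-up, not part of the original listing, the smoothing fraction f controls how local the fit is: smaller f follows the data more closely, larger f gives a smoother curve. The snippet below overlays fits for two other fractions on the same figure:
# Hedged addition: compare the fit for different smoothing fractions.
for frac, style in [(0.1, "g-"), (0.5, "k--")]:
    plt.plot(x, lowess(x, y, frac, iterations), style, label="f=%.1f" % frac)
plt.legend()
plt.show()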