Data Science Lab Experiments
AIM: Write a program to implement the Naïve Bayesian classifier for a sample training
data set stored as a .CSV file. Compute the accuracy of the classifier, considering a few
test data sets.
Source Code:
# import the necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB

# load the data from the CSV file
data = pd.read_csv('tennisdata.csv')
print("The first 5 rows of the data are:\n", data.head())

# the last column (PlayTennis) is the target
y = data.iloc[:, -1]
print("\nThe first 5 values of the train output are:\n", y.head())

# encode the categorical target labels as integers
le_PlayTennis = LabelEncoder()
y = le_PlayTennis.fit_transform(y)
print("\nNow the train output is:\n", y)
AIM: Write a program to demonstrate the working of the decision-tree-based ID3
algorithm. Use an appropriate data set for building the decision tree and apply this
knowledge to classify a new sample.
Source Code:
import pandas as pd
import numpy as np
import math
data = pd.read_csv("dataset.csv")
features = [feat for feat in data.columns if feat != "answer"]
class Node:
    def __init__(self):
        self.children = []   # child nodes (subtrees)
        self.value = ""      # attribute name, or attribute value on a branch
        self.isLeaf = False  # True when the node holds a final prediction
        self.pred = ""       # the predicted class at a leaf
def entropy(examples):
    # Entropy of the class label over the given examples.
    pos = sum(examples["answer"] == "yes")
    neg = sum(examples["answer"] == "no")
    total = len(examples)
    if pos == 0 or neg == 0:
        return 0.0
    p = pos / total
    n = neg / total
    return -(p * math.log2(p) + n * math.log2(n))
def info_gain(examples, attr):
    # Information gain = entropy before the split minus the weighted
    # entropy of each subset produced by splitting on attr.
    uniq = np.unique(examples[attr])
    gain = entropy(examples)
    for u in uniq:
        subdata = examples[examples[attr] == u]
        gain -= (len(subdata) / len(examples)) * entropy(subdata)
    return gain
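The listing stops after info_gain. A minimal sketch of the tree-building and printing routines that the output fragment below assumes; the recursion and the printed format follow the usual textbook ID3 rather than the original listing:

def ID3(examples, attrs):
    # Build the tree recursively: split on the attribute with the
    # highest information gain, then recurse into each impure subset.
    root = Node()
    max_gain, max_feat = -1, ""
    for feature in attrs:
        gain = info_gain(examples, feature)
        if gain > max_gain:
            max_gain, max_feat = gain, feature
    root.value = max_feat
    for u in np.unique(examples[max_feat]):
        subdata = examples[examples[max_feat] == u]
        if entropy(subdata) == 0.0:
            # Pure subset: create a leaf holding the predicted class.
            leaf = Node()
            leaf.isLeaf = True
            leaf.value = u
            leaf.pred = np.unique(subdata["answer"])
            root.children.append(leaf)
        else:
            # Impure subset: recurse on the remaining attributes.
            child = Node()
            child.value = u
            child.children.append(ID3(subdata, [a for a in attrs if a != max_feat]))
            root.children.append(child)
    return root

def printTree(root, depth=0):
    # Indent two spaces per level; leaves show their prediction.
    print("  " * depth + str(root.value), end="")
    if root.isLeaf:
        print(" ->", root.pred)
    else:
        print()
    for child in root.children:
        printTree(child, depth + 1)

root = ID3(data, features)
printTree(root)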
OUTPUT:
rain
  wind
    strong -> ['no']
sunny
  humidity
    high -> ['no']
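The evaluation listing that follows belongs to a separate experiment whose setup (data loading, train/test split, and model fitting) is missing here. A minimal sketch that produces compatible variables, assuming the Iris CSV shown in the output and a k-nearest-neighbours classifier; the file name, split size, and choice of model are all assumptions:

# Hypothetical setup for the evaluation code below.
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
dataset = pd.read_csv('iris.csv', names=names)  # assumed file name
X = dataset.iloc[:, :-1]
print(X.head())
Xtrain, Xtest, ytrain, ytest = train_test_split(X, dataset.iloc[:, -1], test_size=0.1)
classifier = KNeighborsClassifier(n_neighbors=3).fit(Xtrain, ytrain)  # assumed model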
ypred = classifier.predict(Xtest)
i = 0
print("\n-------------------------------------------------------------------------")
print('%-25s %-25s %-25s' % ('Original Label', 'Predicted Label', 'Correct/Wrong'))
print("-------------------------------------------------------------------------")
for label in ytest:
    print('%-25s %-25s' % (label, ypred[i]), end="")
    if label == ypred[i]:
        print(' %-25s' % 'Correct')
    else:
        print(' %-25s' % 'Wrong')
    i = i + 1
print("-------------------------------------------------------------------------")
print("\nConfusion Matrix:\n", metrics.confusion_matrix(ytest, ypred))
print("-------------------------------------------------------------------------")
print("\nClassification Report:\n", metrics.classification_report(ytest, ypred))
print("-------------------------------------------------------------------------")
print('Accuracy of the classifier is %0.2f' % metrics.accuracy_score(ytest, ypred))
print("-------------------------------------------------------------------------")
OUTPUT:
sepal-length sepal-width petal-length petal-width
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
-------------------------------------------------------------------------
Original Label Predicted Label Correct/Wrong
-------------------------------------------------------------------------
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-versicolor Wrong
Iris-virginica Iris-virginica Correct
Iris-setosa Iris-setosa Correct
Iris-setosa Iris-setosa Correct
-------------------------------------------------------------------------
Confusion Matrix:
[[4 0 0]
[0 3 0]
[0 1 7]]
-------------------------------------------------------------------------
Classification Report:
                  precision    recall  f1-score   support

    Iris-setosa        1.00      1.00      1.00         4
Iris-versicolor        0.75      1.00      0.86         3
 Iris-virginica        1.00      0.88      0.93         8

       accuracy                            0.93        15
      macro avg        0.92      0.96      0.93        15
   weighted avg        0.95      0.93      0.94        15
-------------------------------------------------------------------------
Accuracy of the classifier is 0.93
-------------------------------------------------------------------------
EXPERIMENT-8
AIM: Write a program to implement the k-Means clustering algorithm to cluster a
set of data stored in a .CSV file. Compare the results of various "k" values for the
quality of clustering.
Source Code:
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
import sklearn.metrics as metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
names = ['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width', 'Class']
dataset = pd.read_csv("9-dataset.csv", names=names)
X = dataset.iloc[:, :-1]
label = {'Iris-setosa': 0,'Iris-versicolor': 1, 'Iris-virginica': 2}
y = [label[c] for c in dataset.iloc[:, -1]]
plt.figure(figsize=(14,7))
colormap=np.array(['red','lime','black'])
# REAL PLOT
plt.subplot(1,3,1)
plt.title('Real')
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[y])
# K-PLOT
model=KMeans(n_clusters=3, random_state=0).fit(X)
plt.subplot(1,3,2)
plt.title('KMeans')
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[model.labels_])
# note: k-means cluster indices are arbitrary, so accuracy_score is only
# meaningful when the cluster numbering happens to align with the true labels
print('The accuracy score of K-Means: ', metrics.accuracy_score(y, model.labels_))
print('The Confusion matrix of K-Means:\n', metrics.confusion_matrix(y, model.labels_))
# GMM PLOT
gmm=GaussianMixture(n_components=3, random_state=0).fit(X)
y_cluster_gmm=gmm.predict(X)
plt.subplot(1,3,3)
plt.title('GMM Classification')
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[y_cluster_gmm])
print('The accuracy score of EM: ', metrics.accuracy_score(y, y_cluster_gmm))
print('The Confusion matrix of EM:\n', metrics.confusion_matrix(y, y_cluster_gmm))
plt.show()
OUTPUT:
The accuracy score of K-Means:  0.24
The Confusion matrix of K-Means:
[[ 0 50 0]
[47 0 3]
[14 0 36]]
The accuracy score of EM: 0.3333333333333333
The Confusion matrix of EM:
[[ 0 50 0]
[45 0 5]
[ 0 0 50]]
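The AIM also asks to compare different values of k. A minimal sketch that continues from the program above, using the KMeans inertia_ attribute (the within-cluster sum of squares) as the quality measure; the range of k values is an assumption:

# Lower inertia_ means tighter clusters; the "elbow" where the curve
# flattens suggests a good choice of k.
for k in range(2, 7):
    km = KMeans(n_clusters=k, random_state=0).fit(X)
    print('k = %d, within-cluster sum of squares = %.2f' % (k, km.inertia_))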
EXPERIMENT-9
AIM: Write a program to build an Artificial Neural Network and test it using
appropriate data sets.
Source Code:
import numpy as np
class NeuralNet(object):
    def __init__(self):
        # Seed the random number generator so results are reproducible
        np.random.seed(1)
        # Assign random weights in [-1, 1) to a 3 x 1 matrix
        self.synaptic_weights = 2 * np.random.random((3, 1)) - 1

    # The Sigmoid function
    def __sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    # The derivative of the Sigmoid function.
    # This is the gradient of the Sigmoid curve.
    def __sigmoid_derivative(self, x):
        return x * (1 - x)

    # Forward pass: compute the network output for the current weights.
    def learn(self, inputs):
        return self.__sigmoid(np.dot(inputs, self.synaptic_weights))

    # Train the neural network, adjusting the weights each iteration.
    def train(self, inputs, outputs, training_iterations):
        for iteration in range(training_iterations):
            # Pass the training set through the network.
            output = self.learn(inputs)
            # Calculate the error
            error = outputs - output
            # Adjust the weights in proportion to the error and the
            # gradient of the Sigmoid at the current output.
            factor = np.dot(inputs.T, error * self.__sigmoid_derivative(output))
            self.synaptic_weights += factor
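The listing ends before the driver code that produces the output below. A minimal sketch; the training truth table, the iteration count, and the test input are assumptions, so the exact printed value may differ from the one shown:

# Hypothetical driver: train on a small 3-input truth table, then
# query the network with an unseen input pattern.
if __name__ == "__main__":
    net = NeuralNet()
    inputs = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])  # assumed data
    outputs = np.array([[0, 1, 1]]).T                     # assumed targets
    net.train(inputs, outputs, 10000)
    print(net.learn(np.array([1, 1, 0])))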
OUTPUT:
[0.9897704]