DM ML Practical

Naïve Bayes Classifier Program

import numpy as np
from collections import defaultdict

# Step 1: Prepare a simple dataset
data = [
    ('spam', 'buy cheap amazon products now'),
    ('ham', 'how are you doing today'),
    ('spam', 'cheap watches on sale'),
    ('ham', 'let us meet up tomorrow'),
    ('spam', 'win a million dollars now'),
    ('ham', 'can you call me back later'),
]

# Step 2: Preprocess and count word frequencies
class NaiveBayesClassifier:
    def __init__(self):
        self.word_probs = defaultdict(lambda: defaultdict(float))  # Word probabilities
        self.class_probs = defaultdict(float)  # Class probabilities
        self.vocab = set()  # Vocabulary

    def train(self, dataset):
        # Count occurrences
        class_word_counts = defaultdict(lambda: defaultdict(int))  # Count of words per class
        class_counts = defaultdict(int)  # Count of each class

        for label, text in dataset:
            words = text.split()
            class_counts[label] += 1
            for word in words:
                self.vocab.add(word)
                class_word_counts[label][word] += 1

        # Calculate class probabilities P(C)
        total_samples = sum(class_counts.values())
        for label, count in class_counts.items():
            self.class_probs[label] = count / total_samples

        # Calculate word probabilities P(w|C)
        for label, words in class_word_counts.items():
            total_words = sum(words.values())
            for word in self.vocab:
                # Additive smoothing (Laplace smoothing)
                self.word_probs[label][word] = ((words[word] + 1) /
                                                (total_words + len(self.vocab)))

    def predict(self, text):
        words = text.split()
        # Calculate P(C|w1, w2, ..., wn) for each class
        class_scores = {}
        for label in self.class_probs:
            class_scores[label] = np.log(self.class_probs[label])  # Log of prior P(C)
            for word in words:
                if word in self.vocab:
                    class_scores[label] += np.log(
                        self.word_probs[label].get(word, 1 / len(self.vocab)))  # Log of P(w|C)

        # Return the class with the highest score
        return max(class_scores, key=class_scores.get)

# Step 3: Train the classifier
classifier = NaiveBayesClassifier()
classifier.train(data)

# Step 4: Classify new examples
test_texts = [
    'cheap watches available',
    'how are you',
    'call me now to win',
]

for text in test_texts:
    prediction = classifier.predict(text)
    print(f'Text: "{text}" => Predicted class: {prediction}')
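
For reference, roughly the same classifier can be built with scikit-learn's CountVectorizer and MultinomialNB (which applies Laplace smoothing with alpha=1 by default). This is a minimal sketch reusing the data and test_texts lists defined above, not part of the original program.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

labels = [label for label, text in data]
texts = [text for label, text in data]

vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(texts)   # bag-of-words counts

nb = MultinomialNB()                         # additive smoothing, alpha=1 by default
nb.fit(X_counts, labels)

print(nb.predict(vectorizer.transform(test_texts)))
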
KNN
import matplotlib.pyplot as plt

x = [4, 5, 10, 4, 3, 11, 14, 8, 10, 12]
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]
classes = [0, 0, 1, 0, 0, 1, 1, 0, 1, 1]

plt.scatter(x, y, c=classes)
plt.show()

from sklearn.neighbors import KNeighborsClassifier

data = list(zip(x, y))

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(data, classes)

new_x = 8
new_y = 21
new_point = [(new_x, new_y)]

prediction = knn.predict(new_point)

plt.scatter(x + [new_x], y + [new_y], c=classes + [prediction[0]])
plt.text(x=new_x-1.7, y=new_y-0.7, s=f"new point, class: {prediction[0]}")
plt.show()

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(data, classes)

prediction = knn.predict(new_point)

plt.scatter(x + [new_x], y + [new_y], c=classes + [prediction[0]])
plt.text(x=new_x-1.7, y=new_y-0.7, s=f"new point, class: {prediction[0]}")
plt.show()
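
The prediction for a borderline point like this one can flip as k changes. One hedged way to compare values of k on this toy data is cross-validation; the candidate k values and the 2-fold split below are arbitrary choices for such a small dataset, not part of the original program.

from sklearn.model_selection import cross_val_score

for n in (1, 3, 5):
    model = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(model, data, classes, cv=2)  # only 10 samples, so 2 folds
    print(f"k={n}: mean accuracy {scores.mean():.2f}")
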

DBSCAN
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

# Load data in X
X, y_true = make_blobs(n_samples=300, centers=4,
                       cluster_std=0.50, random_state=0)
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

# Plot result

# Black is not in the colour list; it is used for noise instead.
unique_labels = set(labels)
colors = ['y', 'b', 'g', 'r']
print(colors)
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'

    class_member_mask = (labels == k)

    # Core samples
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)

    # Non-core (border) samples
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)

plt.title('number of clusters: %d' % n_clusters_)
plt.show()
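
Since metrics is already imported, the amount of noise and the cluster quality can also be reported. A brief sketch (the silhouette score is only defined when there is more than one cluster):

n_noise_ = list(labels).count(-1)
print("Estimated number of noise points:", n_noise_)
if n_clusters_ > 1:
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
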

Support Vector Machine


# importing scikit learn with make_blobs
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# creating dataset X containing n_samples
# Y containing two classes
X, Y = make_blobs(n_samples=500, centers=2,
                  random_state=0, cluster_std=0.40)

# plotting scatters
plt.scatter(X[:, 0], X[:, 1], c=Y, s=50, cmap='spring')
plt.show()

# creating linspace between -1 to 3.5
xfit = np.linspace(-1, 3.5)

# plotting scatter
plt.scatter(X[:, 0], X[:, 1], c=Y, s=50, cmap='spring')

# plot candidate separating lines (with margins) between the two classes
for m, b, d in [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]:
    yfit = m * xfit + b
    plt.plot(xfit, yfit, '-k')
    plt.fill_between(xfit, yfit - d, yfit + d, edgecolor='none',
                     color='#AAAAAA', alpha=0.4)

plt.xlim(-1, 3.5)
plt.show()
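
As a follow-up to the candidate lines above, an SVC can be fitted on the same blobs to find the maximum-margin separator. This is a minimal sketch; the large C value (to approximate a hard margin) is an assumption, not part of the original program.

from sklearn.svm import SVC

model = SVC(kernel='linear', C=1e10)   # large C approximates a hard margin
model.fit(X, Y)
print("support vectors:\n", model.support_vectors_)
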
# importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# reading csv file and extracting class column to y.
x = pd.read_csv(r"C:\...\cancer.csv")
a = np.array(x)
y = a[:, 30]  # classes having 0 and 1

# extracting two features
x = np.column_stack((x.malignant, x.benign))

# 569 samples and 2 features
print(x.shape)
print(x, y)

# import support vector classifier
# "Support Vector Classifier"
from sklearn.svm import SVC
clf = SVC(kernel='linear')

# fitting x samples and y classes
clf.fit(x, y)

print(clf.predict([[120, 990]]))
print(clf.predict([[85, 550]]))
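
The example above depends on a local cancer.csv with "malignant" and "benign" columns. A self-contained variant, sketched below under the assumption that any two numeric features will do for illustration, uses scikit-learn's built-in breast cancer dataset instead (where target 0 is malignant and 1 is benign):

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC

cancer = load_breast_cancer()
X2 = cancer.data[:, :2]   # first two features (mean radius, mean texture)
y2 = cancer.target        # 0 = malignant, 1 = benign

clf2 = SVC(kernel='linear')
clf2.fit(X2, y2)
print(clf2.predict([[14.0, 20.0]]))  # prediction for a hypothetical sample
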

CART
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Define the features and target variable
features = [
    ["red", "large"],
    ["green", "small"],
    ["red", "small"],
    ["yellow", "large"],
    ["green", "large"],
    ["orange", "large"],
]
target_variable = ["apple", "lime", "strawberry", "banana",
                   "grape", "orange"]

# Flatten the features list for encoding
flattened_features = [item for sublist in features for item in sublist]

# Use a single LabelEncoder for all features and target variable
le = LabelEncoder()
le.fit(flattened_features + target_variable)

# Encode features and target variable
encoded_features = [le.transform(item) for item in features]
encoded_target = le.transform(target_variable)

# Create a CART classifier
clf = DecisionTreeClassifier()

# Train the classifier on the training set
clf.fit(encoded_features, encoded_target)

# Predict the fruit type for a new instance
new_instance = ["red", "large"]
encoded_new_instance = le.transform(new_instance)
predicted_fruit_type = clf.predict([encoded_new_instance])
decoded_predicted_fruit_type = le.inverse_transform(predicted_fruit_type)
print("Predicted fruit type:", decoded_predicted_fruit_type[0])

KMEANS
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=500, n_features=2, centers=3, random_state=23)

fig = plt.figure(0)
plt.grid(True)
plt.scatter(X[:, 0], X[:, 1])
plt.show()

k = 3

clusters = {}
np.random.seed(23)

# Initialise k cluster centers at random positions
for idx in range(k):
    center = 2 * (2 * np.random.random((X.shape[1],)) - 1)
    cluster = {
        'center': center,
        'points': []
    }
    clusters[idx] = cluster

print(clusters)

# Plot the data with the initial centers
plt.scatter(X[:, 0], X[:, 1])
plt.grid(True)
for i in clusters:
    center = clusters[i]['center']
    plt.scatter(center[0], center[1], marker='*', c='red')
plt.show()

def distance(p1, p2):
    return np.sqrt(np.sum((p1 - p2) ** 2))

# Implementing the E-Step: assign each point to its nearest center
def assign_clusters(X, clusters):
    for idx in range(X.shape[0]):
        dist = []
        curr_x = X[idx]
        for i in range(k):
            dis = distance(curr_x, clusters[i]['center'])
            dist.append(dis)
        curr_cluster = np.argmin(dist)
        clusters[curr_cluster]['points'].append(curr_x)
    return clusters

# Implementing the M-Step: move each center to the mean of its points
def update_clusters(X, clusters):
    for i in range(k):
        points = np.array(clusters[i]['points'])
        if points.shape[0] > 0:
            new_center = points.mean(axis=0)
            clusters[i]['center'] = new_center
        clusters[i]['points'] = []
    return clusters

def pred_cluster(X, clusters):
    pred = []
    for i in range(X.shape[0]):
        dist = []
        for j in range(k):
            dist.append(distance(X[i], clusters[j]['center']))
        pred.append(np.argmin(dist))
    return pred

# Alternate the E-step and M-step so the centers can converge
for _ in range(10):
    clusters = assign_clusters(X, clusters)
    clusters = update_clusters(X, clusters)

pred = pred_cluster(X, clusters)

plt.scatter(X[:, 0], X[:, 1], c=pred)
for i in clusters:
    center = clusters[i]['center']
    plt.scatter(center[0], center[1], marker='^', c='red')
plt.show()
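
For comparison, the same clustering can be obtained with scikit-learn's KMeans. This is a sketch only; n_clusters and random_state mirror the values used above, and n_init=10 is an assumed default choice.

from sklearn.cluster import KMeans

km = KMeans(n_clusters=3, n_init=10, random_state=23)
labels = km.fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=labels)
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            marker='^', c='red')
plt.show()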
