This document collects example Python programs for a range of machine learning algorithms, including naive Bayes, simple, multiple and polynomial linear regression, elastic net regression, k-nearest neighbors (KNN), support vector machines, decision trees, random forests, k-means clustering, and a simple neural network. Each algorithm is introduced briefly and example code demonstrates its implementation and use.

NAIVE BAYES

# load the iris dataset


from sklearn.datasets import load_iris
iris = load_iris()

# store the feature matrix (X) and response vector (y)


X = iris.data
y = iris.target

# splitting X and y into training and testing sets


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
random_state=1)

# training the model on training set


from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# making predictions on the testing set


y_pred = gnb.predict(X_test)

# comparing actual response values (y_test) with predicted response values (y_pred)
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test,
y_pred)*100)

--------------------------------------------------------------------
linear regression

import numpy as np
import matplotlib.pyplot as plt

def estimate_coef(x, y):
    # number of observations/points
    n = np.size(x)

    # mean of x and y vector
    m_x = np.mean(x)
    m_y = np.mean(y)

    # calculating cross-deviation and deviation about x
    SS_xy = np.sum(y*x) - n*m_y*m_x
    SS_xx = np.sum(x*x) - n*m_x*m_x

    # calculating regression coefficients
    b_1 = SS_xy / SS_xx
    b_0 = m_y - b_1*m_x

    return (b_0, b_1)

def plot_regression_line(x, y, b):
    # plotting the actual points as scatter plot
    plt.scatter(x, y, color="m", marker="o", s=30)

    # predicted response vector
    y_pred = b[0] + b[1]*x

    # plotting the regression line
    plt.plot(x, y_pred, color="g")

    # putting labels
    plt.xlabel('x')
    plt.ylabel('y')

    # showing the plot
    plt.show()

def main():
    # observations / data
    x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])

    # estimating coefficients
    b = estimate_coef(x, y)
    print("Estimated coefficients:\nb_0 = {}\nb_1 = {}".format(b[0], b[1]))

    # plotting the regression line
    plot_regression_line(x, y, b)

if __name__ == "__main__":
    main()
-----------------------------------

multiple linear regression


from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, metrics

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+",
                     skiprows=22, header=None)

X = np.hstack([raw_df.values[::2, :],
               raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]

X_train, X_test,\
y_train, y_test = train_test_split(X, y,
test_size=0.4,
random_state=1)

reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

# regression coefficients
print('Coefficients: ', reg.coef_)

# variance score: 1 means perfect prediction


print('Variance score: {}'.format(reg.score(X_test, y_test)))
# plot for residual error

# setting plot style


plt.style.use('fivethirtyeight')

# plotting residual errors in training data


plt.scatter(reg.predict(X_train),
reg.predict(X_train) - y_train,
color="green", s=10,
label='Train data')

# plotting residual errors in test data


plt.scatter(reg.predict(X_test),
reg.predict(X_test) - y_test,
color="blue", s=10,
label='Test data')

# plotting line for zero residual error


plt.hlines(y=0, xmin=0, xmax=50, linewidth=2)

# plotting legend
plt.legend(loc='upper right')

# plot title
plt.title("Residual errors")

# method call for showing the plot


plt.show()

---------------------------
polynomial

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

from sklearn.preprocessing import LabelEncoder


from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')

df = pd.read_csv('Position_Salaries.csv')

X = df.iloc[:,1:2].values
y = df.iloc[:,2].values

from sklearn.linear_model import LinearRegression


lin_reg=LinearRegression()
lin_reg.fit(X,y)

from sklearn.preprocessing import PolynomialFeatures


poly_reg2=PolynomialFeatures(degree=2)
X_poly=poly_reg2.fit_transform(X)
lin_reg_2=LinearRegression()
lin_reg_2.fit(X_poly,y)

poly_reg3=PolynomialFeatures(degree=3)
X_poly3=poly_reg3.fit_transform(X)
lin_reg_3=LinearRegression()
lin_reg_3.fit(X_poly3,y)

plt.scatter(X,y,color='red')
plt.plot(X,lin_reg.predict(X),color='green')
plt.title('Simple Linear Regression')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.show()

plt.style.use('fivethirtyeight')
plt.scatter(X,y,color='red')
plt.plot(X,lin_reg_2.predict(poly_reg2.fit_transform(X)),color='green')
plt.plot(X,lin_reg_3.predict(poly_reg3.fit_transform(X)),color='yellow')
plt.title('Polynomial Linear Regression Degree 2')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.show()
plt.style.use('fivethirtyeight')
# np.arange gives us a vector; we have to reshape it into a matrix
X_grid=np.arange(min(X),max(X),0.1)
X_grid=X_grid.reshape((len(X_grid),1))
plt.scatter(X,y,color='red')
plt.plot(X_grid,lin_reg_3.predict(poly_reg3.fit_transform(X_grid)),color='lightgreen')

#plt.plot(X,lin_reg_3.predict(poly_reg3.fit_transform(X)),color='green')
plt.title('Polynomial Linear Regression Degree 3')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.show()

--------------------------------------------------

KNN

import math

def classifyAPoint(points,p,k=3):
'''
This function finds the classification of p using
k nearest neighbor algorithm. It assumes only two
groups and returns 0 if p belongs to group 0, else
1 (belongs to group 1).

Parameters -
points: Dictionary of training points having two keys - 0 and 1.
Each key maps to a list of training data points belonging to that group.

p : A tuple, the test data point of the form (x, y)

k : number of nearest neighbours to consider, default is 3


'''

distance=[]
for group in points:
for feature in points[group]:

#calculate the euclidean distance of p from training points


euclidean_distance = math.sqrt((feature[0]-p[0])**2 + (feature[1]-p[1])**2)

# Add a tuple of form (distance,group) in the distance list


distance.append((euclidean_distance,group))

# sort the distance list in ascending order


# and select first k distances
distance = sorted(distance)[:k]

freq1 = 0 #frequency of group 0


freq2 = 0 #frequency of group 1

for d in distance:
if d[1] == 0:
freq1 += 1
elif d[1] == 1:
freq2 += 1

return 0 if freq1>freq2 else 1

# driver function
def main():

# Dictionary of training points having two keys - 0 and 1


# key 0 have points belong to class 0
# key 1 have points belong to class 1

points = {0:[(1,12),(2,5),(3,6),(3,10),(3.5,8),(2,11),(2,9),(1,7)],
1:[(5,3),(3,2),(1.5,9),(7,2),(6,1),(3.8,1),(5.6,4),(4,2),(2,5)]}

# testing point p(x,y)


p = (2.5,7)

# Number of neighbours
k = 3

print("The value classified to unknown point is: {}".\


format(classifyAPoint(points,p,k)))

if __name__ == '__main__':
main()

--------------------------------------------------------------

ELASTIC NET REGRESSION

# Importing libraries

import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# Elastic Net Regression

class ElasticRegression() :

def __init__( self, learning_rate, iterations, l1_penality, l2_penality ) :

self.learning_rate = learning_rate

self.iterations = iterations

self.l1_penality = l1_penality

self.l2_penality = l2_penality

# Function for model training

def fit( self, X, Y ) :

# no_of_training_examples, no_of_features

self.m, self.n = X.shape

# weight initialization

self.W = np.zeros( self.n )

self.b = 0

self.X = X

self.Y = Y

# gradient descent learning

for i in range( self.iterations ) :

self.update_weights()

return self

# Helper function to update weights in gradient descent

def update_weights( self ) :

Y_pred = self.predict( self.X )

# calculate gradients

dW = np.zeros( self.n )

for j in range( self.n ) :

if self.W[j] > 0 :

dW[j] = ( - ( 2 * ( self.X[:,j] ).dot( self.Y - Y_pred ) )
          + self.l1_penality + 2 * self.l2_penality * self.W[j] ) / self.m

else :

dW[j] = ( - ( 2 * ( self.X[:,j] ).dot( self.Y - Y_pred ) )
          - self.l1_penality + 2 * self.l2_penality * self.W[j] ) / self.m

db = - 2 * np.sum( self.Y - Y_pred ) / self.m

# update weights

self.W = self.W - self.learning_rate * dW

self.b = self.b - self.learning_rate * db

return self

# Hypothetical function h( x )

def predict( self, X ) :

return X.dot( self.W ) + self.b

# Driver Code

def main() :

# Importing dataset

df = pd.read_csv( "salary_data.csv" )

X = df.iloc[:,:-1].values

Y = df.iloc[:,1].values

# Splitting dataset into train and test set

X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 1/3,
                                                     random_state = 0 )

# Model training

model = ElasticRegression( iterations = 1000, learning_rate = 0.01,
                           l1_penality = 500, l2_penality = 1 )

model.fit( X_train, Y_train )

# Prediction on test set


Y_pred = model.predict( X_test )

print( "Predicted values ", np.round( Y_pred[:3], 2 ) )

print( "Real values ", Y_test[:3] )

print( "Trained W ", round( model.W[0], 2 ) )

print( "Trained b ", round( model.b, 2 ) )

# Visualization on test set

plt.scatter( X_test, Y_test, color = 'blue' )

plt.plot( X_test, Y_pred, color = 'orange' )

plt.title( 'Salary vs Experience' )

plt.xlabel( 'Years of Experience' )

plt.ylabel( 'Salary' )

plt.show()

if __name__ == "__main__" :

main()

----------------------------------------------------------------------------------
SVM

# Load the important packages


from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.svm import SVC

# Load the datasets


cancer = load_breast_cancer()
X = cancer.data[:, :2]
y = cancer.target

#Build the model


svm = SVC(kernel="rbf", gamma=0.5, C=1.0)
# Trained the model
svm.fit(X, y)

# Plot Decision Boundary


DecisionBoundaryDisplay.from_estimator(
svm,
X,
response_method="predict",
cmap=plt.cm.Spectral,
alpha=0.8,
xlabel=cancer.feature_names[0],
ylabel=cancer.feature_names[1],
)

# Scatter plot
plt.scatter(X[:, 0], X[:, 1],
c=y,
s=20, edgecolors="k")
plt.show()

---------------------------------------------------------------------------

SVM------2

# importing scikit learn with make_blobs


from sklearn.datasets import make_blobs

# creating datasets X containing n_samples


# Y containing two classes
X, Y = make_blobs(n_samples=500, centers=2,
random_state=0, cluster_std=0.40)
import matplotlib.pyplot as plt
# plotting scatters
plt.scatter(X[:, 0], X[:, 1], c=Y, s=50, cmap='spring');
plt.show()

# creating linspace between -1 to 3.5
import numpy as np
xfit = np.linspace(-1, 3.5)

# plotting scatter
plt.scatter(X[:, 0], X[:, 1], c=Y, s=50, cmap='spring')

# plot a line between the different sets of data


for m, b, d in [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]:
yfit = m * xfit + b
plt.plot(xfit, yfit, '-k')
plt.fill_between(xfit, yfit - d, yfit + d, edgecolor='none',
color='#AAAAAA', alpha=0.4)

plt.xlim(-1, 3.5);
plt.show()
# importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# reading csv file and extracting class column to y.


x = pd.read_csv("C:\...\cancer.csv")
a = np.array(x)
y = a[:,30] # classes having 0 and 1

# extracting two features


x = np.column_stack((x.malignant,x.benign))

# 569 samples and 2 features


x.shape

print(x, y)

# import support vector classifier


# "Support Vector Classifier"
from sklearn.svm import SVC
clf = SVC(kernel='linear')

# fitting x samples and y classes


clf.fit(x, y)

clf.predict([[120, 990]])

clf.predict([[85, 550]])

--------------------------------------------------------------------------

DECISION TREE

# Importing the required packages


import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Function to import the dataset


def importdata():
balance_data = pd.read_csv(
'https://archive.ics.uci.edu/ml/machine-learning-' +
'databases/balance-scale/balance-scale.data',
sep=',', header=None)

# Displaying dataset information


print("Dataset Length: ", len(balance_data))
print("Dataset Shape: ", balance_data.shape)
print("Dataset: ", balance_data.head())

return balance_data

# Function to split the dataset into features and target variables


def splitdataset(balance_data):

# Separating the target variable


X = balance_data.values[:, 1:5]
Y = balance_data.values[:, 0]

# Splitting the dataset into train and test


X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.3, random_state=100)

return X, Y, X_train, X_test, y_train, y_test

def train_using_gini(X_train, X_test, y_train):

# Creating the classifier object


clf_gini = DecisionTreeClassifier(criterion="gini",
random_state=100,
max_depth=3, min_samples_leaf=5)

# Performing training
clf_gini.fit(X_train, y_train)
return clf_gini

def train_using_entropy(X_train, X_test, y_train):

# Decision tree with entropy


clf_entropy = DecisionTreeClassifier(
criterion="entropy", random_state=100,
max_depth=3, min_samples_leaf=5)

# Performing training
clf_entropy.fit(X_train, y_train)
return clf_entropy

# Function to make predictions


def prediction(X_test, clf_object):
y_pred = clf_object.predict(X_test)
print("Predicted values:")
print(y_pred)
return y_pred

# Placeholder function for cal_accuracy


def cal_accuracy(y_test, y_pred):
print("Confusion Matrix: ",
confusion_matrix(y_test, y_pred))
print("Accuracy : ",
accuracy_score(y_test, y_pred)*100)
print("Report : ",
classification_report(y_test, y_pred))

# Function to plot the decision tree


def plot_decision_tree(clf_object, feature_names, class_names):
plt.figure(figsize=(15, 10))
plot_tree(clf_object, filled=True, feature_names=feature_names,
class_names=class_names, rounded=True)
plt.show()

if __name__ == "__main__":
data = importdata()
X, Y, X_train, X_test, y_train, y_test = splitdataset(data)

clf_gini = train_using_gini(X_train, X_test, y_train)


clf_entropy = train_using_entropy(X_train, X_test, y_train)

# Visualizing the Decision Trees


plot_decision_tree(clf_gini, ['X1', 'X2', 'X3', 'X4'], ['L', 'B', 'R'])
plot_decision_tree(clf_entropy, ['X1', 'X2', 'X3', 'X4'], ['L', 'B', 'R'])

# Operational Phase
print("Results Using Gini Index:")
y_pred_gini = prediction(X_test, clf_gini)
cal_accuracy(y_test, y_pred_gini)
print("Results Using Entropy:")
y_pred_entropy = prediction(X_test, clf_entropy)
cal_accuracy(y_test, y_pred_entropy)

-----------------------------------------------------------------------------

CART ( CLASSIFICATION AND REGRESSION )

CLASSIFICATION DECISION TREE

from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import LabelEncoder

# Define the features and target variable


features = [
["red", "large"],
["green", "small"],
["red", "small"],
["yellow", "large"],
["green", "large"],
["orange", "large"],
]
target_variable = ["apple", "lime", "strawberry", "banana", "grape", "orange"]

# Flatten the features list for encoding


flattened_features = [item for sublist in features for item in sublist]

# Use a single LabelEncoder for all features and target variable


le = LabelEncoder()
le.fit(flattened_features + target_variable)

# Encode features and target variable


encoded_features = [le.transform(item) for item in features]
encoded_target = le.transform(target_variable)

# Create a CART classifier


clf = DecisionTreeClassifier()
# Train the classifier on the training set
clf.fit(encoded_features, encoded_target)

# Predict the fruit type for a new instance


new_instance = ["red", "large"]
encoded_new_instance = le.transform(new_instance)
predicted_fruit_type = clf.predict([encoded_new_instance])
decoded_predicted_fruit_type = le.inverse_transform(predicted_fruit_type)
print("Predicted fruit type:", decoded_predicted_fruit_type[0])

---------------------------------
Decision Tree Regression USING SKLEARN

# import numpy package for arrays and stuff


import numpy as np

# import matplotlib.pyplot for plotting our result


import matplotlib.pyplot as plt

# import pandas for importing csv files


import pandas as pd

# import dataset
# dataset = pd.read_csv('Data.csv')
# alternatively open up .csv file to read data

dataset = np.array(
[['Asset Flip', 100, 1000],
['Text Based', 500, 3000],
['Visual Novel', 1500, 5000],
['2D Pixel Art', 3500, 8000],
['2D Vector Art', 5000, 6500],
['Strategy', 6000, 7000],
['First Person Shooter', 8000, 15000],
['Simulator', 9500, 20000],
['Racing', 12000, 21000],
['RPG', 14000, 25000],
['Sandbox', 15500, 27000],
['Open-World', 16500, 30000],
['MMOFPS', 25000, 52000],
['MMORPG', 30000, 80000]
])

# print the dataset


print(dataset)
# select all rows by : and column 1
# by 1:2 representing features
X = dataset[:, 1:2].astype(int)

# print X
print(X)

# select all rows by : and column 2


# by 2 to Y representing labels
y = dataset[:, 2].astype(int)

# print y
print(y)

# import the regressor


from sklearn.tree import DecisionTreeRegressor

# create a regressor object


regressor = DecisionTreeRegressor(random_state = 0)

# fit the regressor with X and Y data


regressor.fit(X, y)

# predicting a new value

# test the output by changing values, like 3750


y_pred = regressor.predict([[3750]])

# print the predicted price


print("Predicted price: % d\n"% y_pred)

# arange for creating a range of values


# from min value of X to max value of X
# with a difference of 0.01 between two
# consecutive values
X_grid = np.arange(min(X), max(X), 0.01)
# reshape for reshaping the data into
# a len(X_grid)*1 array, i.e. to make
# a column out of the X_grid values
X_grid = X_grid.reshape((len(X_grid), 1))

# scatter plot for original data


plt.scatter(X, y, color = 'red')

# plot predicted data


plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')

# specify title
plt.title('Profit to Production Cost (Decision Tree Regression)')

# specify X axis label


plt.xlabel('Production Cost')

# specify Y axis label


plt.ylabel('Profit')

# show the plot


plt.show()

# import export_graphviz
from sklearn.tree import export_graphviz

# export the decision tree to a tree.dot file


# for visualizing the plot easily anywhere
export_graphviz(regressor, out_file ='tree.dot',
feature_names =['Production Cost'])

------------------------------------------------------------------------------

K-MEAN CLUSTERING

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
X,y = make_blobs(n_samples = 500,n_features = 2,centers = 3,random_state = 23)

fig = plt.figure(0)
plt.grid(True)
plt.scatter(X[:,0],X[:,1])
plt.show()

k = 3

clusters = {}
np.random.seed(23)

for idx in range(k):


center = 2*(2*np.random.random((X.shape[1],))-1)
points = []
cluster = {
'center' : center,
'points' : []
}

clusters[idx] = cluster

clusters

plt.scatter(X[:,0],X[:,1])
plt.grid(True)
for i in clusters:
center = clusters[i]['center']
plt.scatter(center[0],center[1],marker = '*',c = 'red')
plt.show()

def distance(p1,p2):
return np.sqrt(np.sum((p1-p2)**2))
#Implementing E step
def assign_clusters(X, clusters):
for idx in range(X.shape[0]):
dist = []

curr_x = X[idx]

for i in range(k):
dis = distance(curr_x,clusters[i]['center'])
dist.append(dis)
curr_cluster = np.argmin(dist)
clusters[curr_cluster]['points'].append(curr_x)
return clusters

#Implementing the M-Step


def update_clusters(X, clusters):
for i in range(k):
points = np.array(clusters[i]['points'])
if points.shape[0] > 0:
new_center = points.mean(axis =0)
clusters[i]['center'] = new_center

clusters[i]['points'] = []
return clusters

def pred_cluster(X, clusters):


pred = []
for i in range(X.shape[0]):
dist = []
for j in range(k):
dist.append(distance(X[i],clusters[j]['center']))
pred.append(np.argmin(dist))
return pred

clusters = assign_clusters(X,clusters)
clusters = update_clusters(X,clusters)
pred = pred_cluster(X,clusters)

plt.scatter(X[:,0],X[:,1],c = pred)
for i in clusters:
center = clusters[i]['center']
plt.scatter(center[0],center[1],marker = '^',c = 'red')
plt.show()

-----------------------------------

EXAMPLE TWO K-MEAN CLUSTERING

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans

X, y = load_iris(return_X_y=True)

#Find optimum number of cluster


sse = [] #SUM OF SQUARED ERROR
for k in range(1,11):
km = KMeans(n_clusters=k, random_state=2)
km.fit(X)
sse.append(km.inertia_)

sns.set_style("whitegrid")
g=sns.lineplot(x=range(1,11), y=sse)

g.set(xlabel ="Number of cluster (k)",


ylabel = "Sum Squared Error",
title ='Elbow Method')

plt.show()

kmeans = KMeans(n_clusters = 3, random_state = 2)


kmeans.fit(X)
kmeans.cluster_centers_

pred = kmeans.fit_predict(X)
pred

# PLOT the cluster center with data points

plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
plt.scatter(X[:,0],X[:,1],c = pred, cmap=cm.Accent)
plt.grid(True)
for center in kmeans.cluster_centers_:
center = center[:2]
plt.scatter(center[0],center[1],marker = '^',c = 'red')
plt.xlabel("petal length (cm)")
plt.ylabel("petal width (cm)")

plt.subplot(1,2,2)
plt.scatter(X[:,2],X[:,3],c = pred, cmap=cm.Accent)
plt.grid(True)
for center in kmeans.cluster_centers_:
center = center[2:4]
plt.scatter(center[0],center[1],marker = '^',c = 'red')
plt.xlabel("sepal length (cm)")
plt.ylabel("sepal width (cm)")
plt.show()

-------------------------------------------------------------------
Customer Segmentation using Unsupervised Machine Learning in Python
by k mean clustering

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.preprocessing import StandardScaler, LabelEncoder


from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('new.csv')
df.head()

df.shape

df.info()

df.describe().T

df['Accepted'] = df['Accepted'].str.replace('Accepted', '')

for col in df.columns:


temp = df[col].isnull().sum()
if temp > 0:
print(f'Column {col} contains {temp} null values.')

df = df.dropna()
print("Total missing values are:", len(df))
df.nunique()

parts = df["Dt_Customer"].str.split("-", n=3, expand=True)


df["day"] = parts[0].astype('int')
df["month"] = parts[1].astype('int')
df["year"] = parts[2].astype('int')

df.drop(['Z_CostContact', 'Z_Revenue', 'Dt_Customer'],


axis=1,
inplace=True)

floats, objects = [], []


for col in df.columns:
if df[col].dtype == object:
objects.append(col)
elif df[col].dtype == float:
floats.append(col)

print(objects)
print(floats)

plt.subplots(figsize=(15, 10))
for i, col in enumerate(objects):
plt.subplot(2, 2, i + 1)
sb.countplot(x=df[col])
plt.show()
df['Marital_Status'].value_counts()

for col in df.columns:


if df[col].dtype == object:
le = LabelEncoder()
df[col] = le.fit_transform(df[col])

scaler = StandardScaler()
data = scaler.fit_transform(df)

from sklearn.manifold import TSNE


model = TSNE(n_components=2, random_state=0)
tsne_data = model.fit_transform(df)
plt.figure(figsize=(7, 7))
plt.scatter(tsne_data[:, 0], tsne_data[:, 1])
plt.show()

error = []
for n_clusters in range(1, 21):
model = KMeans(init='k-means++',
n_clusters=n_clusters,
max_iter=500,
random_state=22)
model.fit(df)
error.append(model.inertia_)
plt.figure(figsize=(10, 5))
sb.lineplot(x=range(1, 21), y=error)
sb.scatterplot(x=range(1, 21), y=error)
plt.show()

# create clustering model with optimal k=5


model = KMeans(init='k-means++',
n_clusters=5,
max_iter=500,
random_state=22)
segments = model.fit_predict(df)

plt.figure(figsize=(7, 7))
sb.scatterplot(x=tsne_data[:, 0], y=tsne_data[:, 1], hue=segments)
plt.show()

-------------------------------------------------------------

Mini-Batch Gradient Descent:

# importing dependencies
import numpy as np
import matplotlib.pyplot as plt

# creating data
mean = np.array([5.0, 6.0])
cov = np.array([[1.0, 0.95], [0.95, 1.2]])
data = np.random.multivariate_normal(mean, cov, 8000)

# visualising data
plt.scatter(data[:500, 0], data[:500, 1], marker='.')
plt.show()

# train-test-split
data = np.hstack((np.ones((data.shape[0], 1)), data))

split_factor = 0.90
split = int(split_factor * data.shape[0])

X_train = data[:split, :-1]


y_train = data[:split, -1].reshape((-1, 1))
X_test = data[split:, :-1]
y_test = data[split:, -1].reshape((-1, 1))

print("Number of examples in training set = %d" % (X_train.shape[0]))
print("Number of examples in testing set = %d" % (X_test.shape[0]))

# linear regression using "mini-batch" gradient descent


# function to compute hypothesis / predictions

def hypothesis(X, theta):


return np.dot(X, theta)

# function to compute gradient of error function w.r.t. theta

def gradient(X, y, theta):


h = hypothesis(X, theta)
grad = np.dot(X.transpose(), (h - y))
return grad

# function to compute the error for current values of theta

def cost(X, y, theta):


h = hypothesis(X, theta)
J = np.dot((h - y).transpose(), (h - y))
J /= 2
return J[0]

# function to create a list containing mini-batches

def create_mini_batches(X, y, batch_size):


mini_batches = []
data = np.hstack((X, y))
np.random.shuffle(data)
n_minibatches = data.shape[0] // batch_size
i = 0

for i in range(n_minibatches + 1):


mini_batch = data[i * batch_size:(i + 1)*batch_size, :]
X_mini = mini_batch[:, :-1]
Y_mini = mini_batch[:, -1].reshape((-1, 1))
mini_batches.append((X_mini, Y_mini))
if data.shape[0] % batch_size != 0:
mini_batch = data[i * batch_size:data.shape[0]]
X_mini = mini_batch[:, :-1]
Y_mini = mini_batch[:, -1].reshape((-1, 1))
mini_batches.append((X_mini, Y_mini))
return mini_batches

# function to perform mini-batch gradient descent

def gradientDescent(X, y, learning_rate=0.001, batch_size=32):


theta = np.zeros((X.shape[1], 1))
error_list = []
max_iters = 3
for itr in range(max_iters):
mini_batches = create_mini_batches(X, y, batch_size)
for mini_batch in mini_batches:
X_mini, y_mini = mini_batch
theta = theta - learning_rate * gradient(X_mini, y_mini, theta)
error_list.append(cost(X_mini, y_mini, theta))

return theta, error_list

theta, error_list = gradientDescent(X_train, y_train)


print("Bias = ", theta[0])
print("Coefficients = ", theta[1:])

# visualising gradient descent


plt.plot(error_list)
plt.xlabel("Number of iterations")
plt.ylabel("Cost")
plt.show()
# predicting output for X_test
y_pred = hypothesis(X_test, theta)
plt.scatter(X_test[:, 1], y_test[:, ], marker='.')
plt.plot(X_test[:, 1], y_pred, color='orange')
plt.show()

# calculating error in predictions


error = np.sum(np.abs(y_test - y_pred) / y_test.shape[0])
print("Mean absolute error = ", error)

------------------------------------------

# -*- coding: utf-8 -*-


"""Untitled15.ipynb

Automatically generated by Colaboratory.

Original file is located at


https://colab.research.google.com/drive/1j5YYWDWSSosYhtqstL5P8DuC7RrkpHwx
"""

import pandas as pd
import numpy as np

# Generating synthetic data of 50000 points between value1 and 100


value1 = 1 # lower bound
data = value1 + np.random.rand(50000, 2) * (100 - value1)  # assuming 2 features, modify as needed

# Creating a DataFrame
df = pd.DataFrame(data, columns=['Feature1', 'Feature2'])

# Save to CSV
df.to_csv('synthetic_data.csv', index=False)

# Read CSV file


df = pd.read_csv('synthetic_data.csv')

# Display the first few rows of the dataset


print(df.head())
# You can explore the data further using df.describe(), df.info(), etc.

import numpy as np

def kmeans_algorithm(data, k, max_iters=100, tolerance=1e-4):


# Randomly initialize centroids
centroids = data[np.random.choice(data.shape[0], k, replace=False)]

for _ in range(max_iters):
# Assign each data point to the closest centroid
distances = np.linalg.norm(data - centroids[:, np.newaxis], axis=2)
labels = np.argmin(distances, axis=0)

# Update centroids
new_centroids = np.array([data[labels == j].mean(axis=0) for j in range(k)])

# Check for convergence


if np.linalg.norm(new_centroids - centroids) < tolerance:
break

centroids = new_centroids

return labels, centroids

import numpy as np
import matplotlib.pyplot as plt

def find_optimal_k(data, max_k=20):


inertia = []

for k in range(1, max_k + 1):


labels, centroids = kmeans_algorithm(data, k)
inertia.append(np.sum(np.min(np.linalg.norm(data - centroids[:, np.newaxis], axis=2), axis=0)))

# Plotting the Elbow Method graph


plt.plot(range(1, max_k + 1), inertia, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.show()

# Analyzing the Elbow Method graph to find the optimal K


diff = np.diff(inertia, 2)
k_optimal = np.argmax(diff) + 1

return k_optimal

# Assuming df is a DataFrame with columns 'Feature1' and 'Feature2'


data = df[['Feature1', 'Feature2']].values

# Find the optimal K


optimal_k = find_optimal_k(data)

print(f'The optimal value of K is: {optimal_k}')

# Apply K-Means clustering


labels, centroids = kmeans_algorithm(data, optimal_k)

# Plotting the clustered data


for cluster in range(optimal_k):
cluster_points = data[labels == cluster]
plt.scatter(cluster_points[:, 0], cluster_points[:, 1])

plt.title('K-Means Clustering')
plt.xlabel('Feature1')
plt.ylabel('Feature2')
plt.show()

-----------------------------------------------------------------------------------
----------------------------------------------------------------------------------
WEEK-1: Write a python program to import and export the data using pandas library

1. Manual Function
import pandas as pd

def load_csv(filepath):
    data = []
    col = []
    checkcol = False
    with open(filepath) as f:
        for val in f.readlines():
            val = val.replace("\n", "")
            val = val.split(',')
            if checkcol is False:
                col = val
                checkcol = True
            else:
                data.append(val)
    df = pd.DataFrame(data=data, columns=col)
    return df

2. Numpy.loadtxt function
import numpy as np
df = np.loadtxt('convertcsv.csv', delimiter=',')
print(df[:5, :])

3. Numpy.genfromtxt()
data = np.genfromtxt('100 Sales Records.csv', delimiter=',')
pd.DataFrame(data)

4. Pandas.read_csv()
pdDf = pd.read_csv('100 Sales Record.csv')
pdDf.head()

5. Pickle
import pickle
with open('test.pkl', 'wb') as f:
    pickle.dump(pdDf, f)
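The five methods above cover importing; for the export half of the exercise, a minimal sketch (assuming the pdDf dataframe from method 4 and the test.pkl file from method 5; the output filename is only an example) is:
# write the dataframe back out to a CSV file without the index column
pdDf.to_csv('exported_data.csv', index=False)
# read the pickled dataframe back in with pandas
reloaded = pd.read_pickle('test.pkl')
print(reloaded.head())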
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
--
WEEK-2: Data preprocessing

1. Handling missing values

The main pandas functions for handling missing values are:
- isnull()
- notnull()
- dropna()
- fillna()
- replace()
- interpolate()
dropna(), replace() and interpolate() are sketched after the fillna() examples below.
# importing pandas as pd
import pandas as pd
# importing numpy as np
import numpy as np
# dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
'Second Score': [30, 45, 56, np.nan],
'Third Score':[np.nan, 40, 80, 98]}
# creating a dataframe from list
df = pd.DataFrame(dict)
# using isnull() function
df.isnull()
# importing pandas package
import pandas as pd
# making data frame from csv file
data = pd.read_csv("employees.csv")

# creating bool series True for NaN values


bool_series = pd.isnull(data["Gender"])
# filtering data
# displaying data only with Gender = NaN
data[bool_series]
# importing pandas as pd
import pandas as pd
# importing numpy as np
import numpy as np
# dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
'Second Score': [30, 45, 56, np.nan],
'Third Score':[np.nan, 40, 80, 98]}
# creating a dataframe using dictionary
df = pd.DataFrame(dict)
# using notnull() function
df.notnull()
# importing pandas package
import pandas as pd
# making data frame from csv file
data = pd.read_csv("employees.csv")
# creating bool series True for NaN values
bool_series = pd.notnull(data["Gender"])

# filtering data
# displaying data only with Gender = Not NaN
data[bool_series]
# importing pandas as pd
import pandas as pd
# importing numpy as np
import numpy as np
# dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
'Second Score': [30, 45, 56, np.nan],
'Third Score':[np.nan, 40, 80, 98]}
# creating a dataframe from dictionary
df = pd.DataFrame(dict)
# filling missing value using fillna()
df.fillna(0)
# importing pandas as pd
import pandas as pd
# importing numpy as np
import numpy as np
# dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
'Second Score': [30, 45, 56, np.nan],
'Third Score':[np.nan, 40, 80, 98]}

# creating a dataframe from dictionary


df = pd.DataFrame(dict)
# filling a missing value with
# previous ones
df.fillna(method ='pad')
# importing pandas as pd
import pandas as pd
# importing numpy as np
import numpy as np
# dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
'Second Score': [30, 45, 56, np.nan],
'Third Score':[np.nan, 40, 80, 98]}
# creating a dataframe from dictionary
df = pd.DataFrame(dict)
# filling null value using fillna() function
df.fillna(method ='bfill')
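The dropna(), replace() and interpolate() functions listed above are not shown in the examples; a minimal sketch, reusing the same score dataframe df, is:
# drop the rows that contain at least one missing value
df.dropna()
# replace NaN values with -99
df.replace(to_replace=np.nan, value=-99)
# fill missing values by linear interpolation along each column
df.interpolate(method='linear', limit_direction='forward')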
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------

WEEK-3: Dimensionality Reduction


1. Implementation of PCA

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
#import the breast _cancer dataset
from sklearn.datasets import load_breast_cancer
data=load_breast_cancer()
data.keys()
# Check the output classes
print(data['target_names'])
# Check the input attributes
print(data['feature_names'])
# construct a dataframe using pandas
df1=pd.DataFrame(data['data'],columns=data['feature_names'])
# Scale data before applying PCA
scaling=StandardScaler()
# Use fit and transform method
scaling.fit(df1)
Scaled_data=scaling.transform(df1)
# Set the n_components=3
principal=PCA(n_components=3)
principal.fit(Scaled_data)
x=principal.transform(Scaled_data)

# Check the dimensions of data after PCA


print(x.shape)
# Check the values of eigen vectors
# produced by the principal components
principal.components_
plt.figure(figsize=(10,10))
plt.scatter(x[:,0],x[:,1],c=data['target'],cmap='plasma')
plt.xlabel('pc1')
plt.ylabel('pc2')
# import relevant libraries for 3d graph
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(10,10))
# choose projection 3d for creating a 3d graph
axis = fig.add_subplot(111, projection='3d')
# x[:,0]is pc1,x[:,1] is pc2 while x[:,2] is pc3
axis.scatter(x[:,0],x[:,1],x[:,2], c=data['target'],cmap='plasma')
axis.set_xlabel("PC1", fontsize=10)
axis.set_ylabel("PC2", fontsize=10)
axis.set_zlabel("PC3", fontsize=10)

-----------------------------------------------------------------
WEEK-4: Write a python program to demonstrate various data visualisations

# importing pandas package


import pandas as pd
# making data frame from csv file
data = pd.read_csv("employees.csv")
# Printing the first 10 to 24 rows of
# the data frame for visualization
data[10:25]
# importing pandas and numpy packages
import pandas as pd
import numpy as np
# making data frame from csv file
data = pd.read_csv("employees.csv")
# will replace NaN values in the dataframe with the value -99
data.replace(to_replace = np.nan, value = -99)
# importing pandas as pd
import pandas as pd
# Creating the dataframe

df = pd.DataFrame({"A":[12, 4, 5, None, 1],


"B":[None, 2, 54, 3, None],
"C":[20, 16, None, 3, 8],
"D":[14, 3, None, None, 6]})
# Print the dataframe
Df
# importing the required module
import matplotlib.pyplot as plt
# x axis values
x = [1,2,3]
# corresponding y axis values
y = [2,4,1]
# plotting the points
plt.plot(x, y)
# naming the x axis
plt.xlabel('x - axis')
# naming the y axis
plt.ylabel('y - axis')
# giving a title to my graph
plt.title('My first graph!')
# function to show the plot
plt.show()
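Beyond the line plot above, a couple of other common matplotlib chart types, sketched with small made-up values, are:
# bar chart
x = [1, 2, 3, 4]
heights = [10, 24, 36, 40]
plt.bar(x, heights, color='green')
plt.title('Bar chart')
plt.show()
# histogram
ages = [2, 5, 70, 40, 30, 45, 50, 45, 43, 40, 44, 60, 7, 13, 57, 18, 90, 77, 32]
plt.hist(ages, bins=10, color='blue')
plt.title('Histogram')
plt.show()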

def predict(info, test):
probabilities = calculateClassProbabilities(info, test)
bestLabel, bestProb = None, -1
for classValue, probability in probabilities.items():
if bestLabel is None or probability > bestProb:
bestProb = probability
bestLabel = classValue
return bestLabel
def getPredictions(info, test):
predictions = []
for i in range(len(test)):
result = predict(info, test[i])
predictions.append(result)
return predictions
def accuracy_rate(test, predictions):
correct = 0
for i in range(len(test)):
if test[i][-1] == predictions[i]:
correct += 1
return (correct / float(len(test))) * 100.0
filename = r'E:\user\MACHINE LEARNING\machine learning algos\Naive bayes\filedata.csv'
mydata = csv.reader(open(filename, "rt"))
mydata = list(mydata)
mydata = encode_class(mydata)
for i in range(len(mydata)):
mydata[i] = [float(x) for x in mydata[i]]
ratio = 0.7

train_data, test_data = splitting(mydata, ratio)


print('Total number of examples are: ', len(mydata))
print('Out of these, training examples are: ', len(train_data))
print("Test examples are: ", len(test_data))
info = MeanAndStdDevForClass(train_data)
predictions = getPredictions(info, test_data)
accuracy = accuracy_rate(test_data, predictions)
print("Accuracy of your model is: ", accuracy)
1. Implementation of SVM Classification
# importing scikit learn with make_blobs
from sklearn.datasets import make_blobs
# creating datasets X containing n_samples
# Y containing two classes
X, Y = make_blobs(n_samples=500, centers=2,random_state=0, cluster_std=0.40)
import matplotlib.pyplot as plt
# plotting scatters
plt.scatter(X[:, 0], X[:, 1], c=Y, s=50, cmap='spring');
plt.show()
# creating linspace between -1 to 3.5
import numpy as np
xfit = np.linspace(-1, 3.5)
# plotting scatter
plt.scatter(X[:, 0], X[:, 1], c=Y, s=50, cmap='spring')
# plot a line between the different sets of data
for m, b, d in [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]:
yfit = m * xfit + b
plt.plot(xfit, yfit, '-k')
plt.fill_between(xfit, yfit - d, yfit + d, edgecolor='none',
color='#AAAAAA', alpha=0.4)
plt.xlim(-1, 3.5);
plt.show()

# importing required libraries


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
x = pd.read_csv("C:\...\cancer.csv")
a = np.array(x)
y = a[:,30] # classes having 0 and 1
x = np.column_stack((x.malignant,x.benign))
x.shape
print (x),(y)

----------------------------------------------------------------------------------

WEEK-5: Supervised Learning


1. Implementation of Linear Regression

import numpy as np
import matplotlib.pyplot as plt
def estimate_coef(x, y):
n = np.size(x)
m_x = np.mean(x)
m_y = np.mean(y)
SS_xy = np.sum(y*x) - n*m_y*m_x
SS_xx = np.sum(x*x) - n*m_x*m_x
b_1 = SS_xy / SS_xx
b_0 = m_y - b_1*m_x
return (b_0, b_1)
def plot_regression_line(x, y, b):
# plotting the actual points as scatter plot
plt.scatter(x, y, color = "m",marker = "o", s = 30)
y_pred = b[0] + b[1]*x
plt.plot(x, y_pred, color = "g")
plt.xlabel('x')
plt.ylabel('y')
plt.show()
def main():
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
b = estimate_coef(x, y)
print("Estimated coefficients:\nb_0 = {}\\nb_1 = {}".format(b[0], b[1]))
plot_regression_line(x, y, b)
if __name__ == "__main__":

-----------------------------------------------------------------------------------
--------------------------------------------------------------------------

WEEK-6 : Implementation of Logistic regression

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings( "ignore" )
class LogitRegression() :
def __init__( self, learning_rate, iterations ) :
self.learning_rate = learning_rate
self.iterations = iterations
def fit( self, X, Y ) :
self.m, self.n = X.shape
self.W = np.zeros( self.n )
self.b = 0
self.X = X
self.Y = Y
for i in range( self.iterations ) :
self.update_weights()
return self
def update_weights( self ) :
A = 1 / ( 1 + np.exp( - ( self.X.dot( self.W ) + self.b ) ) )
tmp = ( A - self.Y.T )
tmp = np.reshape( tmp, self.m )
dW = np.dot( self.X.T, tmp ) / self.m
db = np.sum( tmp ) / self.m
self.W = self.W - self.learning_rate * dW
self.b = self.b - self.learning_rate * db
return self
def predict( self, X ) :
Z = 1 / ( 1 + np.exp( - ( X.dot( self.W ) + self.b ) ) )
Y = np.where( Z > 0.5, 1, 0 )
return Y
def main() :
df = pd.read_csv( "diabetes.csv" )
X = df.iloc[:,:-1].values
Y = df.iloc[:,-1:].values
X_train, X_test, Y_train, Y_test = train_test_split(
X, Y, test_size = 1/3, random_state = 0 )
model = LogitRegression( learning_rate = 0.01, iterations = 1000 )
model.fit( X_train, Y_train )
model1 = LogisticRegression()
model1.fit( X_train, Y_train)
Y_pred = model.predict( X_test )
Y_pred1 = model1.predict( X_test )
correctly_classified = 0
correctly_classified1 = 0
count = 0
for count in range( np.size( Y_pred ) ) :
if Y_test[count] == Y_pred[count] :
correctly_classified = correctly_classified + 1
if Y_test[count] == Y_pred1[count] :
correctly_classified1 = correctly_classified1 + 1
count = count + 1
print( "Accuracy on test set by our model : ", (
correctly_classified / count ) * 100 )
print( "Accuracy on test set by sklearn model : ", (
correctly_classified1 / count ) * 100 )
if __name__ == "__main__" :
main()

-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
--

WEEK-7: Supervised Learning


1. Implementation of Decision tree classification

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
def importdata():
balance_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-' +
                           'databases/balance-scale/balance-scale.data',
                           sep=',', header=None)
print ("Dataset Length: ", len(balance_data))
print ("Dataset Shape: ", balance_data.shape)
print ("Dataset: ",balance_data.head())
return balance_data
def splitdataset(balance_data):
X = balance_data.values[:, 1:5]
Y = balance_data.values[:, 0]
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size = 0.3, random_state = 100)
return X, Y, X_train, X_test, y_train, y_test
def train_using_gini(X_train, X_test, y_train):
clf_gini = DecisionTreeClassifier(criterion = "gini",random_state =
100,max_depth=3,
min_samples_leaf=5)
clf_gini.fit(X_train, y_train)
return clf_gini
def train_using_entropy(X_train, X_test, y_train):
    clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100,
                                         max_depth=3, min_samples_leaf=5)
clf_entropy.fit(X_train, y_train)

return clf_entropy
def prediction(X_test, clf_object):
y_pred = clf_object.predict(X_test)
print("Predicted values:")
print(y_pred)
return y_pred
def cal_accuracy(y_test, y_pred):
    print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))
    print("Accuracy : ", accuracy_score(y_test, y_pred)*100)
    print("Report : ", classification_report(y_test, y_pred))
def main():
data = importdata()
X, Y, X_train, X_test, y_train, y_test = splitdataset(data)
clf_gini = train_using_gini(X_train, X_test, y_train)
clf_entropy = train_using_entropy(X_train, X_test, y_train)
print("Results Using Gini Index:")
y_pred_gini = prediction(X_test, clf_gini)
cal_accuracy(y_test, y_pred_gini)
print("Results Using Entropy:")
y_pred_entropy = prediction(X_test, clf_entropy)
cal_accuracy(y_test, y_pred_entropy)
if __name__=="__main__":
main()
1. Implementation of K-nearest Neighbor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import numpy as np
import matplotlib.pyplot as plt

irisData = load_iris()
X = irisData.data
y = irisData.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
for i, k in enumerate(neighbors):
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
train_accuracy[i] = knn.score(X_train, y_train)
test_accuracy[i] = knn.score(X_test, y_test)
plt.plot(neighbors, test_accuracy, label = 'Testing dataset Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training dataset Accuracy')
plt.legend()
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()

-----------------------------------------------------------------------------------
------------------------------------------

WEEK-8
Implementation of Naïve Bayes classifier algorithm

import math
import random
import csv
def encode_class(mydata):
classes = []
for i in range(len(mydata)):
if mydata[i][-1] not in classes:
classes.append(mydata[i][-1])
for i in range(len(classes)):
for j in range(len(mydata)):
if mydata[j][-1] == classes[i]:
mydata[j][-1] = i
return mydata
def splitting(mydata, ratio):
train_num = int(len(mydata) * ratio)
train = []
test = list(mydata)
while len(train) < train_num:
index = random.randrange(len(test))
train.append(test.pop(index))
return train, test
def groupUnderClass(mydata):
dict = {}
for i in range(len(mydata)):
if (mydata[i][-1] not in dict):
dict[mydata[i][-1]] = []
dict[mydata[i][-1]].append(mydata[i])
return dict

def mean(numbers):
    return sum(numbers) / float(len(numbers))

def std_dev(numbers):
avg = mean(numbers)
variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
return math.sqrt(variance)
def MeanAndStdDev(mydata):
info = [(mean(attribute), std_dev(attribute)) for attribute in zip(*mydata)]
del info[-1]
return info
def MeanAndStdDevForClass(mydata):
info = {}
dict = groupUnderClass(mydata)
for classValue, instances in dict.items():
info[classValue] = MeanAndStdDev(instances)
return info
def calculateGaussianProbability(x, mean, stdev):
expo = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
return (1 / (math.sqrt(2 * math.pi) * stdev)) * expo
def calculateClassProbabilities(info, test):
probabilities = {}
for classValue, classSummaries in info.items():
probabilities[classValue] = 1
for i in range(len(classSummaries)):
mean, std_dev = classSummaries[i]
x = test[i]
probabilities[classValue] *= calculateGaussianProbability(x, mean, std_dev)
return probabilities

-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
--
Week-9: Implementation of K-nearest Neighbor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import numpy as np
import matplotlib.pyplot as plt
irisData = load_iris()
X = irisData.data
y = irisData.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
for i, k in enumerate(neighbors):
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
train_accuracy[i] = knn.score(X_train, y_train)
test_accuracy[i] = knn.score(X_test, y_test)
plt.plot(neighbors, test_accuracy, label = 'Testing dataset Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training dataset Accuracy')
plt.legend()
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()

-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
-------

WEEK-10: Build Artificial Neural Network model with back propagation

Let's first understand the term neural network. In a neural network, neurons are fed inputs; each neuron computes a weighted sum over its inputs, passes that sum through an activation function, and sends the resulting output on to the neurons of the next layer.
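As a minimal sketch of that idea (standalone, with made-up input and weight values; not part of the Keras example below):

import numpy as np
x = np.array([0.5, 0.1, 0.4])      # inputs to the neuron
w = np.array([0.2, 0.8, -0.5])     # weights
b = 0.1                            # bias
z = np.dot(w, x) + b               # weighted sum over the inputs
output = 1 / (1 + np.exp(-z))      # sigmoid activation passed on to the next layer
print(output)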

Python: To run our script


Pip: Necessary to install Python packages
pip install tensorflow
pip install keras
# Importing libraries
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Embedding
from keras.preprocessing import sequence

# Our dictionary will contain only the top 7000 most frequently appearing words
top_words = 7000

# Now we split our data-set into training and test data
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Looking at the nature of training data
print(X_train[0])
print(y_train[0])
print('Shape of training data: ')
print(X_train.shape)
print(y_train.shape)
print('Shape of test data: ')
print(X_test.shape)
print(y_test.shape)

# Padding the data samples to a maximum review length in words
max_words = 450
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

# Building the CNN Model
model = Sequential()  # initializing the Sequential nature for the CNN model

# Adding the embedding layer which will take in a maximum of 450 words as input
# and provide a 32-dimensional output of those words which belong in the
# top_words dictionary
model.add(Embedding(top_words, 32, input_length=max_words))


model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',
metrics=['accuracy'])
model.summary()

-----------------------------------------------------------------------------------
----------------------------------------------------------------------------
WEEK-11
Implementing Random Forest
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
data = pd.read_csv('Salaries.csv')
print(data)
# Fitting Random Forest Regression to the dataset
# import the regressor
from sklearn.ensemble import RandomForestRegressor
# select the feature (x) and target (y) columns from the dataset
# (assumed layout: position level in column 1, salary in column 2, as in the usual Salaries.csv example)
x = data.iloc[:, 1:2].values
y = data.iloc[:, 2].values
# create regressor object
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
# fit the regressor with x and y data
regressor.fit(x, y)
# test the output by changing values
Y_pred = regressor.predict(np.array([6.5]).reshape(1, 1))
# Visualising the Random Forest Regression results
# arrange for creating a range of values
# from min value of x to max
# value of x with a difference of 0.01
# between two consecutive values
X_grid = np.arange(min(x), max(x), 0.01)
# reshape for reshaping the data into a len(X_grid)*1 array,
# i.e. to make a column out of the X_grid value
X_grid = X_grid.reshape((len(X_grid), 1))
# Scatter plot for original data
plt.scatter(x, y, color = 'blue')
# plot predicted data
plt.plot(X_grid, regressor.predict(X_grid),color = 'green')
plt.title('Random Forest Regression')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

------------------------------------------------------

WEEK-11(B) : Model Selection, Bagging and Boosting


1. Cross Validation
# This code may not be run on GFG IDE
# as required packages are not found.
# importing KFold cross-validation from sklearn
# (the old sklearn.cross_validation module has been replaced by sklearn.model_selection)
from sklearn.model_selection import KFold
# value of K is 10.
kf = KFold(n_splits=10)
# kf.split(train_set) then yields (train_indices, test_indices) pairs for each of the 10 folds
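A short, self-contained sketch of using cross-validation to score a model (with sklearn's built-in iris data rather than the csv file used in the AdaBoost example below):
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(random_state=0)
# accuracy on each of the 10 folds, then the mean
scores = cross_val_score(clf, X, y, cv=KFold(n_splits=10, shuffle=True, random_state=0))
print("Mean CV accuracy:", scores.mean())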
2. Implementing AdaBoost
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
import warnings
warnings.filterwarnings("ignore")
# Reading the dataset from the csv file
# separator is a vertical line, as seen in the dataset
data = pd.read_csv("Iris.csv")
# Printing the shape of the dataset
print(data.shape)
data = data.drop('Id',axis=1)
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
print("Shape of X is %s and shape of y is %s"%(X.shape,y.shape))
total_classes = y.nunique()
print("Number of unique species in dataset are: ",total_classes)
distribution = y.value_counts()
print(distribution)
X_train,X_val,Y_train,Y_val = train_test_split(X,y,test_size=0.25,random_state=28)
# creating and fitting the AdaBoost classifier before evaluating it
adb_model = AdaBoostClassifier()
adb_model.fit(X_train, Y_train)
print("The accuracy of the model on validation set is",
      adb_model.score(X_val, Y_val))
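The week's title also covers bagging; a minimal sketch (reusing the X_train/X_val split created above, with a decision tree as the base estimator) is:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# bagging: train several trees on bootstrap samples of the training data and combine their votes
bag_model = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, random_state=28)
bag_model.fit(X_train, Y_train)
print("The accuracy of the bagging model on validation set is", bag_model.score(X_val, Y_val))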

-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
--WEEK-12: Unsupervised Learning
Implementing K-means Clustering

import math
import sys
from random import shuffle, uniform

def ReadData(fileName):
# Read the file, splitting by lines
f = open(fileName, 'r');
lines = f.read().splitlines();
f.close();
items = [];
for i in range(1, len(lines)):
line = lines[i].split(',');
itemFeatures = [];
for j in range(len(line)-1):
# Convert feature value to float
v = float(line[j]);
# Add feature value to dict
itemFeatures.append(v);
items.append(itemFeatures);
shuffle(items);
return items;
def FindColMinMax(items):
    n = len(items[0]);
    minima = [sys.maxsize for i in range(n)];
    maxima = [-sys.maxsize - 1 for i in range(n)];
for item in items:
for f in range(len(item)):
if (item[f] < minima[f]):
minima[f] = item[f];
if (item[f] > maxima[f]):
maxima[f] = item[f];
return minima,maxima;

def InitializeMeans(items, k, cMin, cMax):


# Initialize means to random numbers between
# the min and max of each column/feature
f = len(items[0]); # number of features
means = [[0 for i in range(f)] for j in range(k)];
for mean in means:
for i in range(len(mean)):
# Set value to a random float
# (adding +-1 to avoid a wide placement of a mean)
mean[i] = uniform(cMin[i]+1, cMax[i]-1);
return means;
def EuclideanDistance(x, y):
S = 0; # The sum of the squared differences of the elements
for i in range(len(x)):
S += math.pow(x[i]-y[i], 2)
#The square root of the sum
return math.sqrt(S)
def UpdateMean(n,mean,item):
for i in range(len(mean)):
m = mean[i];
m = (m*(n-1)+item[i])/float(n);
mean[i] = round(m, 3);
return mean;
def Classify(means,item):
# Classify item to the mean with minimum distance
minimum = sys.maxsize;
index = -1;
for i in range(len(means)):
# Find distance from item to mean

dis = EuclideanDistance(item, means[i]);


if (dis < minimum):
minimum = dis;
index = i;
return index;
def CalculateMeans(k,items,maxIterations=100000):
# Find the minima and maxima for columns
cMin, cMax = FindColMinMax(items);
# Initialize means at random points
means = InitializeMeans(items,k,cMin,cMax);
# Initialize clusters, the array to hold
# the number of items in a class
clusterSizes= [0 for i in range(len(means))];
# An array to hold the cluster an item is in
belongsTo = [0 for i in range(len(items))];
# Calculate means
for e in range(maxIterations):
# If no change of cluster occurs, halt
noChange = True;
for i in range(len(items)):
item = items[i];
# Classify item into a cluster and update the
# corresponding means.
index = Classify(means,item);
clusterSizes[index] += 1;
cSize = clusterSizes[index];
means[index] = UpdateMean(cSize,means[index],item);
# Item changed cluster
if(index != belongsTo[i]):
noChange = False;
belongsTo[i] = index;

# Nothing changed, return


if (noChange):
break;
return means;
def FindClusters(means,items):
clusters = [[] for i in range(len(means))]; # Init clusters
for item in items:
# Classify item into a cluster
index = Classify(means,item);
# Add item to cluster
clusters[index].append(item);
return clusters;
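The functions above are defined but never invoked in this listing. A minimal driver is sketched below; the file name ("Iris.csv", with a header row and the class label in the last column) and k = 3 are assumptions, not part of the original program.

# Hypothetical driver for the scratch K-means above (file name and k are assumptions)
if __name__ == "__main__":
    items = ReadData("Iris.csv")      # features only; the last column is dropped by ReadData
    k = 3                             # assumed number of clusters
    means = CalculateMeans(k, items)
    clusters = FindClusters(means, items)
    print("Final means:")
    for m in means:
        print(m)
    print("Cluster sizes:", [len(c) for c in clusters])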

-----------------------------------------------------------------------------------

ID3 decision tree (constructing the hypothesis)

import math
import csv
def load_csv(filename):
lines=csv.reader(open(filename,"r"));
dataset = list(lines)
headers = dataset.pop(0)
return dataset,headers
class Node:
def __init__(self, attribute):
self.attribute=attribute
self.children=[]
self.answer=""
def subtables(data,col,delete):
dic={}
coldata=[row[col] for row in data]
attr=list(set(coldata))
counts=[0]*len(attr)
r=len(data)
c=len(data[0])
for x in range(len(attr)):
for y in range(r):
if data[y][col]==attr[x]:
counts[x]+=1
for x in range(len(attr)):
dic[attr[x]]=[[0 for i in range(c)] for j in range(counts[x])]
pos=0
for y in range(r):
if data[y][col]==attr[x]:
if delete:
del data[y][col]
dic[attr[x]][pos]=data[y]
pos+=1
return attr,dic
def entropy(S):
attr=list(set(S))
if len(attr)==1:
return 0
counts=[0,0]
for i in range(2):
counts[i]=sum([1 for x in S if attr[i]==x])/(len(S)*1.0)
sums=0
for cnt in counts:
sums+=-1*cnt*math.log(cnt,2)
return sums
def compute_gain(data,col):
attr,dic = subtables(data,col,delete=False)
total_size=len(data)
entropies=[0]*len(attr)
ratio=[0]*len(attr)
total_entropy=entropy([row[-1] for row in data])
for x in range(len(attr)):
ratio[x]=len(dic[attr[x]])/(total_size*1.0)
entropies[x]=entropy([row[-1] for row in dic[attr[x]]])
total_entropy-=ratio[x]*entropies[x]
return total_entropy
def build_tree(data,features):
lastcol=[row[-1] for row in data]
if(len(set(lastcol)))==1:
node=Node("")
node.answer=lastcol[0]
return node
n=len(data[0])-1
gains=[0]*n
for col in range(n):
gains[col]=compute_gain(data,col)
split=gains.index(max(gains))
node=Node(features[split])
fea = features[:split]+features[split+1:]
attr,dic=subtables(data,split,delete=True)
for x in range(len(attr)):
child=build_tree(dic[attr[x]],fea)
node.children.append((attr[x],child))
return node
def print_tree(node,level):
if node.answer!="":
print(" "*level,node.answer)
return
print(" "*level,node.attribute)
for value,n in node.children:
print(" "*(level+1),value)
print_tree(n,level+2)
def classify(node,x_test,features):
if node.answer!="":
print(node.answer)
return
pos=features.index(node.attribute)
for value, n in node.children:
if x_test[pos]==value:
classify(n,x_test,features)
'''Main program'''
dataset,features=load_csv("data3.csv")
node1=build_tree(dataset,features)
print("The decision tree for the dataset using ID3 algorithm
is")
print_tree(node1,0)
testdata,features=load_csv("data3_test.csv")
for xtest in testdata:
print("The test instance:",xtest)
print("The label for test instance:",end=" ")
classify(node1,xtest,features)
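As a cross-check of the scratch ID3 tree, a library-based tree can be trained on the same file. The sketch below is an addition, not the original exercise; it assumes data3.csv holds categorical attributes with the class label in the last column, which therefore need one-hot encoding before scikit-learn can use them.

# Hedged sketch: scikit-learn decision tree (entropy criterion, as in ID3) on the same CSV
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text

df = pd.read_csv("data3.csv")
X = pd.get_dummies(df.iloc[:, :-1])   # one-hot encode the categorical attributes
y = df.iloc[:, -1]

tree = DecisionTreeClassifier(criterion="entropy")  # information gain, as in ID3
tree.fit(X, y)
print(export_text(tree, feature_names=list(X.columns)))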

-----------------------------------------------------------------------

import pandas as pd
msg=pd.read_csv('naivetext.csv',names=['message','label'])
print('The dimensions of the dataset',msg.shape)
msg['labelnum']=msg.label.map({'pos':1,'neg':0})
X=msg.message
y=msg.labelnum
print(X)
print(y)
#splitting the dataset into train and test data
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(X,y)
print ('\n The total number of Training Data :',ytrain.shape)
print ('\n The total number of Test Data :',ytest.shape)
#output of count vectoriser is a sparse matrix
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm=count_vect.transform(xtest)
print('\n The words or Tokens in the text documents \n')
print(count_vect.get_feature_names_out())
df=pd.DataFrame(xtrain_dtm.toarray(),columns=count_vect.get_feature_names_out())
# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(xtrain_dtm,ytrain)
predicted = clf.predict(xtest_dtm)
#printing accuracy, Confusion matrix, Precision and Recall
from sklearn import metrics
print('\n Accuracy of the classifier is',
metrics.accuracy_score(ytest,predicted))
print('\n Confusion matrix')
print(metrics.confusion_matrix(ytest,predicted))
print('\n The value of Precision' ,
metrics.precision_score(ytest,predicted))
print('\n The value of Recall' ,
metrics.recall_score(ytest,predicted))

-----------------------------------------------------------------------------------
---------------

10. Apply the EM algorithm to cluster a set of data stored in a .CSV file. Use the same
data set for clustering using the k-Means algorithm. Compare the results of these two
algorithms and comment on the quality of clustering. You can add Python ML library
classes/API in the program.
SOURCE CODE:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm
import pandas as pd
import numpy as np
iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
X.columns = ['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width']
y = pd.DataFrame(iris.target)
y.columns = ['Targets']
model = KMeans(n_clusters=3)
model.fit(X)
plt.figure(figsize=(14,7))
colormap = np.array(['red', 'lime', 'black'])
# Plot the Original Classifications
plt.subplot(1, 2, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Real Classification')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
# Plot the Model's Classifications
plt.subplot(1, 2, 2)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[model.labels_], s=40)
plt.title('K-Mean Classification')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
print('The accuracy score of K-Mean: ', sm.accuracy_score(y, model.labels_))
print('The Confusion matrix of K-Mean: ', sm.confusion_matrix(y, model.labels_))
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns = X.columns)
#xs.sample(5)
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)
y_gmm = gmm.predict(xs)  #y_cluster_gmm
plt.subplot(2, 2, 3)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y_gmm], s=40)
plt.title('GMM Classification')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
print('The accuracy score of EM: ', sm.accuracy_score(y, y_gmm))
print('The Confusion matrix of EM: ', sm.confusion_matrix(y, y_gmm))
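Because cluster labels are an arbitrary permutation of the true class indices, accuracy_score can understate how good either clustering is. A permutation-invariant measure such as the adjusted Rand index is usually fairer; the two lines below are an addition to the original listing, using the same sm (sklearn.metrics) alias.

# Permutation-invariant clustering quality (addition, not in the original program)
print('Adjusted Rand index of K-Means:', sm.adjusted_rand_score(iris.target, model.labels_))
print('Adjusted Rand index of EM (GMM):', sm.adjusted_rand_score(iris.target, y_gmm))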

-----------------------------------------------------------------------------------
----------
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import datasets
iris=datasets.load_iris()
x = iris.data
y = iris.target
print ('sepal-length', 'sepal-width', 'petal-length', 'petal-width')
print(x)
print('class: 0-Iris-Setosa, 1- Iris-Versicolour, 2- Iris-Virginica')
print(y)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)
# Train the model with K=5 nearest neighbours
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(x_train, y_train)
#to make predictions on our test data
y_pred=classifier.predict(x_test)
print('Confusion Matrix')
print(confusion_matrix(y_test,y_pred))
print('Accuracy Metrics')
print(classification_report(y_test,y_pred))
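The choice K=5 above is arbitrary. A hedged sketch for picking K by 5-fold cross-validation on the training split is given below; the range 1 to 20 is an assumption, not part of the original program.

# Hedged sketch: choose K for KNN by cross-validation (range of K is an assumption)
from sklearn.model_selection import cross_val_score
best_k, best_score = 1, 0.0
for k in range(1, 21):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), x_train, y_train, cv=5)
    if scores.mean() > best_score:
        best_k, best_score = k, scores.mean()
print("Best K:", best_k, "with CV accuracy:", round(best_score, 3))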

--------------------------------------------------------------

Obtain the line of regression for the dataset

import numpy as np
import pandas as pd
from numpy import genfromtxt  # (not used below)
from sklearn import linear_model
from sklearn.model_selection import train_test_split,cross_val_score
from matplotlib import pyplot as plt
from matplotlib import colors
df=pd.read_csv("data_for_regression_coefficient.csv")

df.head(10)

x=df['x'].values.reshape(df['x'].count(),1)
y=df['y'].values.reshape(df['y'].count(),1)
print(x.shape)
print(y.shape)

X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.33,random_state=9892)

simp_reg=linear_model.LinearRegression(fit_intercept=True) # create a scikit-learn linear regression model object
simp_reg.fit(x, y) # fit the model (note: fitted on the full dataset, not only the training split)

print('coefficient : \n'+ str(simp_reg.coef_))


print('Intercept:\n' +str(simp_reg.intercept_))
#the mean squared error
print('Mean Squared Error: '+str(np.mean((simp_reg.predict(X_test)-y_test)**2)))
print('Variance: '+str(simp_reg.score(X_test,y_test)))

plt.plot(x, y, 'ro') # scatter plot showing actual data
plt.plot(x, simp_reg.predict(x), color='g') # regression line (predicted values)
plt.title('Actual vs Predicted')
plt.xlabel('X')
plt.ylabel('y')
plt.show()

-------------------------
Question: find the line of regression and estimate the height of the son when the
height of the father is 164.

data=pd.read_csv('heights_of_father_and_son.csv')

x=data['Height of fathers'].values.reshape(data['Height of fathers'].count(),1)


y=data['Height of sons'].values.reshape(data['Height of sons'].count(),1)
print(x.shape)
print(y.shape)

X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.33,random_state=9892)

simp_reg=linear_model.LinearRegression(fit_intercept=True) # create a scikit-learn linear regression model object
simp_reg.fit(x, y) # fit the model (note: fitted on the full dataset, not only the training split)

print('coefficient : \n'+ str(simp_reg.coef_))


print('Intercept:\n' +str(simp_reg.intercept_))
#the mean squared error
print('Mean Squared Error: '+str(np.mean((simp_reg.predict(X_test)-y_test)**2)))
print('Variance Score: '+str(simp_reg.score(X_test,y_test)))

y = 0.61023622*164 + 66.11417323  # estimated height of the son for a father's height of 164, using the fitted slope and intercept
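Equivalently, the fitted model can produce the same estimate directly (a short sketch; the hard-coded coefficients above came from the fitted model).

# Same estimate via the fitted model rather than hard-coded coefficients
est = simp_reg.predict([[164]])
print("Estimated height of the son for a father of height 164:", est[0][0])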

------------------------------------------------------------------------

PCA (principal component analysis)

import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Generate some example data


np.random.seed(42)
data = np.random.rand(100, 2) # 100 samples, 2 features

# Apply PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(data)

# Explained variance ratio


explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance_ratio)

# Plot the original data


plt.scatter(data[:, 0], data[:, 1], alpha=0.7, label='Original Data')

# Plot the transformed data after PCA


plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.7, label='PCA Result')
# Set labels and title
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('PCA Example')
# Add legend
plt.legend()

# Show the plot


plt.show()

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Assuming you have a dataset X, replace the following line with your data
# For example, X = np.random.rand(100, 3) # 100 samples, 3 features

# Create a sample dataset


np.random.seed(42)
X = np.random.rand(100, 2) # 100 samples, 2 features

# Step 1: Standardize the data


scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Step 2: Compute the covariance matrix


cov_matrix = np.cov(X_std, rowvar=False)

# Step 3: Compute eigenvectors and eigenvalues


eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

# Step 4: Sort eigenvalues and corresponding eigenvectors


sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

# Step 5: Choose the number of components


# For example, let's keep the first two principal components
num_components = 2
selected_components = eigenvectors[:, :num_components]

# Step 6: Project the data onto the selected components


X_pca = np.dot(X_std, selected_components)

# Plot the original data and the data after PCA


plt.figure(figsize=(10, 5))

plt.subplot(1, 2, 1)
plt.scatter(X_std[:, 0], X_std[:, 1])
plt.title('Original Data')
plt.xlabel('Standardized Feature 1')
plt.ylabel('Standardized Feature 2')

plt.subplot(1, 2, 2)
plt.scatter(X_pca[:, 0], X_pca[:, 1])
plt.title('Data after PCA')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
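As a sanity check (an addition, not part of the original listing), the manual eigen-decomposition projection should match scikit-learn's PCA on the same standardized data, up to the sign of each component.

# Hedged check: manual PCA vs sklearn PCA (component signs may differ, so align them first)
pca_check = PCA(n_components=num_components)
X_pca_sklearn = pca_check.fit_transform(X_std)
for j in range(num_components):
    if np.sign(X_pca_sklearn[0, j]) != np.sign(X_pca[0, j]):
        X_pca_sklearn[:, j] *= -1
print("Max absolute difference between manual and sklearn PCA:", np.max(np.abs(X_pca - X_pca_sklearn)))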

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml
import pandas as pd

# Load the MNIST dataset (you can replace this with your own image dataset)
mnist = fetch_openml('mnist_784', as_frame=False)  # return NumPy arrays rather than a DataFrame
images = mnist.data.astype(float)
labels = mnist.target.astype(int)

# Standardize the pixel values


images /= 255.0

# Reshape images to 28x28 (assuming MNIST-like images)


images = images.reshape(-1, 28, 28)

# Flatten each image into a 1D array


flattened_images = images.reshape(images.shape[0], -1)

# Convert to DataFrame
df = pd.DataFrame(flattened_images)

# Apply PCA
n_components = 50 # You can adjust this number based on your requirements
pca = PCA(n_components=n_components)
flattened_images_pca = pca.fit_transform(df.values)
# Visualize original and PCA-transformed images
fig, axes = plt.subplots(2, 10, figsize=(10, 2))

for i in range(10):
axes[0, i].imshow(flattened_images[i].reshape(28, 28), cmap='gray')
axes[0, i].axis('off')
    axes[1, i].imshow(pca.inverse_transform(flattened_images_pca[i]).reshape(28, 28), cmap='gray')
axes[1, i].axis('off')

axes[0, 0].set_title('Original Images')


axes[1, 0].set_title('PCA Transformed Images')

plt.show()
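The value of 50 components above is arbitrary. A short sketch that inspects the cumulative explained variance of the fitted PCA can help decide how many components to keep; the idea of reading off a threshold such as 95% is an assumption, not part of the original listing.

# Cumulative explained variance of the 50 fitted components (threshold choice is an assumption)
cumulative = np.cumsum(pca.explained_variance_ratio_)
print("Variance retained by 50 components:", round(cumulative[-1], 4))
plt.plot(range(1, len(cumulative) + 1), cumulative)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.title('Choosing n_components')
plt.show()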
