0% found this document useful (0 votes)
13 views41 pages

ML All Projectpdf Removed

The document outlines a series of practical tasks related to machine learning, including Linear Regression, Naive Bayes Classification, K-Nearest Neighbors, and more. Each experiment includes code snippets for implementation, data handling, and model evaluation using various datasets. The document emphasizes techniques such as regularization, cross-validation, and the Elbow Method for optimizing model parameters.

Uploaded by

ujjwalkumar
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
13 views41 pages

ML All Projectpdf Removed

The document outlines a series of practical tasks related to machine learning, including Linear Regression, Naive Bayes Classification, K-Nearest Neighbors, and more. Each experiment includes code snippets for implementation, data handling, and model evaluation using various datasets. The document emphasizes techniques such as regularization, cross-validation, and the Elbow Method for optimizing model parameters.

Uploaded by

ujjwalkumar
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 41

LIST OF PRACTICALS

S.NO. PRACTICAL TASK PAGE NO.


1. LINEAR REGRESSION. 3

2. NAIVE BAYES CLASSIFICATION. 8

3. K-NEAREST NEIGHBORS (KNN) WITH 13


ELBOW METHOD.
4. LOGISTIC REGRESSION. 18
5. PRINCIPAL COMPONENT ANALYSIS 24
(PCA).
6. K MEANS CLUSTERING. 28

7. ARTIFICIAL NEURAL NETWORK (ANN) 32


WITH ONE HIDDEN LAYER.

8. CONVOLUTION NEURAL NETWORK 34


(CNN) FOR OBJECT DETECTION.

2
EXPERIMENT 1 :- Implement Linear Regression
Model with Regularization & Cross Validation.
In This Case, We Have Taken 2 Combined Datasets (2023 - 24) :-
1. We take student performance dataset to know about whether they
will pass the exam or fail
2. Student unemployment rate dataset
Data Has Been Officially Collected From Kaggle.
Now, We Will Plot This Data With 1st Dataset On X Axis And 2nd Dataset
On Y Axis.

# Experiment 1: Linear Regression with regularisation and cross-validation.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score
from google.colab import files

# Load the student-performance data and one-hot encode the categorical column.
df = pd.read_csv('StudentsPerformance.csv')
df = pd.get_dummies(df, columns=['test preparation course'])

X = df[['reading score', 'writing score']]
y = df['math score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline ordinary least squares fit.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

plt.figure(figsize=(7, 5))
plt.scatter(y_test, y_pred, c=X_test['writing score'], cmap='viridis', alpha=0.7)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)  # ideal y = x line
plt.title('Actual vs Predicted Math Scores\n(Linear Regression)')
plt.xlabel('Actual Scores')
plt.ylabel('Predicted Scores')
plt.colorbar(label='Writing Score')
plt.grid(True)
plt.show()

# Sweep the regularisation strength for Ridge and Lasso with 5-fold CV.
alphas = np.logspace(-3, 3, 50)
models = {'Ridge': Ridge, 'Lasso': Lasso}
kfold = KFold(n_splits=5, shuffle=True)  # fixed: PDF extraction had 'shu le'
plt.figure(figsize=(10, 5))
for name, model in models.items():
    cv_scores = []
    for alpha in alphas:
        m = model(alpha=alpha)
        scores = []
        for train_idx, val_idx in kfold.split(X_train):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
            m.fit(X_tr, y_tr)
            scores.append(r2_score(y_val, m.predict(X_val)))
        cv_scores.append(np.mean(scores))
    plt.semilogx(alphas, cv_scores, label=f'{name} CV R²')
    # Mark the alpha used in the final comparison (10 for Ridge, 0.1 for Lasso).
    plt.scatter([10 if name == 'Ridge' else 0.1][0], np.max(cv_scores),
                color='red', zorder=5)
plt.title('Regularization Parameter Tuning')
plt.xlabel('Alpha (log scale)')
plt.ylabel('Cross-Validated R² Score')
plt.legend()
plt.grid(True)
plt.show()

# Final comparison (closing brace of this dict restored after extraction).
models = {
    'Linear Regression': LinearRegression(),
    'Ridge (α=10)': Ridge(alpha=10),
    'Lasso (α=0.1)': Lasso(alpha=0.1),
}
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cv_scores = []
    for train_idx, val_idx in kfold.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        model.fit(X_tr, y_tr)
        cv_scores.append(r2_score(y_val, model.predict(X_val)))
    results.append({
        'Model': name,
        'CV Mean R²': np.mean(cv_scores),
        'CV Std': np.std(cv_scores),
        'Test R²': r2_score(y_test, y_pred),
    })
print("\nFinal Model Performance:")
print(pd.DataFrame(results).round(3).to_string(index=False))

5
Implement Elastic Net Regularization, Cross Validation And Then
Comparing Adjusted R2 Values :-
# Experiment 1 (part 2): ElasticNet regularisation and adjusted R² comparison.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import r2_score
from google.colab import files

print("Please Upload Excel file:")
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

data = pd.ExcelFile(file_name)
sheet_name = "UnemploymentRate(23-24)"
df = data.parse(sheet_name)

# Min-max normalise the two numeric columns used for the regression.
data_for_analysis = df[['State/UT-Wise', 'Open', 'Women']]
scaler = MinMaxScaler()
normalized = scaler.fit_transform(data_for_analysis[['Open', 'Women']])
ndf = pd.DataFrame(normalized, columns=['Open', 'Women'])
ndf['State/UT-Wise'] = data_for_analysis['State/UT-Wise']

X = ndf[['Open']].values
y = ndf['Women'].values
n, p = X.shape

# Regularisation (closing brace of this dict restored after extraction).
models = {
    'OLS': LinearRegression(),
    'Ridge': RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0], cv=5),
    'Lasso': LassoCV(alphas=[0.01, 0.1, 1.0], cv=5, max_iter=10000),
    'ElasticNet': ElasticNetCV(alphas=[0.01, 0.1, 1.0], l1_ratio=[0.2, 0.5, 0.8], cv=5, max_iter=10000),
}

# Cross Validation Using Simple K Fold Method.
def adjusted_r2(r2, n, p):
    """Adjusted R², penalising for p predictors at sample size n."""
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

results = []
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)  # fixed: 'shu le'
for name, model in models.items():
    cv_r2 = cross_val_score(model, X, y, scoring='r2', cv = kf)
    model.fit(X, y)
    y_pred = model.predict(X)
    full_r2 = r2_score(y, y_pred)
    full_adj_r2 = adjusted_r2(full_r2, n, p)
    cv_adj = adjusted_r2(np.mean(cv_r2), n, p)
    results.append({
        'Model': name,
        'Mean CV R2': np.mean(cv_r2),
        'Full R2': full_r2,
        'Full Adj R2': full_adj_r2,
        'Cross Validated Adj R2': cv_adj
    })

res_df = pd.DataFrame(results)
print(res_df)

res_df.set_index('Model')[['Full Adj R2', 'Cross Validated Adj R2']].plot(kind = 'bar', figsize=(8, 5))
plt.ylabel('Adjusted R2')
# fixed: title had ligature-garbled 'Di erent'
plt.title('Adjusted R2: Full v/s CV for Different Models')
plt.grid(True)
plt.show()

8
EXPERIMENT 2 :- Develop Naïve Bayes
Classification From Scratch With Laplace
Smoothing.
In this case, we have used the standard Adult dataset, which is widely used
for training and testing ML models.
import numpy as np

import pandas as pd

class NaiveBayesClassifier:
    """Categorical Naive Bayes with Laplace (additive) smoothing.

    Expects X as a pandas DataFrame of categorical features and y as an
    array-like of class labels.  Closing braces of the dict literals in
    ``fit`` were lost in PDF extraction and are restored here.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha  # Laplace smoothing parameter

    def fit(self, X, y):
        """Estimate class priors and per-feature conditional probability tables."""
        self.classes = np.unique(y)
        self.priors = {c: np.mean(y == c) for c in self.classes}
        self.conditional_probs = {}
        for c in self.classes:
            X_c = X[y == c]
            # P(value | class) with +alpha smoothing over the feature's vocabulary.
            self.conditional_probs[c] = {
                col: (X_c[col].value_counts() + self.alpha)
                     / (len(X_c) + self.alpha * len(X[col].unique()))
                for col in X.columns
            }

    def predict(self, X):
        """Return the MAP class per row, accumulating log-probabilities."""
        predictions = []
        for _, x in X.iterrows():
            posteriors = {}
            for c in self.classes:
                prior = np.log(self.priors[c])
                # Values unseen at fit time fall back to a smoothed pseudo-probability.
                conditional = sum(
                    np.log(self.conditional_probs[c][col].get(
                        x[col],
                        self.alpha / (len(X) + self.alpha * len(X[col].unique()))))
                    for col in X.columns)
                posteriors[c] = prior + conditional
            predictions.append(max(posteriors, key=posteriors.get))
        return np.array(predictions)

# NOTE(review): orphaned fragment — after PDF extraction this loop sits outside
# any class or function and references attributes (self.priorClass,
# self.likelihoods, self.GaussFunction) that are defined nowhere in this
# document.  It looks like the predict body of a *Gaussian* Naive Bayes
# variant; presumably it belongs to another class — confirm against the
# original notebook before using.  As written it cannot execute (a bare
# `return` at module level is a SyntaxError).
for x in X:
    posteriors = {}
    for c in self.classes:
        prior = np.log(self.priorClass[c])
        likelihood = 0
        for featureIndex, value in enumerate(x):
            mean, var = self.likelihoods[c][featureIndex]
            likelihood += np.log(self.GaussFunction(value, mean, var) + self.alpha)
        posteriors[c] = prior + likelihood
    predictions.append(max(posteriors, key = posteriors.get))
return np.array(predictions)

# Exploratory class/conditional statistics on the Adult census dataset.
import pandas as pd

column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain",
    "capital-loss", "hours-per-week", "native-country", "income",
]  # closing bracket restored (lost in PDF extraction)

# ' ?' is the dataset's missing-value marker (note the leading space).
df = pd.read_csv('adult.data', header=None, names=column_names, na_values=' ?')
df.dropna(inplace=True)

prior_probabilities = df['income'].value_counts(normalize=True)
print("Prior Probabilities:\n", prior_probabilities)

conditional_probabilities = df.groupby('income')['education'].value_counts(normalize=True).unstack()
print("Conditional Probabilities of Education given Income:\n", conditional_probabilities)

# train.py — Categorical Naive Bayes on the Adult dataset plus a Streamlit UI.
import pandas as pd
import numpy as np
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

df = pd.read_csv('adult.data', header=None, na_values=' ?')
df.columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain",
    "capital-loss", "hours-per-week", "native-country", "income",
]  # closing bracket restored (lost in PDF extraction)
df.dropna(inplace=True)

# Label-encode every categorical column, keeping the encoders for the UI.
categorical_cols = df.select_dtypes(include='object').columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop('income', axis=1)
y = df['income']
feature_names = X.columns.tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Categorical Naive Bayes
model = CategoricalNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Streamlit front end: one sidebar widget per feature.
st.title("Income Prediction App (Adult Dataset)")
st.sidebar.header("Enter input values")
input_data = []
for feature in feature_names:
    if feature in categorical_cols:
        options = label_encoders[feature].classes_
        value = st.sidebar.selectbox(f"{feature}", options)
        encoded_value = label_encoders[feature].transform([value])[0]
    else:
        value = st.sidebar.number_input(f"{feature}", min_value=0)
        encoded_value = value
    input_data.append(encoded_value)

input_df = pd.DataFrame([input_data], columns=feature_names)
prediction = model.predict(input_df)[0]
prediction_proba = model.predict_proba(input_df)[0]
st.subheader('Prediction')
st.write("Predicted Income Class:", label_encoders['income'].inverse_transform([prediction])[0])
st.write("Probability:", prediction_proba)

13
EXPERIMENT 3 :- Implement KNN with Elbow
Method to determine optimal K. Also, try Euclidean
and Hamming Distances.
We used a standard dataset, the diabetes dataset.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             classification_report, f1_score)
from sklearn.model_selection import cross_val_score

# Load the dataset
df = pd.read_csv('diabetes datset.csv')

# Handle missing values (zeros are physiologically impossible for these columns).
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_cols] = df[zero_cols].replace(0, np.nan)
for col in zero_cols:
    df[col] = df[col].fillna(df[col].median())

# Split into features and target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Standardize the features (important for KNN)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

Graphs of the Hamming and Euclidean distance elbow curves

# Function to plot elbow curve
def plot_elbow_curve(X_train, y_train, max_k=30, distance_metric='euclidean'):
    """Plot the 5-fold-CV error rate for k = 1..max_k and return the error list."""
    ks = list(range(1, max_k + 1))
    error_rates = []
    for k in ks:
        clf = KNeighborsClassifier(n_neighbors=k, metric=distance_metric)
        fold_acc = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
        error_rates.append(1 - fold_acc.mean())
    plt.figure(figsize=(10, 6))
    plt.plot(ks, error_rates, marker='o')
    plt.title(f'Error Rate vs. K Value ({distance_metric.capitalize()} Distance)')
    plt.xlabel('K')
    plt.ylabel('Error Rate')
    plt.grid()
    plt.show()
    return error_rates

# Plot elbow curve for Euclidean distance
euclidean_errors = plot_elbow_curve(X_train, y_train, distance_metric='euclidean')

# Plot elbow curve for Hamming distance
hamming_errors = plot_elbow_curve(X_train, y_train, distance_metric='hamming')

# Optimal k value for both distances: smallest CV error;
# +1 because the scan in plot_elbow_curve starts at k=1.
optimal_k_euclidean = np.argmin(euclidean_errors) + 1
print(f"Optimal K for Euclidean distance: {optimal_k_euclidean}")

# Find optimal K for Hamming distance
optimal_k_hamming = np.argmin(hamming_errors) + 1
print(f"Optimal K for Hamming distance: {optimal_k_hamming}")

def evaluate_knn(X_train, y_train, X_test, y_test, k, distance_metric):
    """Fit a KNN classifier, print its metrics, return (model, accuracy, f1)."""
    model = KNeighborsClassifier(n_neighbors=k, metric=distance_metric)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    # Collect all metrics up front.
    accuracy = accuracy_score(y_test, predicted)
    f1 = f1_score(y_test, predicted)
    cm = confusion_matrix(y_test, predicted)
    report = classification_report(y_test, predicted)

    print(f"\nKNN Performance (k={k}, {distance_metric} distance):")
    print(f"Accuracy: {accuracy:.3f}")
    print(f"F1 Score: {f1:.3f}")
    print("\nConfusion Matrix:")
    print(cm)
    print("\nClassification Report:")
    print(report)
    return model, accuracy, f1

# Evaluate Euclidean distance model
knn_euclidean, acc_euclidean, f1_euclidean = evaluate_knn(
    X_train, y_train, X_test, y_test,
    k=optimal_k_euclidean,
    distance_metric='euclidean',
)  # closing parenthesis restored (lost in PDF extraction)

# Evaluate Hamming distance model
knn_hamming, acc_hamming, f1_hamming = evaluate_knn(
    X_train, y_train, X_test, y_test,
    k=optimal_k_hamming,
    distance_metric='hamming',
)  # truncated call completed: the metric argument and ')' were cut off
Performance comparison

18
EXPERIMENT 4 :- Implement Logistic Regression
and plot Impact of Variation in Threshold.
In this model, we have used the diabetes dataset, which gives good results
with logistic regression.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # added: plt is used by the threshold plot below
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

df = pd.read_csv('diabetes datset.csv')

# Zeros in these columns are physiologically impossible -> treat as missing.
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_cols] = df[zero_cols].replace(0, np.nan)
for col in zero_cols:
    df[col] = df[col].fillna(df[col].median())

X = df.drop('Outcome', axis=1)
y = df['Outcome']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Prepend a bias column of ones for the custom gradient-descent model.
X_scaled = np.hstack((np.ones((X_scaled.shape[0], 1)), X_scaled))
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y.values, test_size=0.3, random_state=42)

def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), elementwise for array input."""
    return 1 / (1 + np.exp(-z))

def train_logistic(X, y, lr=0.01, epochs=1000):
    """Train logistic regression by batch gradient descent; return the weights.

    X is expected to already carry a bias column; prints the log-loss every
    100 epochs as a simple convergence trace.
    """
    n_samples = len(y)
    weights = np.zeros(X.shape[1])
    for epoch in range(epochs):
        predictions = sigmoid(X @ weights)          # forward pass
        residual = predictions - y
        weights = weights - lr * (X.T @ residual) / n_samples
        if epoch % 100 == 0:
            # Cross-entropy with a small epsilon to avoid log(0).
            loss = -np.mean(y * np.log(predictions + 1e-10)
                            + (1 - y) * np.log(1 - predictions + 1e-10))
            print(f"Epoch {epoch}, Loss: {loss:.4f}")
    return weights

weights = train_logistic(X_train, y_train, lr=0.1, epochs=1000)

20
def predict(X, weights, threshold=0.58):
    """Return (hard labels, probabilities) for the given decision threshold."""
    probs = sigmoid(X @ weights)
    labels = np.where(probs >= threshold, 1, 0)
    return labels, probs

y_pred_custom, y_prob = predict(X_test, weights)
acc = accuracy_score(y_test, y_pred_custom)
print(f"\nCustom Logistic Regression Accuracy: {acc:.4f}")

# Show the first 30 test predictions alongside the ground truth.
results_df_30 = pd.DataFrame({
    'Predicted Probability': y_prob[:30],
    'Predicted Class (threshold = 0.58)': y_pred_custom[:30],
    'Actual Class': y_test[:30]
})
print("\nSample Predictions:\n")
print(results_df_30)

# NOTE(review): results_df and thresholds are not defined anywhere in this
# extract — the threshold-sweep cell that builds them appears to be missing
# from the PDF; matplotlib.pyplot is also only imported further below.
# Confirm against the original notebook before running this section.
plt.plot(results_df['threshold'], results_df['accuracy'], 'b-', marker='o', label='Accuracy')
plt.plot(results_df['threshold'], results_df['precision'], 'g-', marker='s', label='Precision')
plt.plot(results_df['threshold'], results_df['recall'], 'r-', marker='^', label='Recall')
plt.plot(results_df['threshold'], results_df['f1_score'], 'y-', marker='*', label='F1 Score')
plt.title('Impact Of Threshold Variation on Model Performance', fontsize=16)
plt.xlabel('Threshold', fontsize=14)
plt.ylabel('Score', fontsize=14)
plt.xticks(thresholds)
plt.legend(loc='best', fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()

21
import matplotlib.pyplot as plt
import seaborn as sns

# Per-feature class-conditional density plots for the diabetes data.
plt.figure(figsize=(12, 8))
for i, col in enumerate(X.columns):
    plt.subplot(3, 3, i + 1)
    # fill= replaces the deprecated shade= keyword (removed in seaborn >= 0.14).
    sns.kdeplot(df[col][df['Outcome'] == 0], label='No Diabetes', fill=True)
    sns.kdeplot(df[col][df['Outcome'] == 1], label='Diabetes', fill=True)
    plt.title(col)
plt.tight_layout()
plt.legend()
plt.show()

23
EXPERIMENT 5 :- Implement Principal
Component Analysis with Normalised Attributes
and Dimension Reduction.
We can create a synthetic dataset and then apply PCA on it. Here, we
used a generator for dataset with 5 features thus, leading to
dimensionality as 5.
# Apply PCA
# NOTE(review): PCA, X_train_scaled and X_test_scaled come from an earlier
# cell missing from this extract — confirm against the original notebook.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)

# Plot explained variance ratio
plt.figure(figsize=(10, 6))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by Components')
plt.grid()
plt.show()

# Determine optimal number of components (e.g., capturing 95% variance)
# (line-wrapped statements below rejoined after PDF extraction)
n_components = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1
print(f"Number of components explaining 95% variance: {n_components}")

# Apply PCA with optimal components
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

24
# Logistic Regression with PCA
log_reg_pca = LogisticRegression(max_iter=1000)
log_reg_pca.fit(X_train_pca, y_train)
y_pred_log_pca = log_reg_pca.predict(X_test_pca)
accuracy_log_pca = accuracy_score(y_test, y_pred_log_pca)
print(f"Logistic Regression with PCA Accuracy: {accuracy_log_pca:.4f}")

# KNN with PCA
knn_pca = KNeighborsClassifier(n_neighbors=5)
knn_pca.fit(X_train_pca, y_train)
y_pred_knn_pca = knn_pca.predict(X_test_pca)
accuracy_knn_pca = accuracy_score(y_test, y_pred_knn_pca)
print(f"KNN with PCA Accuracy: {accuracy_knn_pca:.4f}")

# Compare results (split f-strings rejoined after PDF extraction).
# NOTE(review): accuracy_log / accuracy_knn are baseline scores from an
# earlier cell missing from this extract — confirm they are defined.
print("\nModel Comparison:")
print(f"Logistic Regression: {accuracy_log:.4f} -> {accuracy_log_pca:.4f} with PCA")
print(f"KNN: {accuracy_knn:.4f} -> {accuracy_knn_pca:.4f} with PCA")

from sklearn.preprocessing import LabelEncoder, StandardScaler  # restored missing 'from'
import numpy as np

# Drop non-numeric / text-heavy or identifier columns that are not useful for PCA
drop_cols = ['Youtuber', 'Title', 'Abbreviation', 'channel_type', 'Country',
             'created_month', 'created_date', 'category']
df_cleaned = df.drop(columns=drop_cols)

# Handle missing values
for col in df_cleaned.select_dtypes(include=['float64', 'int64']).columns:
    df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].median())

# Fill then label-encode the remaining object columns.  The encoder lines were
# de-indented by PDF extraction; they belong inside this loop so that every
# text column is converted before standardisation.
for col in df_cleaned.select_dtypes(include=['object']).columns:
    df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mode()[0])
    le = LabelEncoder()
    df_cleaned[col] = le.fit_transform(df_cleaned[col])

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_cleaned.drop(columns=['rank']))

# Covariance matrix
cov_matrix = np.cov(X_scaled.T)

# Eigen decomposition
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

# Sort eigenvalues and corresponding eigenvectors (descending variance)
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues_sorted = eigenvalues[sorted_indices]
eigenvectors_sorted = eigenvectors[:, sorted_indices]

# Select top 2 components for visualization
top_components = eigenvectors_sorted[:, :2]
projected_data = X_scaled @ top_components

df_pca = pd.DataFrame(projected_data, columns=['PC1', 'PC2'])
df_pca['rank'] = df_cleaned['rank']
df_pca.head()  # stray trailing 'from' token removed (extraction artifact)

For 5 principal component

27
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Scree plot of the first five components.
# NOTE(review): explained_variance_ratios and pca_df_5 are produced by a cell
# missing from this extract — confirm they are defined before running.
plt.figure(figsize=(8, 5))
plt.plot(range(1, 6), explained_variance_ratios[:5], marker='o', linestyle='--', color='b')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Eigenvalue (Variance)')
plt.xticks(range(1, 6))
plt.grid(True)
plt.tight_layout()
plt.show()

# Scatter Plot - PC1 vs PC2
plt.figure(figsize=(8, 6))
# Use 'rank' instead of 'Rank' for the hue, as 'rank' is the column name
sns.scatterplot(x=pca_df_5['PC1'], y=pca_df_5['PC2'], hue=pca_df_5['rank'], palette='coolwarm',
                legend=False)
plt.title('PC1 vs PC2 Scatter Plot')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True)
plt.tight_layout()
plt.show()

# Heatmap - Eigenvectors (Top 5 PCs)
plt.figure(figsize=(10, 6))
# Make sure df_cleaned is defined (from cell 13)
# Get the column names for the heatmap (excluding 'rank')
feature_names = df_cleaned.drop(columns=['rank']).columns
# NOTE(review): eigenvectors_5 comes from an earlier cell not shown in this
# extract; each heatmap row is presumably one principal component's loadings
# over the original features — confirm its shape before running.
sns.heatmap(eigenvectors_5[:5], cmap='coolwarm', annot=True,
            xticklabels=feature_names, yticklabels=[f'PC{i+1}' for i in range(5)])
plt.title('Heatmap of Eigenvectors (Top 5 PCs)')
plt.xlabel('Original Features')
plt.ylabel('Principal Components')
plt.tight_layout()
plt.show()

29
30
EXPERIMENT 6 :- Implement K Means Clustering
with Optimal K Determination.
Here, we used the dataset of Global YouTube Statistics for pca.csv.

# Experiment 6: customer segmentation with K-means and the Elbow Method.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from google.colab import files

print("Please Upload The Dataset: ")
uploaded = files.upload()
file_name = list(uploaded.keys())[0]
data = pd.read_csv(file_name, encoding="ISO-8859-1")

# Fail fast if the upload does not match the expected online-retail schema.
expected_columns = ['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'UnitPrice', 'CustomerID',
                    'Country']
if not set(expected_columns).issubset(data.columns):
    raise ValueError(f"Error !! Expected Columns Not Found: {expected_columns}")

# Data Cleaning.
print("Initial Data Size:", data.shape)
data = data.dropna(subset=["CustomerID"]).copy()
print("Drop Missing Data (CustomerID): ", data.shape)
data["TotalAmount"] = data["Quantity"] * data["UnitPrice"]

# Aggregation: one row per customer.
customer_data = data.groupby("CustomerID").agg({
    "InvoiceNo": "nunique",
    "Quantity": "sum",
    "TotalAmount": "sum",
    "Country": "first"
}).reset_index().rename(columns={"InvoiceNo": "NumInvoices"})
print("Data Aggregation: ", customer_data.shape)

customer_encoded = pd.get_dummies(customer_data, columns=["Country"], drop_first = True)
feature_columns = [col for col in customer_encoded.columns if col != "CustomerID"]
X = customer_encoded[feature_columns].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow Method: inertia vs number of clusters.
inertia = []
K_range = range(1, 11)
for k in K_range:
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(X_scaled)
    inertia.append(km.inertia_)

plt.figure(figsize=(8,5))
plt.plot(K_range, inertia, marker='o')
plt.xlabel('No. of Clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.xticks(K_range)
plt.show()

32
# Silhouette Analysis.
sil_scores = []
for k in range(2, 11):  # silhouette is undefined for k=1, so start at 2
    km = KMeans(n_clusters = k, random_state = 42)
    labels = km.fit_predict(X_scaled)
    sil_scores.append(silhouette_score(X_scaled, labels))

plt.figure(figsize=(8,5))
plt.plot(range(2,11), sil_scores, marker = 'o')
plt.xlabel('No. Of Clusters (K)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis')
plt.xticks(range(2,11))
plt.show()

# Optimal K Detection Using Silhouette Analysis.
optimal_k = 2 + np.argmax(sil_scores)  # +2 because the scan above starts at k=2
print("Optimal K By Silhouette Score:", optimal_k)

# Final clustering at the silhouette-optimal K.
final_km = KMeans(n_clusters=optimal_k, random_state=42)
labels = final_km.fit_predict(X_scaled)

33
EXPERIMENT 7 :- Implementation of ANN with
one hidden layer.
# Experiment 7: ANN with one hidden layer on the Iris dataset.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

iris = load_iris()
X, y = iris.data, iris.target
y_cat = to_categorical(y)  # one-hot targets for the softmax output
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size = 0.2, random_state = 42)

# Feature Standardisation.
scaler_tab = StandardScaler()
X_train = scaler_tab.fit_transform(X_train)
X_test = scaler_tab.transform(X_test)

# Build ANN: one hidden layer with 16 units, softmax output layer.
ann = Sequential([
    Dense(16, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(y_cat.shape[1], activation = 'softmax')
])
ann.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
history = ann.fit(X_train, y_train, validation_split = 0.2, epochs = 50, batch_size = 8)

loss, acc = ann.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.3f}, Test Loss: {loss:.3f}")

plt.figure(figsize = (8, 4.2))
plt.plot(history.history['accuracy'], label = 'Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('ANN Training Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.show()

After regularisation

36
EXPERIMENT 8 :- Develop CNN Architecture for
Object Detection in an Image and apply various
Regularization Techniques.
# AVIF Support.

!pip install pillow-avif-plugin imageio[ mpeg]

import numpy as np

import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow.keras.applications import VGG16

from tensorflow.keras.models import Model

from tensorflow.keras.layers import BatchNormalization, Dropout, Conv2D

from tensorflow.keras.regularizers import l2

37
import imageio.v3 as iio

from google.colab import files

from PIL import Image

Image.init()

print("Upload The Image:")

scene_up = files.upload()

scene_path = list(scene_up.keys())[0]

print("Upload The Object:")

tmpl_up = files.upload()

tmpl_path = list(tmpl_up.keys())[0]

# Pre-Process Image Into An Array.
def load_and_preprocess(img_path, target_size=None):
    """Read an image, drop any alpha channel, optionally resize, scale to [0, 1]."""
    pixels = iio.imread(img_path)
    if pixels.ndim == 3 and pixels.shape[2] == 4:
        pixels = pixels[..., :3]  # keep RGB, discard alpha
    if target_size is not None:
        pixels = tf.image.resize(pixels, target_size).numpy().astype(np.uint8)
    return pixels.astype('float32') / 255.0

scene = load_and_preprocess(scene_path)
tmpl = load_and_preprocess(tmpl_path)

# Regularisation / Batch Normalisation on top of a VGG16 feature stack.
base = VGG16(weights = 'imagenet', include_top = False, input_shape = (None, None, 3))
for layer in base.layers:
    if isinstance(layer, Conv2D):
        # NOTE(review): assigning kernel_regularizer after the model is built has
        # no effect unless the model is re-serialized/recompiled — confirm intent.
        layer.kernel_regularizer = l2(1e-4)

x = base.output
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
feature_extractor = Model(inputs=base.input, outputs=x)

# Feature Extraction.
def extract_feat(img):
    """Run one image through the extractor; return its (H, W, C) feature map."""
    inp = np.expand_dims(img, axis=0)
    feat = feature_extractor.predict(inp)
    return feat[0]

feat_scene = extract_feat(scene)
h_s, w_s, _ = feat_scene.shape
scale = scene.shape[0] / h_s  # pixels per feature-map cell (vertical)

# Resize the template so its feature map aligns with the scene's feature grid.
h_t_orig, w_t_orig, _ = tmpl.shape
target_size = (int(h_t_orig/scale), int(w_t_orig/scale))
tmpl_scaled = load_and_preprocess(tmpl_path, target_size = target_size)
feat_tmpl = extract_feat(tmpl_scaled)

def match_heatmap(feat_scene, feat_tmpl):
    """Slide the template feature block over the scene feature map and score
    each position by cosine similarity of the L2-normalised flattened vectors."""
    sh, sw, _ = feat_scene.shape
    th, tw, _ = feat_tmpl.shape
    out = np.zeros((sh - th + 1, sw - tw + 1))

    template = feat_tmpl.flatten()
    template /= np.linalg.norm(template)

    for row in range(out.shape[0]):
        for col in range(out.shape[1]):
            window = feat_scene[row:row + th, col:col + tw, :].flatten()
            window /= np.linalg.norm(window)
            out[row, col] = np.dot(template, window)
    return out

heatmap = match_heatmap(feat_scene, feat_tmpl)

# Best Match (Heatmap): map the argmax position back to pixel coordinates.
y, x = np.unravel_index(np.argmax(heatmap), heatmap.shape)
h_scaled, w_scaled = target_size
y1, x1 = int(y * scale), int(x * scale)
y2, x2 = int((y + h_scaled) * scale), int((x + w_scaled) * scale)

plt.figure(figsize=(8, 8))
plt.imshow(scene)
plt.gca().add_patch(plt.Rectangle((x1, y1), x2-x1, y2-y1, edgecolor = 'r', linewidth = 2, fill = False))
plt.title('Detected Object')
plt.axis('off')  # fixed: extraction mangled 'off' into 'o '
plt.show()

# Training-curve plot.
# NOTE(review): 'history' is not produced by this script — it presumably comes
# from a training cell missing from this extract; confirm before running.
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Training vs Validation Accuracy')
plt.show()

from tensorflow.keras.preprocessing import image

# Predict the class of the first test image.
# NOTE(review): test_generator and model are defined in a cell missing from
# this extract — confirm before running.
img_path = test_generator.filepaths[0]
img = image.load_img(img_path, target_size=(150, 150))
plt.imshow(img)
plt.axis('off')  # fixed: extraction mangled 'off' into 'o '

img_array = image.img_to_array(img) / 255.0
img_array = np.expand_dims(img_array, axis=0)
prediction = model.predict(img_array)
predicted_class = list(test_generator.class_indices.keys())[np.argmax(prediction)]
print(f"Predicted Class: {predicted_class}")

41
42

You might also like