EXPERIMENT 1 :- Implement Linear Regression
Model with Regularization & Cross Validation.
In this case, we have taken two combined datasets (2023-24):
1. A student performance dataset, used to predict whether a student will pass or fail the exam.
2. A student unemployment rate dataset.
The data has been collected from Kaggle.
We will plot this data with the first dataset on the X axis and the second dataset on the Y axis.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

df = pd.read_csv('StudentsPerformance.csv')
# Predict the math score from the other score columns (feature choice assumed here).
X = df[['reading score', 'writing score']]
y = df['math score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Actual vs predicted scatter, coloured by writing score.
plt.figure(figsize=(7,5))
plt.scatter(y_test, y_pred, c=X_test['writing score'], cmap='viridis')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Actual Scores')
plt.ylabel('Predicted Scores')
plt.colorbar(label='Writing Score')
plt.grid(True)
plt.show()
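Alongside the plot, the fit can be summarised numerically; a minimal sketch using scikit-learn's metrics on the same test split:

from sklearn.metrics import mean_squared_error, r2_score

# Test-set error and explained variance for the plain linear model.
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2 :", r2_score(y_test, y_pred))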
# Cross-validated R² as a function of the regularisation strength alpha (Ridge assumed here).
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

alphas = np.logspace(-3, 3, 13)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
for alpha in alphas:
    m = Ridge(alpha=alpha)
    scores = []
    for tr_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        m.fit(X_tr, y_tr)
        scores.append(r2_score(y_val, m.predict(X_val)))
    cv_scores.append(np.mean(scores))

plt.figure(figsize=(10,5))
plt.semilogx(alphas, cv_scores, marker='o')
plt.scatter(alphas[np.argmax(cv_scores)], max(cv_scores), color='red', zorder=5, label='Best alpha')
plt.xlabel('Alpha')
plt.ylabel('Cross-Validated R² Score')
plt.legend()
plt.grid(True)
plt.show()
# Compare OLS, Ridge and Lasso on held-out and cross-validated R² (model set assumed from the fragments).
from sklearn.linear_model import Lasso

models = {
    'OLS': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
}
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cv_scores = []
    for tr_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        model.fit(X_tr, y_tr)
        cv_scores.append(r2_score(y_val, model.predict(X_val)))
    results.append({
        'Model': name,
        'Test R2': r2_score(y_test, y_pred),
        'CV R2': np.mean(cv_scores),
    })
print(pd.DataFrame(results).round(3).to_string(index=False))
Implement Elastic Net Regularization with Cross-Validation, then Compare Adjusted R² Values :-
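For reference, the Elastic Net objective (in scikit-learn's parameterisation, where alpha is the overall strength and l1_ratio is ρ) and the adjusted R² used below are:

$$\min_{w}\ \frac{1}{2n}\lVert y - Xw\rVert_2^2 + \alpha\Big(\rho\lVert w\rVert_1 + \frac{1-\rho}{2}\lVert w\rVert_2^2\Big), \qquad R^2_{\text{adj}} = 1 - (1 - R^2)\,\frac{n-1}{n-p-1}$$

where n is the number of samples and p the number of predictors.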
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import files
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

uploaded = files.upload()
file_name = list(uploaded.keys())[0]

data = pd.ExcelFile(file_name)
sheet_name = "UnemploymentRate(23-24)"
df = data.parse(sheet_name)

# Min-max scale the numeric columns; keep the state identifier column as-is.
data_for_analysis = df.copy()
numeric_cols = data_for_analysis.select_dtypes(include=np.number).columns
scaler = MinMaxScaler()
ndf = pd.DataFrame(scaler.fit_transform(data_for_analysis[numeric_cols]),
                   columns=numeric_cols, index=data_for_analysis.index)
ndf['State/UT-Wise'] = data_for_analysis['State/UT-Wise']

X = ndf[['Open']].values
y = ndf['Women'].values
n, p = X.shape

# Regularisation: compare OLS against Ridge, Lasso and Elastic Net (alphas assumed).
models = {
    'OLS': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.01),
    'ElasticNet': ElasticNet(alpha=0.01, l1_ratio=0.5),
}

def adjusted_r2(r2, n, p):
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

results = []
for name, model in models.items():
    model.fit(X, y)
    y_pred = model.predict(X)
    full_r2 = r2_score(y, y_pred)
    full_adj_r2 = adjusted_r2(full_r2, n, p)
    cv_r2 = cross_val_score(model, X, y, cv=5, scoring='r2')
    cv_adj = adjusted_r2(np.mean(cv_r2), n, p)
    results.append({
        'Model': name,
        'Full Adj R2': full_adj_r2,
        'Cross Validated Adj R2': cv_adj,
    })

res_df = pd.DataFrame(results)
print(res_df)

res_df.set_index('Model')[['Full Adj R2', 'Cross Validated Adj R2']].plot(kind='bar', figsize=(8, 5))
plt.ylabel('Adjusted R2')
plt.grid(True)
plt.show()
EXPERIMENT 2 :- Develop Naïve Bayes
Classification From Scratch With Laplace
Smoothing.
In this case, we have taken the Adult dataset, a standard dataset widely used for training and testing ML models.
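With Laplace (add-one) smoothing, the class-conditional probability of a categorical feature value v for feature i given class c is estimated as

$$P(x_i = v \mid c) = \frac{\mathrm{count}(x_i = v,\ y = c) + 1}{N_c + K_i}$$

where N_c is the number of training rows of class c and K_i is the number of distinct values of feature i. This keeps unseen feature values from collapsing the whole product of probabilities to zero; the from-scratch class below follows this add-one estimate.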
import numpy as np
import pandas as pd

class NaiveBayesClassifier:
    """Categorical Naive Bayes with Laplace (add-one) smoothing."""

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.n_values = {col: X[col].nunique() for col in X.columns}
        self.priors = {}
        self.class_counts = {}
        self.conditional_probs = {}
        for c in self.classes:
            X_c = X[y == c]
            self.class_counts[c] = len(X_c)
            self.priors[c] = len(X_c) / len(X)
            # Laplace-smoothed P(value | class) for every feature column.
            self.conditional_probs[c] = {
                col: (X_c[col].value_counts() + self.alpha)
                     / (len(X_c) + self.alpha * self.n_values[col])
                for col in X.columns
            }
        return self

    def predict(self, X):
        predictions = []
        for _, x in X.iterrows():
            posteriors = {}
            for c in self.classes:
                prior = np.log(self.priors[c])
                conditional = 0.0
                for col in X.columns:
                    probs = self.conditional_probs[c][col]
                    # Unseen values fall back to the smoothed "zero-count" probability.
                    p = probs.get(x[col], self.alpha / (self.class_counts[c] + self.alpha * self.n_values[col]))
                    conditional += np.log(p)
                posteriors[c] = prior + conditional
            predictions.append(max(posteriors, key=posteriors.get))
        return np.array(predictions)
# Load the Adult dataset and inspect the empirical prior / conditional probabilities.
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]
df = pd.read_csv('adult.data', names=column_names, na_values='?', skipinitialspace=True)  # file name assumed
df.dropna(inplace=True)

prior_probabilities = df['income'].value_counts(normalize=True)
conditional_probabilities = df.groupby('income')['education'].value_counts(normalize=True).unstack()
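A minimal usage sketch of the from-scratch classifier on the same data (the column subset and split here are illustrative, not taken from the source):

from sklearn.model_selection import train_test_split

# Hypothetical choice of categorical columns for the from-scratch model.
cat_cols = ['workclass', 'education', 'marital-status', 'occupation', 'sex']
X_train, X_test, y_train, y_test = train_test_split(
    df[cat_cols], df['income'], test_size=0.2, random_state=42)

nb = NaiveBayesClassifier(alpha=1.0)
nb.fit(X_train, y_train)
accuracy = (nb.predict(X_test) == y_test.values).mean()
print(f"From-scratch Naive Bayes accuracy: {accuracy:.3f}")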
# train.py
import pandas as pd
import numpy as np
import streamlit as st
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score

# Load and clean the Adult dataset (file name assumed).
df = pd.read_csv('adult.data', header=None, na_values='?', skipinitialspace=True)
df.columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]
df.dropna(inplace=True)
# fnlwgt is a sampling weight with thousands of unique values; drop it before CategoricalNB.
df = df.drop(columns=['fnlwgt'])

# Label-encode the text columns and keep each encoder for the input widgets.
categorical_cols = df.select_dtypes(include='object').columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop('income', axis=1)
y = df['income']
feature_names = X.columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = CategoricalNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

st.title('Income Prediction with Naive Bayes')
st.write("Test accuracy:", accuracy_score(y_test, y_pred))

# Collect one input value per feature.
input_data = []
for feature in feature_names:
    if feature in categorical_cols:
        options = label_encoders[feature].classes_
        value = st.selectbox(feature, options)
        encoded_value = label_encoders[feature].transform([value])[0]
    else:
        value = st.number_input(feature, value=0)
        encoded_value = value
    input_data.append(encoded_value)

input_df = pd.DataFrame([input_data], columns=feature_names)
prediction = model.predict(input_df)[0]
prediction_proba = model.predict_proba(input_df)[0]

st.subheader('Prediction')
st.write("Predicted income class:", label_encoders['income'].inverse_transform([prediction])[0])
st.write("Probability:", prediction_proba)
EXPERIMENT 3 :- Implement KNN with Elbow
Method to determine optimal K. Also, try Euclidean
and Hamming Distances.
We took a standard dataset, the diabetes dataset.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             classification_report, f1_score)

df = pd.read_csv('diabetes datset.csv')

# Handle missing values (replace zeros with NaN for relevant columns)
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in zero_cols:
    df[col] = df[col].replace(0, np.nan)
    df[col] = df[col].fillna(df[col].median())

X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Standardize the features (important for KNN)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

def elbow_curve(X, y, metric='euclidean', max_k=20):
    # Cross-validated error rate for K = 1..max_k (elbow method).
    error_rates = []
    for k in range(1, max_k + 1):
        knn = KNeighborsClassifier(n_neighbors=k, metric=metric)
        cv_scores = cross_val_score(knn, X, y, cv=5)
        error_rate = 1 - cv_scores.mean()
        error_rates.append(error_rate)
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, max_k + 1), error_rates, marker='o')
    plt.title(f'Elbow Method ({metric} distance)')
    plt.xlabel('K')
    plt.ylabel('Error Rate')
    plt.grid()
    plt.show()
    return error_rates

euclidean_errors = elbow_curve(X_scaled, y, metric='euclidean')
Elbow curve using the Hamming distance:
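The corresponding code is not reproduced in the source; a minimal sketch, assuming the elbow_curve helper defined above and quartile-binned features (the Hamming distance counts mismatching attribute values, so it is only meaningful on discrete features; the binning step is an editorial choice here):

# Hypothetical sketch: bin each feature into quartiles so the Hamming distance is meaningful.
X_binned = np.column_stack([
    pd.qcut(df[col], q=4, labels=False, duplicates='drop') for col in X.columns
])
hamming_errors = elbow_curve(X_binned, y, metric='hamming')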
print(f"Optimal K for Euclidean distance: {optimal_k_euclidean}")
optimal_k_hamming = np.argmin(hamming_errors) + 1
knn.fit(X_train, y_train)
# Make predictions
y_pred = knn.predict(X_test)
# Evaluate performance
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)
17
return knn, accuracy, f1
k=optimal_k_euclidean,
distance_metric='euclidean'
k=optimal_k_hamming,
Performance comparsion
EXPERIMENT 4 :- Implement Logistic Regression
and plot Impact of Variation in Threshold.
In this model, we have used the diabetes dataset, which gives good results with logistic regression.
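For reference, the training loop below performs batch gradient descent on the logistic log-loss; with sigmoid σ, learning rate η and m training samples, each update is

$$w \leftarrow w - \eta\,\frac{1}{m}\,X^{\top}\big(\sigma(Xw) - y\big)$$

and the decision threshold applied to σ(Xw) is then varied to study its impact.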
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_csv('diabetes datset.csv')
# Replace physiologically impossible zeros with the column median.
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in zero_cols:
    df[col] = df[col].replace(0, np.nan).fillna(df[col].median())

X = df.drop('Outcome', axis=1)
y = df['Outcome'].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def train_logistic_regression(X, y, lr=0.1, epochs=1000):
    # Batch gradient descent on the log-loss (no intercept term, as in the source fragment).
    weights = np.zeros(X.shape[1])
    for epoch in range(epochs):
        z = np.dot(X, weights)
        predictions = sigmoid(z)
        error = predictions - y
        gradient = np.dot(X.T, error) / len(y)
        weights -= lr * gradient
        if epoch % 100 == 0:
            loss = -np.mean(y * np.log(predictions + 1e-9) + (1 - y) * np.log(1 - predictions + 1e-9))
            print(f"Epoch {epoch}: loss = {loss:.4f}")
    return weights

weights = train_logistic_regression(X_train, y_train)

def predict(X, weights, threshold=0.58):
    return (sigmoid(np.dot(X, weights)) >= threshold).astype(int)

results_df_30 = pd.DataFrame({
    'Actual': y_test[:30],
    'Predicted': predict(X_test[:30], weights),
})
print("\nSample Predictions:\n")
print(results_df_30)

# Impact of varying the decision threshold on accuracy, precision, recall and F1.
thresholds = np.round(np.arange(0.1, 1.0, 0.1), 2)
metrics = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1': []}
for t in thresholds:
    y_t = predict(X_test, weights, threshold=t)
    metrics['Accuracy'].append(accuracy_score(y_test, y_t))
    metrics['Precision'].append(precision_score(y_test, y_t, zero_division=0))
    metrics['Recall'].append(recall_score(y_test, y_t))
    metrics['F1'].append(f1_score(y_test, y_t))
plt.figure(figsize=(10, 6))
for name, values in metrics.items():
    plt.plot(thresholds, values, marker='o', label=name)
plt.xlabel('Threshold', fontsize=14)
plt.ylabel('Score', fontsize=14)
plt.xticks(thresholds)
plt.legend(loc='best', fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()
# Feature distributions per subplot (reconstructed; assumed to be class-wise histograms).
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 8))
for i, col in enumerate(X.columns):
    plt.subplot(3, 3, i + 1)
    plt.hist(df[df['Outcome'] == 0][col], bins=20, alpha=0.6, label='No Diabetes')
    plt.hist(df[df['Outcome'] == 1][col], bins=20, alpha=0.6, label='Diabetes')
    plt.title(col)
plt.tight_layout()
plt.legend()
plt.show()
EXPERIMENT 5 :- Implement Principal
Component Analysis with Normalised Attributes
and Dimension Reduction.
We can create a synthetic dataset and then apply PCA to it. Here, we used a generator to create a dataset with 5 features, so the dimensionality is 5.
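The generation step itself is not reproduced below; a minimal sketch, assuming scikit-learn's make_classification with 5 features and a standardisation step that produces the X_train_scaled / X_test_scaled arrays used in the PCA code:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hypothetical generator: 5 features, binary target.
X, y = make_classification(n_samples=1000, n_features=5, n_informative=3,
                           n_redundant=1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalise attributes before PCA so each feature contributes comparably.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)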
# Apply PCA
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

pca = PCA(n_components=2)  # number of retained components assumed
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Baseline models on the original scaled features
log_reg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
accuracy_log = accuracy_score(y_test, log_reg.predict(X_test_scaled))
knn = KNeighborsClassifier().fit(X_train_scaled, y_train)
accuracy_knn = accuracy_score(y_test, knn.predict(X_test_scaled))

# Logistic Regression with PCA
log_reg_pca = LogisticRegression(max_iter=1000)
log_reg_pca.fit(X_train_pca, y_train)
y_pred_log_pca = log_reg_pca.predict(X_test_pca)
accuracy_log_pca = accuracy_score(y_test, y_pred_log_pca)
print(f"Logistic Regression with PCA Accuracy: {accuracy_log_pca:.4f}")

# KNN with PCA
knn_pca = KNeighborsClassifier().fit(X_train_pca, y_train)
accuracy_knn_pca = accuracy_score(y_test, knn_pca.predict(X_test_pca))

# Compare results
print("\nModel Comparison:")
print(f"Logistic Regression: {accuracy_log:.4f} -> {accuracy_log_pca:.4f} with PCA")
print(f"KNN: {accuracy_knn:.4f} -> {accuracy_knn_pca:.4f} with PCA")
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# df here is a real dataset containing a 'rank' column; its load step is not shown in the source.
# Drop non-numeric / text-heavy or identifier columns that are not useful for PCA
drop_cols = []  # the exact column list is dataset-specific and not shown in the source
df_cleaned = df.drop(columns=drop_cols)

# Impute missing values: median for numeric columns, mode (then label-encode) for text columns.
for col in df_cleaned.select_dtypes(include=np.number).columns:
    df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].median())
for col in df_cleaned.select_dtypes(include='object').columns:
    df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mode()[0])
    le = LabelEncoder()
    df_cleaned[col] = le.fit_transform(df_cleaned[col])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_cleaned.drop(columns=['rank']))

# Covariance matrix
cov_matrix = np.cov(X_scaled.T)

# Eigen decomposition
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues_sorted = eigenvalues[sorted_indices]
eigenvectors_sorted = eigenvectors[:, sorted_indices]

# Project onto the two leading principal components; keep 'rank' for reference.
df_pca = pd.DataFrame(X_scaled @ eigenvectors_sorted[:, :2], columns=['PC1', 'PC2'], index=df_cleaned.index)
df_pca['rank'] = df_cleaned['rank']
df_pca.head()
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Scree plot of the eigenvalues.
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(eigenvalues_sorted) + 1), eigenvalues_sorted, marker='o')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Eigenvalue (Variance)')
plt.xticks(range(1, 6))
plt.grid(True)
plt.tight_layout()
plt.show()

# Scatter of the first two principal components.
plt.figure(figsize=(8, 6))
# Use 'rank' instead of 'Rank' for the hue, as 'rank' is the column name
sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue='rank', palette='viridis')
plt.grid(True)
plt.tight_layout()
plt.show()

# Heatmap of component loadings: how each original feature contributes to each PC.
plt.figure(figsize=(10, 6))
feature_names = df_cleaned.drop(columns=['rank']).columns
loadings = eigenvectors_sorted[:, :2].T
plt.imshow(loadings, cmap='coolwarm', aspect='auto')
plt.colorbar(label='Loading')
plt.xticks(range(len(feature_names)), feature_names, rotation=90)
plt.yticks(range(loadings.shape[0]), ['PC1', 'PC2'])
plt.xlabel('Original Features')
plt.ylabel('Principal Components')
plt.tight_layout()
plt.show()
EXPERIMENT 6 :- Implement K Means Clustering
with Optimal K Determination.
Here, we used the "Global YouTube Statistics for pca.csv" dataset.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import files
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

uploaded = files.upload()
file_name = list(uploaded.keys())[0]
data = pd.read_csv(file_name)

# Sanity-check that the transactional columns we need are present (expected list assumed).
expected_columns = ["CustomerID", "InvoiceNo", "Quantity", "UnitPrice", "Country"]
if not set(expected_columns).issubset(data.columns):
    raise ValueError(f"Missing columns: {set(expected_columns) - set(data.columns)}")

# Data Cleaning.
data = data.dropna(subset=["CustomerID"]).copy()
data["TotalAmount"] = data["Quantity"] * data["UnitPrice"]  # TotalAmount assumed as Quantity * UnitPrice

# Aggregation.
customer_data = data.groupby("CustomerID").agg({
    "InvoiceNo": "nunique",
    "Quantity": "sum",
    "TotalAmount": "sum",
    "Country": "first"
}).reset_index().rename(columns={"InvoiceNo": "NumInvoices"})

# One-hot encode the country and scale the numeric features.
customer_encoded = pd.get_dummies(customer_data, columns=["Country"])
feature_columns = [c for c in customer_encoded.columns if c != "CustomerID"]
X = customer_encoded[feature_columns].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow Method.
inertia = []
K_range = range(1, 11)
for k in K_range:
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(X_scaled)
    inertia.append(km.inertia_)

plt.figure(figsize=(8,5))
plt.plot(K_range, inertia, marker='o')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.xticks(K_range)
plt.show()
# Silhouette Analysis.
sil_scores = []
for k in range(2, 11):
    km = KMeans(n_clusters=k, random_state=42)
    labels = km.fit_predict(X_scaled)
    sil_scores.append(silhouette_score(X_scaled, labels))

plt.figure(figsize=(8,5))
plt.plot(range(2, 11), sil_scores, marker='o')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis')
plt.xticks(range(2,11))
plt.show()

# Fit the final model with the K that maximises the silhouette score.
optimal_k = 2 + np.argmax(sil_scores)
final_km = KMeans(n_clusters=optimal_k, random_state=42)
labels = final_km.fit_predict(X_scaled)
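To interpret the clusters, the labels can be attached back to the aggregated customer table and summarised per cluster; a minimal sketch, assuming the customer_data frame and labels from above:

customer_data['Cluster'] = labels
# Average purchasing behaviour and size of each cluster.
print(customer_data.groupby('Cluster')[['NumInvoices', 'Quantity', 'TotalAmount']].mean())
print(customer_data['Cluster'].value_counts().sort_index())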
EXPERIMENT 7 :- Implementation of ANN with
one hidden layer.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

iris = load_iris()
X, y = iris.data, iris.target
y_cat = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size = 0.2, random_state = 42)

# Feature Standardisation.
scaler_tab = StandardScaler()
X_train = scaler_tab.fit_transform(X_train)
X_test = scaler_tab.transform(X_test)

# Build ANN: one hidden layer (width assumed), softmax output layer for the 3 iris classes.
ann = Sequential([
    Dense(16, activation='relu', input_shape=(X_train.shape[1],)),  # Hidden Layer.
    Dense(3, activation='softmax'),                                 # Output Layer.
])
ann.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = ann.fit(X_train, y_train, epochs=100, batch_size=8,
                  validation_data=(X_test, y_test), verbose=0)

# Training curves.
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.show()
After regularisation (a sketch of the regularised network follows):
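The regularised variant itself is not reproduced in the source; a minimal sketch, assuming L2 weight decay and dropout on the same one-hidden-layer architecture:

from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2

# Hypothetical regularised version of the same network.
ann_reg = Sequential([
    Dense(16, activation='relu', input_shape=(X_train.shape[1],),
          kernel_regularizer=l2(1e-3)),
    Dropout(0.3),
    Dense(3, activation='softmax'),
])
ann_reg.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history_reg = ann_reg.fit(X_train, y_train, epochs=100, batch_size=8,
                          validation_data=(X_test, y_test), verbose=0)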
EXPERIMENT 8 :- Develop CNN Architecture for
Object Detection in an Image and apply various
Regularization Techniques.
# AVIF Support.
try:
    import pillow_avif  # registers an AVIF codec with Pillow if the plugin is installed (plugin name assumed)
except ImportError:
    pass
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import imageio.v3 as iio
from PIL import Image
from google.colab import files

Image.init()
# Upload the scene image and the object template.
scene_up = files.upload()
scene_path = list(scene_up.keys())[0]
tmpl_up = files.upload()
tmpl_path = list(tmpl_up.keys())[0]

def load_and_preprocess(img_path, size=(224, 224)):
    # Read the image, drop any alpha channel, resize to the network input and scale to [0, 1].
    arr = iio.imread(img_path)
    if arr.ndim == 2:
        arr = np.stack([arr] * 3, axis=-1)
    arr = np.array(Image.fromarray(arr[..., :3].astype(np.uint8)).resize(size)) / 255.0
    return arr

scene = load_and_preprocess(scene_path)
tmpl = load_and_preprocess(tmpl_path)
# Regularisation: L2 weight decay on the conv layers, plus Batch Normalisation and Dropout.
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Conv2D, BatchNormalization, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model

base = VGG16(weights='imagenet', include_top=False)  # backbone assumed; variable input size
for layer in base.layers:
    if isinstance(layer, Conv2D):
        layer.kernel_regularizer = l2(1e-4)  # note: only takes effect if the model is rebuilt/recompiled

x = base.output
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
feature_extractor = Model(inputs=base.input, outputs=x)
# Feature Extraction.
def extract_feat(img):
    inp = np.expand_dims(img.astype(np.float32), axis=0)
    feat = feature_extractor.predict(inp)
    return feat[0]

feat_scene = extract_feat(scene)
# Template scaled down so its feature map is smaller than the scene's (scale assumed).
tmpl_scaled = np.array(Image.fromarray((tmpl * 255).astype(np.uint8)).resize((112, 112))) / 255.0
feat_tmpl = extract_feat(tmpl_scaled)

tmpl_vec = feat_tmpl.flatten()
tmpl_vec /= np.linalg.norm(tmpl_vec)

# Slide the template window over the scene feature map, scoring cosine similarity at each offset.
def compute_heatmap(feat_scene, feat_tmpl):
    h_t, w_t = feat_tmpl.shape[0], feat_tmpl.shape[1]
    heatmap = np.zeros((feat_scene.shape[0] - h_t + 1, feat_scene.shape[1] - w_t + 1))
    for i in range(heatmap.shape[0]):
        for j in range(heatmap.shape[1]):
            patch = feat_scene[i:i+h_t, j:j+w_t, :].flatten()
            patch /= np.linalg.norm(patch)
            heatmap[i, j] = np.dot(patch, tmpl_vec)
    return heatmap

heatmap = compute_heatmap(feat_scene, feat_tmpl)
# Locate the best-matching window and map it back to scene pixel coordinates.
y, x = np.unravel_index(np.argmax(heatmap), heatmap.shape)
h_t, w_t = feat_tmpl.shape[:2]
stride = scene.shape[0] // feat_scene.shape[0]
plt.figure(figsize=(8, 8))
plt.imshow(scene)
plt.gca().add_patch(plt.Rectangle((x * stride, y * stride), w_t * stride, h_t * stride,
                                  edgecolor='red', facecolor='none', lw=2))
plt.title('Detected Object')
plt.axis('off')
plt.show()
# Training curves (history assumed to come from a model.fit call of the regularised classifier, not reproduced in the source).
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
# Predict the class of the first test image (model and test_generator come from the training step, not reproduced in the source).
img_path = test_generator.filepaths[0]
img = tf.keras.utils.load_img(img_path, target_size=(224, 224))
img_array = np.expand_dims(tf.keras.utils.img_to_array(img) / 255.0, axis=0)
plt.imshow(img)
plt.axis('off')
prediction = model.predict(img_array)
predicted_class = list(test_generator.class_indices.keys())[np.argmax(prediction)]
plt.title(f'Predicted: {predicted_class}')
plt.show()