ML Manual

The document consists of a series of labs focusing on data analysis and machine learning techniques using Python. It covers various topics including data preprocessing, exploratory data analysis, regression models, decision trees, clustering, and evaluation metrics. Each lab demonstrates practical implementations with different datasets and visualizations.


Lab 1:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv(r'housing.csv')
df.head()
df.shape
df.info()
df.nunique()
df.isnull().sum()
df.duplicated().sum()
df['total_bedrooms'].median()

# Impute missing bedroom counts with the median, then cast selected columns to int
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())
for i in df.iloc[:, 2:7]:
    df[i] = df[i].astype('int')
df.describe().T

Numerical = df.select_dtypes(include=[np.number]).columns
print(Numerical)

# Histogram for each numerical column
for col in Numerical:
    plt.figure(figsize=(10, 6))
    df[col].plot(kind='hist', title=col, bins=60, edgecolor='black')
    plt.ylabel('Frequency')
    plt.show()

# Boxplot for each numerical column
for col in Numerical:
    plt.figure(figsize=(6, 6))
    sns.boxplot(df[col], color='blue')
    plt.title(col)
    plt.ylabel(col)
    plt.show()
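The boxplots above flag points beyond the whiskers as potential outliers. As a follow-up, here is a minimal sketch that counts them with the common 1.5×IQR rule (the rule and thresholds are an addition for illustration, not part of the lab listing):

# Illustrative follow-up: count outliers per column using the 1.5*IQR rule
for col in Numerical:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_out = ((df[col] < lower) | (df[col] > upper)).sum()
    print(f"{col}: {n_out} values outside [{lower:.2f}, {upper:.2f}]")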

Lab 2:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv(r'housing.csv')

variable_meaning = {
    "MedInc": "Median income in block group",
    "HouseAge": "Median house age in block group",
    "AveRooms": "Average number of rooms per household",
    "AveBedrms": "Average number of bedrooms per household",
    "Population": "Population of block group",
    "AveOccup": "Average number of household members",
    "Latitude": "Latitude of block group",
    "Longitude": "Longitude of block group",
    "Target": "Median house value (in $100,000s)"
}
variable_df = pd.DataFrame(list(variable_meaning.items()), columns=["Feature", "Description"])
print("\nVariable Meaning Table:")
print(variable_df)

print("\nBasic Information about Dataset:")
print(df.info())

print("\nFirst Five Rows of Dataset:")
print(df.head())

print("\nSummary Statistics:")
print(df.describe().T)

print("\nMissing Values in Each Column:")
print(df.isnull().sum())

# Histograms of every feature (df.hist creates its own figure)
df.hist(figsize=(12, 8), bins=30, edgecolor='black')
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()

# Boxplots to spot outliers
plt.figure(figsize=(12, 6))
sns.boxplot(data=df.select_dtypes(include=[np.number]))
plt.xticks(rotation=45)
plt.title("Boxplots of Features to Identify Outliers")
plt.show()

# Correlation heatmap
correlation_matrix = df.select_dtypes(include=[np.number]).corr()
print(correlation_matrix)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', square=True)
plt.title("Correlation Heatmap of Features")
plt.show()

# Pair plot on a random sample of rows
sns.pairplot(df.select_dtypes(include=[np.number]).sample(300), corner=True)
plt.suptitle("Pair Plot of Sampled Numerical Features", y=1.02)
plt.show()
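To read the heatmap programmatically, the strongest pairwise correlations can be listed directly; a small sketch (how many pairs to show is arbitrary):

# Illustrative: list the strongest off-diagonal correlations
corr_pairs = correlation_matrix.unstack().sort_values(key=abs, ascending=False)
corr_pairs = corr_pairs[corr_pairs.index.get_level_values(0) != corr_pairs.index.get_level_values(1)]
print(corr_pairs.head(6))  # each pair appears twice because the matrix is symmetric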

Lab 3:

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt

iris = load_iris()
features = iris.data
target = iris.target

# Standardise, then project the 4 features onto 2 principal components
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_standardized)

pca_df = pd.DataFrame(data=features_pca, columns=["Principal Component 1", "Principal Component 2"])
pca_df["Target"] = target

plt.figure(figsize=(8, 6))
for label, color in zip(iris.target_names, ["red", "green", "blue"]):
    plt.scatter(
        pca_df.loc[pca_df["Target"] == list(iris.target_names).index(label), "Principal Component 1"],
        pca_df.loc[pca_df["Target"] == list(iris.target_names).index(label), "Principal Component 2"],
        label=label,
        color=color,
        alpha=0.7
    )
plt.title("PCA on Iris Dataset (4 features to 2 features)", fontsize=14)
plt.xlabel("Principal Component 1", fontsize=12)
plt.ylabel("Principal Component 2", fontsize=12)
plt.legend(title="Species")
plt.grid()
plt.show()
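A quick diagnostic, added here for illustration, shows how much of the original variance the two retained components capture:

# Illustrative check: variance captured by the two components
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance retained:", pca.explained_variance_ratio_.sum())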

Lab 4:

import pandas as pd

data = pd.read_csv(r"training_data.csv")

def find_s_algorithm(data):
    attributes = data.iloc[:, :-1].values
    target = data.iloc[:, -1].values

    # Start from the first positive example as the most specific hypothesis
    for i in range(len(target)):
        if target[i] == "Yes":
            hypothesis = attributes[i].copy()
            break

    # Generalise any attribute that disagrees with a later positive example
    for i in range(len(target)):
        if target[i] == "Yes":
            for j in range(len(hypothesis)):
                if hypothesis[j] != attributes[i][j]:
                    hypothesis[j] = '?'

    return hypothesis

final_hypothesis = find_s_algorithm(data)
print("Most Specific Hypothesis:", final_hypothesis)

Lab 5:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
values = np.random.rand(100)
labels = []
# Label the first 50 points by thresholding at 0.5; leave the rest unlabelled
for i in values[:50]:
    if i <= 0.5:
        labels.append('Class1')
    else:
        labels.append('Class2')
labels += [None] * 50

data = {
    "Point": [f"x{i+1}" for i in range(100)],
    "Value": values,
    "Label": labels
}
print(data)
print(type(data))

df = pd.DataFrame(data)
df.nunique()
df.shape
df.describe().T
df.isnull().sum()

num_col = df.select_dtypes(include=['int', 'float']).columns
for col in num_col:
    df[col].hist(bins=10, alpha=0.5, edgecolor='black', grid=False)
    plt.title(f'Histogram for {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()
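The lab imports KNeighborsClassifier, train_test_split, and accuracy_score but stops after the EDA, so the classification step appears to be missing from this listing. A minimal sketch of how it could continue, assuming the first 50 labelled points serve as training data for the 50 unlabelled ones (the k values are illustrative):

# Assumed continuation: classify the unlabelled points with KNN
X_train = df['Value'].iloc[:50].values.reshape(-1, 1)
y_train = df['Label'].iloc[:50].values
X_unlabelled = df['Value'].iloc[50:].values.reshape(-1, 1)

for k in [1, 3, 5]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    preds = knn.predict(X_unlabelled)
    print(f"k={k}: first five predictions ->", preds[:5])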

Lab 6:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist

df_lwr = pd.read_csv(r'C:\Users\BHARATH N\Downloads\lwr_dataset.csv')

X_train = df_lwr[['X']].values
y_train = df_lwr['Y'].values
X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])  # prepend intercept column

tau = 0.5  # kernel bandwidth: smaller values give a more local fit
X_range = np.linspace(X_train[:, 1].min(), X_train[:, 1].max(), 100)
y_pred = []

for x in X_range:
    x_vec = np.array([1, x])  # query point with intercept term
    # Gaussian kernel weights centred on the query point
    weights = np.exp(-cdist([[x]], X_train[:, 1:], 'sqeuclidean') / (2 * tau**2)).flatten()
    W = np.diag(weights)
    # Weighted least-squares fit local to this query point
    theta = np.linalg.pinv(X_train.T @ W @ X_train) @ (X_train.T @ W @ y_train)
    y_pred.append(x_vec @ theta)

plt.scatter(X_train[:, 1], y_train, label='Data')
plt.plot(X_range, y_pred, color='red', label='LWR')
plt.legend()
plt.title("Locally Weighted Regression")
plt.show()
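The bandwidth tau controls how local each fit is: small values track the data closely (and can overfit noise), while large values approach an ordinary least-squares line. A sketch for comparing a few bandwidths on the same data (the values are illustrative):

# Illustrative: effect of the bandwidth tau on the LWR fit
for tau in [0.1, 0.5, 2.0]:
    y_tau = []
    for x in X_range:
        w = np.exp(-cdist([[x]], X_train[:, 1:], 'sqeuclidean') / (2 * tau**2)).flatten()
        W = np.diag(w)
        theta = np.linalg.pinv(X_train.T @ W @ X_train) @ (X_train.T @ W @ y_train)
        y_tau.append(np.array([1, x]) @ theta)
    plt.plot(X_range, y_tau, label=f'tau={tau}')
plt.scatter(X_train[:, 1], y_train, s=10, alpha=0.5, label='Data')
plt.legend()
plt.title("LWR fits for different bandwidths")
plt.show()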

Lab 7:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

def linear_regression_california():
    housing = fetch_california_housing(as_frame=True)
    X = housing.data[["AveRooms"]]
    y = housing.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.plot(X_test, y_pred, color="red", label="Predicted")
    plt.xlabel("Average number of rooms (AveRooms)")
    plt.ylabel("Median value of homes ($100,000)")
    plt.title("Linear Regression - California Housing Dataset")
    plt.legend()
    plt.show()
    print("Linear Regression - California Housing Dataset")
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))

def polynomial_regression_auto_mpg():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
    column_names = ["mpg", "cylinders", "displacement", "horsepower", "weight",
                    "acceleration", "model_year", "origin"]
    data = pd.read_csv(url, sep='\\s+', names=column_names, na_values="?")
    data = data.dropna()
    X = data["displacement"].values.reshape(-1, 1)
    y = data["mpg"].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    poly_model = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(), LinearRegression())
    poly_model.fit(X_train, y_train)
    y_pred = poly_model.predict(X_test)
    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.scatter(X_test, y_pred, color="red", label="Predicted")
    plt.xlabel("Displacement")
    plt.ylabel("Miles per gallon (mpg)")
    plt.title("Polynomial Regression - Auto MPG Dataset")
    plt.legend()
    plt.show()
    print("Polynomial Regression - Auto MPG Dataset")
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))

if __name__ == "__main__":
    print("Demonstrating Linear Regression and Polynomial Regression\n")
    linear_regression_california()
    polynomial_regression_auto_mpg()
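The polynomial degree of 2 is fixed in the pipeline above; a quick way to sanity-check that choice is to compare a few degrees on the same split. A sketch, assuming X_train, X_test, y_train, y_test have been recreated at top level exactly as inside polynomial_regression_auto_mpg() (the degree values are illustrative):

# Illustrative: compare polynomial degrees on the Auto MPG split
for degree in [1, 2, 3]:
    m = make_pipeline(PolynomialFeatures(degree=degree), StandardScaler(), LinearRegression())
    m.fit(X_train, y_train)
    print(f"degree={degree}: R^2 =", r2_score(y_test, m.predict(X_test)))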

Lab 8:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

data = load_breast_cancer()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Classify a new sample (here, the first test instance)
new_sample = np.array([X_test[0]])
prediction = clf.predict(new_sample)
prediction_class = "Benign" if prediction[0] == 1 else "Malignant"
print(f"Predicted Class for the new sample: {prediction_class}")

plt.figure(figsize=(12, 8))
tree.plot_tree(clf, filled=True, feature_names=data.feature_names,
               class_names=data.target_names)
plt.title("Decision Tree - Breast Cancer Dataset")
plt.show()
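An unconstrained decision tree grows until it fits the training data, which can overfit. A short check, with max_depth values chosen purely for illustration:

# Illustrative: how restricting tree depth affects test accuracy
for depth in [2, 3, 5, None]:
    t = DecisionTreeClassifier(max_depth=depth, random_state=42)
    t.fit(X_train, y_train)
    acc = accuracy_score(y_test, t.predict(X_test))
    print(f"max_depth={depth}: test accuracy = {acc * 100:.2f}%")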

Lab 9:

import numpy as np
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

data = fetch_olivetti_faces(shuffle=True, random_state=42)
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=1))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

cross_val_accuracy = cross_val_score(gnb, X, y, cv=5, scoring='accuracy')
print(f'\nCross-validation accuracy: {cross_val_accuracy.mean() * 100:.2f}%')

# Show the first 15 test faces with their true and predicted labels
fig, axes = plt.subplots(3, 5, figsize=(12, 8))
for ax, image, label, prediction in zip(axes.ravel(), X_test, y_test, y_pred):
    ax.imshow(image.reshape(64, 64), cmap=plt.cm.gray)
    ax.set_title(f"True: {label}, Pred: {prediction}")
    ax.axis('off')
plt.show()
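Gaussian Naive Bayes treats each of the 4096 pixels as an independent feature, which is a strong assumption for images. One common variation, sketched here with an arbitrary component count, is to decorrelate the pixels first with PCA (as used in Labs 3 and 10):

# Illustrative variation: PCA features before Gaussian NB
from sklearn.decomposition import PCA

pca = PCA(n_components=100, random_state=42)  # component count chosen arbitrarily
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

gnb_pca = GaussianNB()
gnb_pca.fit(X_train_pca, y_train)
acc_pca = accuracy_score(y_test, gnb_pca.predict(X_test_pca))
print(f'Accuracy with PCA features: {acc_pca * 100:.2f}%')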

Lab 10:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report

data = load_breast_cancer()
X = data.data
y = data.target

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=2, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)

print("Confusion Matrix:")
print(confusion_matrix(y, y_kmeans))
print("\nClassification Report:")
print(classification_report(y, y_kmeans))

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df['Cluster'] = y_kmeans
df['True Label'] = y

plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1', s=100, edgecolor='black', alpha=0.7)
plt.title('K-Means Clustering of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.show()

plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='True Label', palette='coolwarm', s=100, edgecolor='black', alpha=0.7)
plt.title('True Labels of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="True Label")
plt.show()

plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1', s=100, edgecolor='black', alpha=0.7)
# Project the cluster centres (found in scaled feature space) into PCA space
centers = pca.transform(kmeans.cluster_centers_)
plt.scatter(centers[:, 0], centers[:, 1], s=200, c='red', marker='X', label='Centroids')
plt.title('K-Means Clustering with Centroids')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.show()
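One caveat for the confusion matrix and classification report above: K-Means assigns arbitrary cluster IDs, so cluster 0 need not correspond to class 0 and the numbers can look inverted even when the clustering is good. A minimal alignment sketch for the two-cluster case:

# Illustrative: align arbitrary cluster IDs with the true labels (2 clusters)
from sklearn.metrics import accuracy_score

y_aligned = 1 - y_kmeans if accuracy_score(y, y_kmeans) < 0.5 else y_kmeans
print("Aligned agreement with true labels:", accuracy_score(y, y_aligned))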
