ML Lab Manual
Data Cleaning
df.isnull().sum()
df.duplicated().sum()
df['total_bedrooms'].median()
# Handling missing values
df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)
Feature Engineering
for i in df.iloc[:, 2:7]:
    df[i] = df[i].astype('int')
Descriptive Statistics
df.describe().T
Numerical = df.select_dtypes(include=[np.number]).columns
print(Numerical)
Univariate Analysis
for col in Numerical:
    plt.figure(figsize=(10, 6))
    df[col].plot(kind='hist', title=col, bins=60, edgecolor='black')
    plt.ylabel('Frequency')
    plt.show()
for col in Numerical:
    plt.figure(figsize=(6, 6))
    sns.boxplot(df[col], color='blue')
    plt.title(col)
    plt.ylabel(col)
    plt.show()
Experiment 2
Develop a program to compute the correlation matrix to understand the
relationships between pairs of features. Visualize the correlation matrix using
a heatmap to identify which variables have strong positive/negative correlations.
Create a pair plot to visualize pairwise relationships between features. Use the
California Housing dataset.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
# Load California Housing dataset
data = fetch_california_housing()
# Convert to DataFrame
df = pd.DataFrame(data.data, columns=data.feature_names)
df['Target'] = data.target # Adding the target variable (median house value)
# Table of Meaning of Each Variable
variable_meaning = {
"MedInc": "Median income in block group",
"HouseAge": "Median house age in block group",
"AveRooms": "Average number of rooms per household",
"AveBedrms": "Average number of bedrooms per household",
"Population": "Population of block group",
"AveOccup": "Average number of household members",
"Latitude": "Latitude of block group",
"Longitude": "Longitude of block group",
"Target": "Median house value (in $100,000s)"
}
variable_df = pd.DataFrame(list(variable_meaning.items()), columns=["Feature", "Description"])
print("\nVariable Meaning Table:")
print(variable_df)
# Basic Data Exploration
print("\nBasic Information about Dataset:")
print(df.info()) # Overview of dataset
print("\nFirst Five Rows of Dataset:")
print(df.head()) # Display first few rows
# Check for missing values
print("\nMissing Values in Each Column:")
print(df.isnull().sum()) # Count of missing values
# Histograms for distribution of features
plt.figure(figsize=(12, 8))
df.hist(figsize=(12, 8), bins=30, edgecolor='black')
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()
# Boxplots for outlier detection
plt.figure(figsize=(12, 6))
sns.boxplot(data=df)
plt.xticks(rotation=45)
plt.title("Boxplots of Features to Identify Outliers")
plt.show()
# Correlation Matrix
plt.figure(figsize=(10, 6))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Feature Correlation Heatmap")
plt.show()
# Pairplot to analyze feature relationships (only a subset for clarity)
sns.pairplot(df[['MedInc', 'HouseAge', 'AveRooms', 'Target']], diag_kind='kde')
plt.show()
# Insights from Data Exploration
print("\nKey Insights:")
print("1. The dataset has", df.shape[0], "rows and", df.shape[1], "columns.")
print("2. No missing values were found in the dataset.")
print("3. Histograms show skewed distributions in some features like 'MedInc'.")
print("4. Boxplots indicate potential outliers in 'AveRooms' and 'AveOccup'.")
print("5. Correlation heatmap shows 'MedInc' has the highest correlation with house prices.")
# Run Find-S Algorithm
final_hypothesis = find_s_algorithm(data)
print("Final Hypothesis:", final_hypothesis)
1. Label the first 50 points {x1, ..., x50} as follows: if (xi ≤ 0.5), then xi ∊ Class1, else xi ∊ Class2
2. Classify the remaining points, x51, ..., x100, using KNN. Perform this for
k = 1, 2, 3, 4, 5, 20, 30
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
values = np.random.rand(100)
labels = []
for i in values[:50]:
    if i <= 0.5:
        labels.append('Class1')
    else:
        labels.append('Class2')
labels += [None] * 50
data = {
    "Value": values,
    "Label": labels
}
print(data)
type(data)
df = pd.DataFrame(data)
df.head()
df.nunique()
df.shape
df.info()
df.describe().T
df.isnull().sum()
# Histogram of the generated values
plt.figure(figsize=(8, 4))
df["Value"].hist(bins=20, edgecolor='black')
plt.xlabel("Value")
plt.ylabel('Frequency')
plt.show()
labeled_df = df[df["Label"].notna()]
X_train = labeled_df[["Value"]]
y_train = labeled_df["Label"]
unlabeled_df = df[df["Label"].isna()]
X_test = unlabeled_df[["Value"]]
results = {}
accuracies = {}
# True labels for the unlabeled points, derived from the same rule used for the first 50
true_labels = np.where(unlabeled_df["Value"] <= 0.5, 'Class1', 'Class2')
k_values = [1, 2, 3, 4, 5, 20, 30]
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_test)
    results[k] = predictions
    # Calculate accuracy against the rule-based true labels
    accuracy = accuracy_score(true_labels, predictions) * 100
    accuracies[k] = accuracy
    print(f"Accuracy for k={k}: {accuracy:.2f}%")
    unlabeled_df[f"Label_k{k}"] = predictions
    print(predictions)
unlabeled_df.head()  # preview the classified points
# Display accuracies
for k, acc in accuracies.items():
    print(f"k={k}: {acc:.2f}%")
Experiment 6: Implement the non-parametric Locally Weighted Regression algorithm in
order to fit data points. Select an appropriate data set for your experiment and draw graphs.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from scipy.spatial.distance import cdist
# Load datasets
df_linear = pd.read_csv("linear_dataset.csv")
df_lwr = pd.read_csv("lwr_dataset.csv")
df_poly = pd.read_csv("polynomial_dataset.csv")
# Linear Regression
def linear_regression(df):
    X, y = df[['X']], df['Y']
    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)
    plt.scatter(X, y, label='Data')
    plt.plot(X, y_pred, color='red', label='Linear Regression')
    plt.legend()
    plt.title("Linear Regression")
    plt.show()
linear_regression(df_linear)
# Locally Weighted Regression (LWR)
def gaussian_kernel(x, X, tau):
    return np.exp(-cdist([[x]], X, 'sqeuclidean') / (2 * tau**2))
def locally_weighted_regression(X, y, tau=0.5):  # tau=0.5 is an assumed bandwidth
    X_train = np.c_[np.ones(len(X)), X]  # design matrix with an intercept column
    X_range = np.linspace(X.min(), X.max(), 200)
    y_pred = []
    for x in X_range:
        x_vec = np.array([1, x])  # Intercept term
        weights = gaussian_kernel(x, X_train[:, 1:], tau).flatten()
        W = np.diag(weights)
        # Weighted least-squares solution for this query point
        theta = np.linalg.pinv(X_train.T @ W @ X_train) @ X_train.T @ W @ y
        y_pred.append(x_vec @ theta)
    plt.scatter(X, y, label='Data'); plt.plot(X_range, y_pred, color='red', label='LWR')
    plt.legend(); plt.title("Locally Weighted Regression"); plt.show()
locally_weighted_regression(df_lwr[['X']].values, df_lwr['Y'].values)
# Polynomial Regression
def polynomial_regression(df, degree=3):
    X, y = df[['X']], df['Y']
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    model.fit(X, y)
    y_pred = model.predict(X)
    plt.scatter(X, y, label='Data')
    plt.plot(X, y_pred, color='red', label=f'Polynomial Regression (deg={degree})')
    plt.legend()
    plt.title("Polynomial Regression")
    plt.show()
polynomial_regression(df_poly, degree=3)
Experiment 7 A: Develop a program to demonstrate the working of Linear Regression and
Polynomial Regression. Use Boston Housing Dataset for Linear Regression and Auto MPG
Dataset (for vehicle fuel efficiency prediction) for Polynomial Regression.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
data=pd.read_csv(r"C:\Users\Asus\Documents\ML6thSEM_FDP_Day2\ML6thSEM_FDP_Day2\
Experiment_7_Lin_Poly_reg\Boston housing dataset.csv")
data.head()
data.shape
data.info()
data.nunique()
data.ZN.unique()
# **Data Cleaning**
data.isnull().sum()
data.duplicated().sum()
df = data.copy()
df.isnull().sum()
df.head()
df['CHAS'] = df['CHAS'].astype('int')
df.describe().T
for i in df.columns:
    plt.figure(figsize=(6, 3))
    plt.subplot(1, 2, 1)
    plt.hist(df[i], bins=20, edgecolor='black')
    plt.title(f'Histogram of {i}')
    plt.xlabel(i)
    plt.ylabel('Frequency')
    plt.subplot(1, 2, 2)
    plt.boxplot(df[i], vert=False)
    plt.title(f'Boxplot of {i}')
    plt.show()
corr = df.corr(method='pearson')
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Matrix Heatmap")
plt.xticks(rotation=90, ha='right')
plt.yticks(rotation=0)
plt.show()
from sklearn.preprocessing import StandardScaler
X = df.drop('MEDV', axis=1)  # assuming the target column is named 'MEDV'
y = df['MEDV']
scale = StandardScaler()
X_scaled = scale.fit_transform(X)
# Split the data into training (80%) and testing (20%) sets
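The listing stops before the regression itself; below is a minimal sketch of the split and linear model fit, using the X_scaled and y defined above (the evaluation metrics are an assumed choice).
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Fit ordinary least squares and evaluate on the held-out 20%
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R^2:", r2_score(y_test, y_pred))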
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sns.get_dataset_names()
data = sns.load_dataset('mpg')
data.head()
data.shape
data.info()
data.nunique()
data.horsepower.unique()
data.isnull().sum()
data.duplicated().sum()
df = data.copy()
df['horsepower'].fillna(df['horsepower'].median(), inplace=True)
df.describe().T
### **EDA**
numerical = df.select_dtypes(include=['int','float']).columns
categorical = df.select_dtypes(include=['object']).columns
print(numerical)
print(categorical)
for i in numerical:
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    df[i].hist(bins=20, alpha=0.5, color='b', edgecolor='black')
    plt.title(f'Histogram of {i}')
    plt.xlabel(i)
    plt.ylabel('Frequency')
    plt.subplot(1, 2, 2)
    plt.boxplot(df[i], vert=False)
    plt.title(f'Boxplot of {i}')
    plt.show()
import seaborn as sns
for col in categorical:
    plt.figure(figsize=(6, 6))
    sns.countplot(x=col, data=df, order=df[col].value_counts().sort_values().head(10).index, palette='viridis')
    plt.title(f'Countplot of {col}')
    plt.xticks(rotation=90)
    plt.show()
# Select the relevant features
X = df[['horsepower']]  # You can select other features here
y = df['mpg']
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
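The polynomial fit itself is not shown after the split; below is a minimal sketch using a PolynomialFeatures + LinearRegression pipeline (degree 2 is an assumed choice).
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Degree-2 polynomial regression of mpg on horsepower
poly_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_model.fit(X_train, y_train)
y_pred = poly_model.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R^2:", r2_score(y_test, y_pred))
# Plot the fitted curve over the data
x_line = pd.DataFrame({'horsepower': np.linspace(X['horsepower'].min(), X['horsepower'].max(), 200)})
plt.scatter(X, y, alpha=0.4, label='Data')
plt.plot(x_line, poly_model.predict(x_line), color='red', label='Polynomial fit (deg=2)')
plt.xlabel('horsepower'); plt.ylabel('mpg'); plt.legend(); plt.show()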
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv(r'C:\Users\Admin\OneDrive\Documents\MachineLearning Lab\Datasets\Breast Cancer Dataset.csv')
pd.set_option('display.max_columns', None)
data.head()
data.shape
data.info()
data.diagnosis.unique()
Data Preprocessing
Data Cleaning
data.isnull().sum()
data.duplicated().sum()
df = data.drop(['id'], axis=1)
df['diagnosis'] = df['diagnosis'].map({'M':1, 'B':0}) # Malignant:1, Benign:0
df.describe().T
corr = df.corr(method='pearson')
plt.figure(figsize=(18, 10))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.xticks(rotation=90, ha='right')
plt.yticks(rotation=0)
plt.title("Correlation Matrix Heatmap")
plt.show()
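The loop below calls an `information_gain` helper that is not defined in the listing. A minimal entropy-based sketch follows; splitting each continuous feature at its median is an assumption.
def entropy(series):
    # Shannon entropy of a discrete label distribution
    probs = series.value_counts(normalize=True)
    return -np.sum(probs * np.log2(probs))
def information_gain(frame, feature, target):
    # Binarize the continuous feature at its median, then measure the entropy reduction
    parent_entropy = entropy(frame[target])
    split = frame[feature] <= frame[feature].median()
    weighted_child_entropy = sum(
        (mask.sum() / len(frame)) * entropy(frame.loc[mask, target])
        for mask in (split, ~split)
    )
    return parent_entropy - weighted_child_entropy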
X = df.drop('diagnosis', axis=1).columns  # candidate feature columns
for feature in X:
    ig = information_gain(df, feature, 'diagnosis')
    print(f"Information Gain for {feature}: {ig}")
# Export the tree to DOT format
from sklearn.tree import export_graphviz
dot_data = export_graphviz(model, out_file=None,
                           feature_names=X_train.columns,
                           rounded=True, proportion=False,
                           precision=2, filled=True)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
data = fetch_olivetti_faces()
data.keys()
print("Data Shape:", data.data.shape)
print("Target Shape:", data.target.shape)
print("There are {} unique persons in the dataset".format(len(np.unique(data.target))))
print("Size of each image is {}x{}".format(data.images.shape[1],data.images.shape[1]))
def print_faces(images, target, top_n):
    # Ensure the number of images does not exceed available data
    top_n = min(top_n, len(images))
    # Grid of subplots sized to hold top_n faces (20 per row is an assumed layout)
    n_cols = 20
    n_rows = int(np.ceil(top_n / n_cols))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 1.5 * n_rows))
    for i, ax in enumerate(axes.ravel()):
        if i < top_n:
            ax.imshow(images[i], cmap='bone')
            ax.axis('off')
            ax.text(2, 12, str(target[i]), fontsize=9, color='red')
            ax.text(2, 55, f"face: {i}", fontsize=9, color='blue')
        else:
            ax.axis('off')
    plt.show()
# Let us extract the unique characters (one face per person) present in the dataset
def display_unique_faces(pics):
    fig = plt.figure(figsize=(24, 10))  # Set figure size
    columns, rows = 10, 4  # Define grid dimensions (40 unique persons)
    for i in range(1, columns * rows + 1):  # every 10th image is a new person
        fig.add_subplot(rows, columns, i)
        plt.imshow(pics[10 * (i - 1)], cmap='gray'); plt.title(f"Person {i}"); plt.axis('off')
    plt.show()
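The split that produces x_train and x_test printed below is not shown in the listing; a minimal sketch using a stratified train/test split on the flattened image data (the 25% test size is an assumption):
from sklearn.model_selection import train_test_split
# Flattened 64x64 images (data.data) and person ids (data.target); stratify so every person appears in both sets
x_train, x_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.25, stratify=data.target, random_state=42)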
print("x_train: ",x_train.shape)
print("x_test: ",x_test.shape)
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score
# Train a Gaussian Naive Bayes classifier and calculate its accuracy
nb = GaussianNB()
nb.fit(x_train, y_train)
y_pred = nb.predict(x_test)
nb_accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
print(f"Gaussian Naive Bayes Accuracy: {nb_accuracy}%")
# Train a Multinomial Naive Bayes classifier and calculate its accuracy
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
y_pred = mnb.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
print(f"Multinomial Naive Bayes Accuracy: {accuracy}%")
# Calculate the number of misclassified images
misclassified_idx = np.where(y_pred != y_test)[0]
num_misclassified = len(misclassified_idx)
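confusion_matrix is imported above but never used; a minimal sketch that reports the misclassification count and shows the 40x40 confusion matrix as a heatmap (shown without per-cell annotations because of the class count):
import seaborn as sns
print(f"Number of misclassified images: {num_misclassified} out of {len(y_test)}")
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Naive Bayes Confusion Matrix')
plt.show()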