Machine Learning Lab Manual
Program 1: Develop a program to create histograms for all numerical features and
analyze the distribution of each feature. Generate box plots for all numerical
features and identify any outliers. Use the California Housing dataset.
import pandas as pd
import ssl
ssl._create_default_https_context = ssl._create_stdlib_context
california_housing = fetch_california_housing()
data['Target'] = california_housing.target
def plot_histograms(data):
plt.figure(figsize=(15, 10))
data[feature].hist(bins=30, edgecolor='black')
plt.title(f'Histogram of {feature}')
plt.xlabel(feature)
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
plot_histograms(data)
OUTPUT:
import pandas as pd
import numpy as np
data = fetch_california_housing(as_frame=True)
housing_df = data.frame
numerical_features = housing_df.select_dtypes(include=[np.number]).columns
# Plot histograms
plt.figure(figsize=(15, 10))
plt.subplot(3, 3, i + 1)
plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
plt.subplot(3, 3, i + 1)
sns.boxplot(x=housing_df[feature], color='orange')
plt.tight_layout()
plt.show()
print("Outliers Detection:")
outliers_summary = {}
Q1 = housing_df[feature].quantile(0.25)
Q3 = housing_df[feature].quantile(0.75)
IQR = Q3 - Q1
outliers_summary[feature] = len(outliers)
print("\nDataset Summary:")
print(housing_df.describe())
Outliers Detection:
HouseAge: 0 outliers
Latitude: 0 outliers
Longitude: 0 outliers
Dataset Summary:
[8 rows x 9 columns]
Program 2: Develop a program to Compute the correlation matrix to understand
the relationships between pairs of features. Visualize the correlation matrix using a
heatmap to know which variables have strong positive/negative correlations.
Create a pair plot to visualize pairwise relationships between features. Use
California Housing dataset.
import pandas as pd
correlation_matrix = data.corr()
import numpy as np
import pandas as pd
iris = load_iris()
data = iris.data
labels = iris.target
label_names = iris.target_names
pca = PCA(n_components=2)
data_reduced = pca.fit_transform(data)
reduced_df['Label'] = labels
# Plot the reduced data
plt.figure(figsize=(8, 6))
plt.scatter(
label=label_names[label],
color=colors[i]
plt.legend()
plt.grid()
plt.show()
Program 4: For a given set of training data examples stored in a .CSV file,
implement and demonstrate the Find-S algorithm to output a description of the set
of all hypotheses consistent with the training examples.
import pandas as pd
def find_s_algorithm(file_path):
data = pd.read_csv(file_path)
print("Training data:")
print(data)
attributes = data.columns[:-1]
class_label = data.columns[-1]
if row[class_label] == 'Yes':
hypothesis[i] = value
else:
hypothesis[i] = '?'
return hypothesis
file_path = 'TD/training_data.csv'
hypothesis = find_s_algorithm(file_path)
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
data = np.random.rand(100)
distances.sort(key=lambda x: x[0])
k_nearest_neighbors = distances[:k]
train_data = data[:50]
train_labels = labels
test_data = data[50:]
results = {}
for k in k_values:
print(f"Results for k = {k}:")
classified_labels = [knn_classifier(train_data, train_labels, test_point, k) for test_point in
test_data]
results[k] = classified_labels
for k in k_values:
classified_labels = results[k]
class1_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class1"]
class2_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class2"]
plt.figure(figsize=(10, 6))
plt.scatter(train_data, [0] * len(train_data), c=["blue" if label == "Class1" else "red" for label
in train_labels],
label="Training Data", marker="o")
plt.scatter(class1_points, [1] * len(class1_points), c="blue", label="Class1 (Test)",
marker="x")
plt.scatter(class2_points, [1] * len(class2_points), c="red", label="Class2 (Test)", marker="x")
Training dataset: First 50 points labeled based on the rule (x <= 0.5 -> Class1, x > 0.5 -> Class2)
Results for k = 1:
Results for k = 2:
Results for k = 3:
Results for k = 4:
Results for k = 5:
Classification complete.
Program 6: Implement the non-parametric Locally Weighted Regression
algorithm in order to fit data points. Select an appropriate data set for
your experiment and draw graphs.