Ass - 11.ipynb - Colab

The document outlines various data analysis techniques using Python, including distance metrics (Euclidean, Manhattan, Cosine, Jaccard) on the Iris dataset, density estimation methods (Parzen Window and Nearest Neighbor) on the Wine Quality dataset, and hierarchical clustering methods on customer data. It includes visualizations such as heatmaps and dendrograms to represent the results. Additionally, it discusses the implementation of agglomerative and divisive clustering methods with sample datasets.

# Import necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_similarity
from sklearn.preprocessing import StandardScaler, Binarizer
from scipy.spatial.distance import pdist, squareform, jaccard

# Load the Iris dataset


iris = load_iris()
X = iris.data
feature_names = iris.feature_names

# Standardize the features


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Compute Euclidean distance matrix


euclidean_dist = euclidean_distances(X_scaled)

# Compute Manhattan distance matrix


manhattan_dist = manhattan_distances(X_scaled)

# Compute Cosine similarity matrix


cosine_sim = cosine_similarity(X_scaled)

# Binarize the features for Jaccard similarity


binarizer = Binarizer()
X_binarized = binarizer.fit_transform(X_scaled)

# Compute Jaccard similarity matrix


# pdist computes pairwise distances; 'jaccard' computes Jaccard distance
# Subtracting from 1 converts distance to similarity
jaccard_dist = pdist(X_binarized, metric='jaccard')
jaccard_sim = 1 - squareform(jaccard_dist)

# Function to plot heatmap


def plot_heatmap(matrix, title, labels=False):
    plt.figure(figsize=(10, 8))
    # The proximity matrices are 150x150 (sample vs. sample), so the four feature
    # names cannot serve as axis labels; labels=False suppresses tick labels.
    sns.heatmap(matrix, xticklabels=labels, yticklabels=labels, cmap='viridis')
    plt.title(title)
    plt.show()

# Plot heatmaps for each proximity matrix


plot_heatmap(euclidean_dist, 'Euclidean Distance Matrix')
plot_heatmap(manhattan_dist, 'Manhattan Distance Matrix')
plot_heatmap(cosine_sim, 'Cosine Similarity Matrix')
plot_heatmap(jaccard_sim, 'Jaccard Similarity Matrix')
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity, NearestNeighbors
from scipy.stats import gaussian_kde

# Load the Wine Quality Dataset


url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
wine_data = pd.read_csv(url, sep=';')

# Extract the "alcohol" feature


alcohol = wine_data['alcohol'].values

# Define a range of values for plotting density estimates


alcohol_range = np.linspace(alcohol.min(), alcohol.max(), 1000).reshape(-1, 1)

# Parzen Window Density Estimation with different bandwidths


bandwidths = [0.1, 0.5, 1.0]

plt.figure(figsize=(12, 8))
for h in bandwidths:
    kde = KernelDensity(kernel='gaussian', bandwidth=h).fit(alcohol.reshape(-1, 1))
    log_density = kde.score_samples(alcohol_range)
    plt.plot(alcohol_range, np.exp(log_density), label=f'Bandwidth h={h}')
plt.title('Parzen Window Density Estimation for "alcohol" Feature')
plt.xlabel('Alcohol')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()

# Nearest Neighbor Density Estimation with different k values


k_values = [5, 10, 20]

plt.figure(figsize=(12, 8))
for k in k_values:
    nbrs = NearestNeighbors(n_neighbors=k).fit(alcohol.reshape(-1, 1))
    distances, _ = nbrs.kneighbors(alcohol_range)
    # Volume of the 1D ball is 2*distance, so density = k / (N * 2 * r_k)
    density = k / (len(alcohol) * 2 * distances[:, -1])
    plt.plot(alcohol_range, density, label=f'k={k}')
plt.title('Nearest Neighbor Density Estimation for "alcohol" Feature')
plt.xlabel('Alcohol')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()
Hierarchical Clustering

!pip install pandas seaborn matplotlib scipy scikit-learn


# scikit-learn-extra provides KMedoids, used in the divisive clustering section below
!pip install scikit-learn-extra
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import pairwise_distances
from sklearn_extra.cluster import KMedoids  # required for the divisive (ISODATA-style) split below

# Load the dataset


url = "https://raw.githubusercontent.com/ShubhendraChoubey/Mall_Customers/main/Mall_Customers.csv"
data = pd.read_csv(url)

# Display the first few rows of the dataframe to understand the structure
print(data.head())

# Selecting relevant features


features = data[['Annual Income (k$)', 'Spending Score (1-100)']]
features = StandardScaler().fit_transform(features) # Scale the data

# ------------ Agglomerative Hierarchical Clustering ------------


def plot_dendrogram(linkage_matrix, title):
    plt.figure(figsize=(10, 7))
    dendrogram(linkage_matrix)
    plt.title(title)
    plt.xlabel('Samples')
    plt.ylabel('Distance')
    plt.show()

# Different linkage methods


linkage_methods = ['single', 'complete', 'average']
for method in linkage_methods:
    Z = linkage(features, method=method)
    plot_dendrogram(Z, title=f'Dendrogram - {method.capitalize()} Linkage')

# Agglomerative Clustering
for method in linkage_methods:
    # Euclidean distance is the default metric for AgglomerativeClustering
    cluster_model = AgglomerativeClustering(n_clusters=5, linkage=method)
    cluster_labels = cluster_model.fit_predict(features)

    # Add cluster labels to the original dataframe
    data[f'Cluster_{method}'] = cluster_labels

    plt.figure(figsize=(10, 6))
    plt.scatter(data['Annual Income (k$)'], data['Spending Score (1-100)'], c=cluster_labels, cmap='viridis')
    plt.title(f'Agglomerative Clustering - {method.capitalize()} Linkage')
    plt.xlabel('Annual Income (k$)')
    plt.ylabel('Spending Score (1-100)')
    plt.show()

# ------------ Divisive Clustering with ISODATA ------------

def isodata_clustering(features, threshold):
    # Initialize with a single cluster containing all points
    current_clusters = [features]
    clusters = []

    while current_clusters:
        cluster = current_clusters.pop(0)

        # Consider splitting only if the cluster has more than 1 point
        if len(cluster) > 1:
            # Calculate the pairwise distance matrix within the cluster
            dist = pairwise_distances(cluster)
            # Get the average pairwise distance
            mean_distance = np.mean(dist)

            # If the mean distance is greater than the threshold, split
            if mean_distance > threshold:
                # K-Medoids with n_clusters=2 performs the binary split
                kmedoids = KMedoids(n_clusters=2)
                labels = kmedoids.fit_predict(cluster)

                # Assign points to the two new clusters and revisit them later
                new_clusters = [cluster[labels == i] for i in range(2)]
                current_clusters.extend(new_clusters)
                continue

        # Cluster is small or compact enough; keep it as a final cluster
        clusters.append(cluster)

    return clusters

# Experiment with different merge/split thresholds
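A minimal sketch of such an experiment, assuming illustrative threshold values of 0.5, 1.0, and 2.0 on the scaled features (these values are not from the original notebook):

# Run the divisive routine with a few assumed thresholds and compare the partitions
for threshold in [0.5, 1.0, 2.0]:
    clusters = isodata_clustering(features, threshold)
    print(f'Threshold {threshold}: {len(clusters)} clusters')

    # Visualize each resulting cluster in the scaled feature space
    plt.figure(figsize=(10, 6))
    for i, cluster in enumerate(clusters):
        plt.scatter(cluster[:, 0], cluster[:, 1])
    plt.title(f'Divisive (ISODATA-style) Clustering - threshold={threshold}')
    plt.xlabel('Annual Income (scaled)')
    plt.ylabel('Spending Score (scaled)')
    plt.show()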
