0% found this document useful (0 votes)
10 views7 pages

Ass6 (DMDS)

Uploaded by

Gayatri Joshi
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
10 views7 pages

Ass6 (DMDS)

Uploaded by

Gayatri Joshi
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 7

Ass6

Set A

1. Write a python program to implement k-means algorithm to build prediction model


(Use Credit Card Dataset CC GENERAL.csv Download from kaggle.com)

import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd

# Read the credit-card dataset from the working directory
df = pd.read_csv('CC GENERAL.csv')

# Preview the top of the table as a quick sanity check
preview = df.head()
print(preview)

# Discard every row that contains at least one missing value
df = df.dropna()

# Use every column except the customer identifier as clustering input,
# converted to a plain NumPy array for the k-means routine
features = df.drop('CUST_ID', axis=1).to_numpy()

# K-Means Clustering Implementation


def kmeans(X, k, max_iters=100):
    """Partition the rows of ``X`` into ``k`` clusters with Lloyd's k-means.

    Parameters
    ----------
    X : array-like
        Two-dimensional numeric data, one sample per row.
    k : int
        Number of clusters. Initial centroids are ``k`` distinct rows of ``X``
        drawn with ``nm.random.choice`` (seed NumPy's global RNG for
        reproducible results).
    max_iters : int, optional
        Upper bound on the number of centroid updates.

    Returns
    -------
    labels : numpy.ndarray
        Index of the nearest returned centroid for every row of ``X``.
    centroids : numpy.ndarray
        Array with one row per cluster centre.
    """
    X = nm.asarray(X, dtype=float)

    def assign(centers):
        # Broadcasting (n, 1, d) against (k, d) gives every point-to-centre
        # difference; the norm over the last axis yields an (n, k) table.
        distances = nm.linalg.norm(X[:, nm.newaxis] - centers, axis=2)
        return nm.argmin(distances, axis=1)

    # Randomly initialize the centroids from distinct rows of the data
    centroids = X[nm.random.choice(X.shape[0], k, replace=False)]
    # Assign up front so labels exist even when max_iters is 0
    labels = assign(centroids)

    for _ in range(max_iters):
        # An empty cluster keeps its previous centroid; taking the mean of
        # zero rows would turn it into NaN and poison later iterations.
        new_centroids = centroids.copy()
        for i in range(k):
            members = X[labels == i]
            if len(members):
                new_centroids[i] = members.mean(axis=0)

        # Tolerance-based check: exact float equality can miss convergence
        if nm.allclose(centroids, new_centroids):
            break

        centroids = new_centroids
        # Re-assign so the returned labels always match the returned centroids
        labels = assign(centroids)

    return labels, centroids

# Number of clusters to look for; tune this after inspecting the data
k = 4

# Cluster the prepared feature matrix
labels, centroids = kmeans(features, k)

# Store each row's cluster index next to its original columns and preview it
df['Cluster'] = labels
print(df.head())

# Visual check: only the first two feature columns are drawn, so the picture
# is a 2-D slice of the full feature space
first_feature = features[:, 0]
second_feature = features[:, 1]
mtp.scatter(first_feature, second_feature, c=labels, cmap='viridis',
            marker='o', edgecolor='k')

# Overlay the cluster centres on the same two axes
centre_x, centre_y = centroids[:, 0], centroids[:, 1]
mtp.scatter(centre_x, centre_y, c='red', s=200, alpha=0.75, marker='X')

mtp.title('K-Means Clustering of Credit Card Data')
mtp.xlabel('Feature 1')
mtp.ylabel('Feature 2')
mtp.show()

2. Write a python program to implement hierarchical Agglomerative clustering


algorithm.
(Download Customer.csv dataset from github.com).

url---> https://gist.github.com/akuks/2e9b08cebef0181b583a1dff4a97f8a1

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler

# Load the dataset from a local copy of the CSV
url = 'Customer.csv'  # Replace with the actual path or URL of the dataset
df = pd.read_csv(url)

# Display the first few rows of the dataset
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values (if any)
df.dropna(inplace=True)

# Convert DOB to age; the explicit format is day-first with a two-digit year
df['DOB'] = pd.to_datetime(df['DOB'], format='%d/%m/%y %H:%M', dayfirst=True)
now = pd.Timestamp.now()
# '%y' maps two-digit years 00-68 to 2000-2068, so an older birth year can be
# parsed as a future date. A date of birth cannot lie in the future, so such
# rows are moved back one century before the age is computed.
in_future = df['DOB'] > now
if in_future.any():
    df.loc[in_future, 'DOB'] = df.loc[in_future, 'DOB'] - pd.DateOffset(years=100)
# Approximate age in whole years (leap days are ignored)
df['Age'] = (now - df['DOB']).dt.days // 365

# Select relevant features for clustering
features = df[['Age', 'Gender']]

# One-hot encode the categorical 'Gender' feature
features_encoded = pd.get_dummies(features, columns=['Gender'], drop_first=True)

# Check the resulting DataFrame after encoding
print("Encoded Features:\n", features_encoded.head())
print("Columns after Encoding:", features_encoded.columns.tolist())

# Standardize the features so Age and the gender indicator share one scale
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_encoded)

# Perform Hierarchical Agglomerative Clustering
# (adjust the number of clusters as needed)
model = AgglomerativeClustering(n_clusters=4)
labels = model.fit_predict(features_scaled)

# Add cluster labels to the original dataframe
df['Cluster'] = labels

# Display the first few rows with cluster labels
print(df.head())

# Check the unique labels
print("Unique Cluster Labels:", df['Cluster'].unique())

# Optional: Plotting the clusters (Age against one encoded gender column).
# Column 0 is 'Age'; column 1 is the first encoded gender column.
# NOTE(review): this raises IndexError if 'Gender' has a single category,
# because drop_first=True then leaves no encoded column - confirm on the data.
gender_column = features_encoded.columns[1]
plt.figure(figsize=(10, 6))
plt.scatter(df['Age'], features_encoded[gender_column], c=labels, cmap='viridis',
            marker='o', edgecolor='k')
plt.title('Hierarchical Agglomerative Clustering of Customers')
plt.xlabel('Age')
plt.ylabel(gender_column)  # Label the y-axis with the plotted gender column
plt.colorbar(label='Cluster')
plt.grid(True)  # Add grid for better readability
plt.show()

# Optional: Dendrogram for visualizing hierarchical clustering
from scipy.cluster.hierarchy import dendrogram, linkage

plt.figure(figsize=(10, 6))
linked = linkage(features_scaled, 'ward')
dendrogram(linked,
           orientation='top',
           distance_sort='descending',
           show_leaf_counts=True)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Sample Index')
plt.ylabel('Distance')
plt.grid(True)  # Add grid for better readability
plt.show()
Set B
1. Write a python program to implement k-means algorithms on a synthetic dataset.

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Step 1: Build a synthetic dataset of Gaussian blobs
from sklearn.datasets import make_blobs

# Blob parameters: 300 two-dimensional points around 3 centres, fixed seed
n_samples, n_features, n_clusters, random_state = 300, 2, 3, 42

X, y = make_blobs(
    n_samples=n_samples,
    centers=n_clusters,
    n_features=n_features,
    random_state=random_state,
)

# Wrap the coordinates in a DataFrame with one named column per feature
data = pd.DataFrame({'Feature_1': X[:, 0], 'Feature_2': X[:, 1]})

# Step 2: Plot the raw, unlabelled points
plt.figure(figsize=(10, 6))
first_axis = data['Feature_1']
second_axis = data['Feature_2']
plt.scatter(first_axis, second_axis, s=30)
plt.title('Synthetic Dataset')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.grid()
plt.show()

# Step 3: Implement K-Means algorithm


def k_means(X, n_clusters, n_iterations=100):
    """Cluster the rows of a DataFrame into ``n_clusters`` groups with k-means.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric data, one sample per row.
    n_clusters : int
        Number of clusters. Initial centroids are ``n_clusters`` rows drawn
        with ``X.sample`` (seed NumPy's global RNG for reproducible results).
    n_iterations : int, optional
        Upper bound on the number of centroid updates.

    Returns
    -------
    labels : numpy.ndarray
        Index of the nearest returned centroid for every row of ``X``.
    centroids : numpy.ndarray
        Array with one row per cluster centre.
    """
    # Convert once instead of on every iteration
    points = X.to_numpy(dtype=float)

    def assign(centers):
        # Broadcasting (n, 1, d) against (k, d) gives every point-to-centre
        # difference; the norm over the last axis yields an (n, k) table.
        distances = np.linalg.norm(points[:, np.newaxis] - centers, axis=2)
        return np.argmin(distances, axis=1)

    # Step 3.1: Randomly initialize the centroids from rows of the data
    centroids = X.sample(n_clusters).to_numpy(dtype=float)

    # Step 3.2: Assign clusters based on closest centroid (done up front so
    # labels exist even when n_iterations is 0)
    labels = assign(centroids)

    for _ in range(n_iterations):
        # Step 3.3: Update centroids based on mean of assigned points.
        # An empty cluster keeps its previous centroid; the mean of zero rows
        # would be NaN and poison later iterations.
        new_centroids = centroids.copy()
        for cluster in range(n_clusters):
            members = points[labels == cluster]
            if len(members):
                new_centroids[cluster] = members.mean(axis=0)

        # Step 3.4: Check for convergence with a tolerance; exact float
        # equality can miss it
        if np.allclose(centroids, new_centroids):
            break

        centroids = new_centroids
        # Re-assign so the returned labels always match the returned centroids
        labels = assign(centroids)

    return labels, centroids

# Step 4: Cluster the synthetic points
labels, centroids = k_means(data, n_clusters)

# Step 5: Draw the points coloured by cluster and overlay the centroids
plt.figure(figsize=(10, 6))

point_x = data['Feature_1']
point_y = data['Feature_2']
plt.scatter(point_x, point_y, c=labels, s=30, cmap='viridis',
            marker='o', edgecolor='k')

# Centroids are drawn as large red crosses on top of the points
centre_x, centre_y = centroids[:, 0], centroids[:, 1]
plt.scatter(centre_x, centre_y, c='red', s=200, alpha=0.75, marker='X')

plt.title('K-Means Clustering Results')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.grid()
plt.show()

2. Write a python program to implement hierarchical clustering algorithm. (Download

Wholesale customers data dataset from github.com).

url----> https://github.com/TrainingByPackt/Data-Science-with-Python/blob/master/Chapter01/Data/Wholesale%20customers%20data.csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Step 1: Read the wholesale-customers CSV
# (point this at wherever the downloaded file lives)
url = 'wholesale-customer.csv'
df = pd.read_csv(url)

# Step 2: Keep only the columns that take part in the distance computation;
# 'Channel' and 'Region' are left out
data = df.drop(columns=['Channel', 'Region'])

# Step 3: Build the linkage matrix with Ward's method
Z = linkage(data, method='ward')

# Step 4: Draw a dendrogram truncated to the top three levels
plt.figure(figsize=(12, 8))
dendrogram(Z, truncate_mode='level', p=3)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Customers')
plt.ylabel('Distance')
plt.grid()
plt.show()

# Step 5: Cut the tree into at most n_clusters flat clusters
n_clusters = 3
clusters = fcluster(Z, t=n_clusters, criterion='maxclust')

# Step 6: Record each customer's cluster on the original DataFrame
df['Cluster'] = clusters

# Step 7: Scatter two of the features ('Fresh' against 'Milk'), coloured by cluster
plt.figure(figsize=(10, 6))
fresh_spend = df['Fresh']
milk_spend = df['Milk']
plt.scatter(fresh_spend, milk_spend, c=df['Cluster'], cmap='viridis', s=100)
plt.title('Hierarchical Clustering of Wholesale Customers')
plt.xlabel('Fresh Products')
plt.ylabel('Milk Products')
plt.grid()
plt.colorbar(label='Cluster')
plt.show()

Set C
1. Write a python program to implement Agglomerative clustering on a synthetic
dataset.
(use inbuilt Iris data set).

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

# Step 1: Load Iris; X holds the measurements, y the true species labels
iris = load_iris()
X, y = iris.data, iris.target

# Step 2: Agglomerative clustering into a fixed number of groups
n_clusters = 3
agg_clustering = AgglomerativeClustering(n_clusters=n_clusters)
clusters = agg_clustering.fit_predict(X)

# Step 3: Project the features onto two principal components for plotting
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Step 4: Draw the same projection twice - first coloured by the predicted
# clusters, then by the true labels for comparison
panels = (
    (clusters, 'Agglomerative Clustering on Iris Dataset', 'Cluster Label'),
    (y, 'True Labels of Iris Dataset', 'True Label'),
)
for point_colors, heading, bar_label in panels:
    plt.figure(figsize=(10, 6))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=point_colors, cmap='viridis',
                marker='o', edgecolor='k', s=100)
    plt.title(heading)
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.grid()
    plt.colorbar(label=bar_label)
    plt.show()

You might also like