0% found this document useful (0 votes)

10 views7 pages

Ass6 (DMDS)

Uploaded by

Gayatri Joshi

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

10 views7 pages

Ass6 (DMDS)

Uploaded by

Gayatri Joshi

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF, TXT or read online on Scribd

You are on page 1/ 7

Ass6

Set A

1. Write a python program to implement k-means algorithm to build prediction model

(Use Credit Card Dataset CC GENERAL.csv Download from kaggle.com)

import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd

# Read the credit-card customer records from the local CSV file.
df = pd.read_csv('CC GENERAL.csv')

# Preview the leading rows to sanity-check the load.
print(df.head())

# Discard, in place, every row that has at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

# Cluster on every column except the customer identifier, as a NumPy array.
features = df.drop(columns=['CUST_ID']).to_numpy()

# K-Means Clustering Implementation

def kmeans(X, k, max_iters=100):
    """Cluster the rows of ``X`` into ``k`` groups with k-means.

    Args:
        X: 2-D array-like of shape (n_samples, n_features) with numeric data.
        k: Number of clusters; must satisfy ``1 <= k <= n_samples``.
        max_iters: Maximum number of centroid updates to perform.

    Returns:
        A ``(labels, centroids)`` tuple. ``labels[i]`` is the index of the
        centroid closest to ``X[i]``; ``centroids`` has shape
        (k, n_features). The labels always correspond to the returned
        centroids.

    Raises:
        ValueError: If ``k`` is not between 1 and the number of samples.
    """
    X = nm.asarray(X, dtype=float)
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(
            "k must be between 1 and {}, got {}".format(n_samples, k)
        )

    # Use k distinct data points as the starting centroids.
    centroids = X[nm.random.choice(n_samples, k, replace=False)]

    # Assign up front so `labels` is defined even when max_iters == 0.
    distances = nm.linalg.norm(X[:, nm.newaxis] - centroids, axis=2)
    labels = nm.argmin(distances, axis=1)

    for _ in range(max_iters):
        # Recompute each centroid as the mean of its members. A cluster with
        # no members keeps its previous centroid; taking the mean of an empty
        # slice would produce NaN.
        new_centroids = centroids.copy()
        for i in range(k):
            members = X[labels == i]
            if len(members):
                new_centroids[i] = members.mean(axis=0)

        # Stop once the centroids no longer move (within float tolerance).
        if nm.allclose(centroids, new_centroids):
            break

        centroids = new_centroids

        # Re-assign every point to its nearest (updated) centroid.
        distances = nm.linalg.norm(X[:, nm.newaxis] - centroids, axis=2)
        labels = nm.argmin(distances, axis=1)

    return labels, centroids

# Number of clusters to fit; tune this after inspecting the data.
k = 4

# Fit the model: one label per row plus the cluster centres.
labels, centroids = kmeans(features, k)

# Store each row's cluster assignment next to the original columns.
df['Cluster'] = labels

# Preview the labelled data.
print(df.head())

# Optional: visualise the result using only the first two feature columns.
mtp.scatter(
    features[:, 0],
    features[:, 1],
    c=labels,
    cmap='viridis',
    edgecolor='k',
    marker='o',
)
# Overlay the centroids as large red crosses.
mtp.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker='X',
    s=200,
    c='red',
    alpha=0.75,
)
mtp.xlabel('Feature 1')
mtp.ylabel('Feature 2')
mtp.title('K-Means Clustering of Credit Card Data')
mtp.show()

2. Write a python program to implement hierarchical Agglomerative clustering

algorithm.
(Download Customer.csv dataset from github.com).

url---> https://gist.github.com/akuks/2e9b08cebef0181b583a1dff4a97f8a1

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler

# Load the dataset from a local copy of the CSV
# (point `url` at the downloaded file or at a raw-CSV URL).
url = 'Customer.csv'
df = pd.read_csv(url)

# Display the first few rows of the dataset.
print(df.head())

# Report how many values are missing in each column.
print(df.isnull().sum())

# Drop rows with missing values (if any).
df.dropna(inplace=True)

# Convert DOB to datetimes using an explicit day-first format with a
# two-digit year.
df['DOB'] = pd.to_datetime(df['DOB'], format='%d/%m/%y %H:%M', dayfirst=True)

# `%y` maps two-digit years 00-68 to 2000-2068, so older birth years can be
# parsed into the future and would produce negative ages. A date of birth
# can never lie in the future, so shift such dates back by one century.
now = pd.Timestamp.now()
in_future = df['DOB'] > now
df.loc[in_future, 'DOB'] = df.loc[in_future, 'DOB'] - pd.DateOffset(years=100)

# Approximate age in whole years (365-day years; leap days are ignored).
df['Age'] = (now - df['DOB']).dt.days // 365

# Select relevant features for clustering.
features = df[['Age', 'Gender']]

# One-hot encode the categorical 'Gender' feature; drop_first=True drops
# the first dummy level to avoid a redundant column.
features_encoded = pd.get_dummies(features, columns=['Gender'], drop_first=True)

# Check the resulting DataFrame after encoding.
print("Encoded Features:\n", features_encoded.head())
print("Columns after Encoding:", features_encoded.columns.tolist())

# Standardize the features so both columns are on a comparable scale.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_encoded)

# Perform hierarchical agglomerative clustering.
# Adjust the number of clusters as needed.
model = AgglomerativeClustering(n_clusters=4)
labels = model.fit_predict(features_scaled)

# Add cluster labels to the original dataframe.
df['Cluster'] = labels

# Display the first few rows with cluster labels.
print(df.head())

# Check the unique labels.
print("Unique Cluster Labels:", df['Cluster'].unique())

# Optional: plot the clusters using Age and one gender column.
# Assumes column 0 of the encoded frame is 'Age', so column 1 is the
# gender indicator produced by the one-hot encoding.
gender_column = features_encoded.columns[1]
plt.figure(figsize=(10, 6))
# Cast the indicator to int so boolean dummy columns plot as 0/1.
plt.scatter(df['Age'], features_encoded[gender_column].astype(int), c=labels,
            cmap='viridis', marker='o', edgecolor='k')
plt.title('Hierarchical Agglomerative Clustering of Customers')
plt.xlabel('Age')
plt.ylabel(gender_column)  # Label the y-axis with the plotted column's name
plt.colorbar(label='Cluster')
plt.grid(True)  # Add grid for better readability
plt.show()

# Optional: draw a dendrogram of the hierarchical merge tree.
from scipy.cluster.hierarchy import dendrogram, linkage

# Build the linkage matrix from the standardized features using Ward linkage.
linked = linkage(features_scaled, method='ward')

plt.figure(figsize=(10, 6))
dendrogram(
    linked,
    show_leaf_counts=True,
    distance_sort='descending',
    orientation='top',
)
plt.xlabel('Sample Index')
plt.ylabel('Distance')
plt.title('Dendrogram for Hierarchical Clustering')
plt.grid(True)  # Grid lines make the merge heights easier to read
plt.show()
Set B
1. Write a python program to implement k-means algorithms on a synthetic dataset.

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Step 1: build a synthetic dataset of Gaussian blobs.
from sklearn.datasets import make_blobs

# Dataset size, dimensionality, number of blob centres, and the RNG seed.
n_samples, n_features, n_clusters, random_state = 300, 2, 3, 42

X, y = make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    centers=n_clusters,
    random_state=random_state,
)

# Wrap the points in a DataFrame so columns can be addressed by name.
data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2'])

# Step 2: plot the raw, unlabelled points.
plt.figure(figsize=(10, 6))
plt.scatter(data['Feature_1'], data['Feature_2'], s=30)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Synthetic Dataset')
plt.grid()
plt.show()

# Step 3: Implement K-Means algorithm

def k_means(X, n_clusters, n_iterations=100, random_state=None):
    """Cluster the rows of a DataFrame into ``n_clusters`` groups with k-means.

    Args:
        X: pandas DataFrame of numeric features, one row per sample.
        n_clusters: Number of clusters; must satisfy
            ``1 <= n_clusters <= len(X)``.
        n_iterations: Maximum number of centroid updates to perform.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            initial centroids (and hence the result) are reproducible.

    Returns:
        A ``(labels, centroids)`` tuple. ``labels[i]`` is the index of the
        centroid closest to row ``i``; ``centroids`` is an array of shape
        (n_clusters, n_features). The labels always correspond to the
        returned centroids.

    Raises:
        ValueError: If ``n_clusters`` is not between 1 and the number of rows.
    """
    # Convert once; the original re-converted the frame on every iteration.
    points = X.to_numpy(dtype=float)
    if not 1 <= n_clusters <= len(points):
        raise ValueError(
            "n_clusters must be between 1 and {}, got {}".format(
                len(points), n_clusters
            )
        )

    # Step 3.1: pick distinct rows at random as the initial centroids.
    centroids = X.sample(n_clusters, random_state=random_state).to_numpy(dtype=float)

    # Step 3.2: assign each point to its closest centroid. Done up front so
    # `labels` is defined even when n_iterations == 0.
    distances = np.linalg.norm(points[:, np.newaxis] - centroids, axis=2)
    labels = np.argmin(distances, axis=1)

    for _ in range(n_iterations):
        # Step 3.3: move each centroid to the mean of its assigned points.
        # A cluster with no points keeps its previous centroid; the mean of
        # an empty slice would be NaN.
        new_centroids = centroids.copy()
        for idx in range(n_clusters):
            members = points[labels == idx]
            if len(members):
                new_centroids[idx] = members.mean(axis=0)

        # Step 3.4: converged once the centroids stop moving.
        if np.allclose(centroids, new_centroids):
            break

        centroids = new_centroids
        distances = np.linalg.norm(points[:, np.newaxis] - centroids, axis=2)
        labels = np.argmin(distances, axis=1)

    return labels, centroids

# Step 4: cluster the synthetic points.
labels, centroids = k_means(data, n_clusters)

# Step 5: colour each point by its assigned cluster and mark the centroids.
plt.figure(figsize=(10, 6))
plt.scatter(
    data['Feature_1'],
    data['Feature_2'],
    s=30,
    c=labels,
    cmap='viridis',
    edgecolor='k',
    marker='o',
)
# Centroids are drawn as large red crosses on top of the points.
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker='X',
    s=200,
    c='red',
    alpha=0.75,
)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('K-Means Clustering Results')
plt.grid()
plt.show()

2. Write a python program to implement hierarchical clustering algorithm. (Download

Wholesale customers data dataset from github.com).

url----> https://github.com/TrainingByPackt/Data-Science-with-Python/blob/master/Chapter01/Data/Wholesale%20customers%20data.csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Step 1: read the wholesale customers data from a local CSV file
# (change the file name below if your copy is stored elsewhere).
url = 'wholesale-customer.csv'
df = pd.read_csv(url)

# Step 2: leave out the Channel and Region columns; the remaining columns
# are the ones that get clustered.
data = df.drop(columns=['Channel', 'Region'])

# Step 3: build the linkage matrix using Ward linkage.
Z = linkage(data, 'ward')

# Step 4: draw the dendrogram, truncated to the top three levels.
plt.figure(figsize=(12, 8))
dendrogram(Z, p=3, truncate_mode='level')
plt.xlabel('Customers')
plt.ylabel('Distance')
plt.title('Dendrogram for Hierarchical Clustering')
plt.grid()
plt.show()

# Step 5: cut the tree into at most `n_clusters` flat clusters.
n_clusters = 3
clusters = fcluster(Z, t=n_clusters, criterion='maxclust')

# Step 6: store the cluster id of each row on the original DataFrame.
df['Cluster'] = clusters

# Step 7: show the clusters on two of the features (Fresh vs. Milk).
plt.figure(figsize=(10, 6))
plt.scatter(df['Fresh'], df['Milk'], s=100, c=df['Cluster'], cmap='viridis')
plt.xlabel('Fresh Products')
plt.ylabel('Milk Products')
plt.title('Hierarchical Clustering of Wholesale Customers')
plt.grid()
plt.colorbar(label='Cluster')
plt.show()

Set C
1. Write a python program to implement Agglomerative clustering on a synthetic
dataset.
(use inbuilt Iris data set).

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

# Step 1: load the Iris dataset; X holds the features and y the true
# labels (kept only for comparison).
iris = load_iris()
X, y = iris.data, iris.target

# Step 2: agglomerative clustering with a chosen number of clusters.
n_clusters = 3
agg_clustering = AgglomerativeClustering(n_clusters=n_clusters)
clusters = agg_clustering.fit_predict(X)

# Step 3: project the features onto two principal components for plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Step 4: draw the same 2-D projection twice -- first coloured by the
# predicted clusters, then (optionally, for comparison) by the true labels.
for colours, heading, bar_label in (
    (clusters, 'Agglomerative Clustering on Iris Dataset', 'Cluster Label'),
    (y, 'True Labels of Iris Dataset', 'True Label'),
):
    plt.figure(figsize=(10, 6))
    plt.scatter(
        X_pca[:, 0],
        X_pca[:, 1],
        c=colours,
        s=100,
        cmap='viridis',
        edgecolor='k',
        marker='o',
    )
    plt.title(heading)
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.grid()
    plt.colorbar(label=bar_label)
    plt.show()

1.1 Read The Data and Do Exploratory Data Analysis. Describe The Data Briefly
100% (19)
1.1 Read The Data and Do Exploratory Data Analysis. Describe The Data Briefly
50 pages
Machine Learning - Project
80% (10)
Machine Learning - Project
14 pages
Data Mining Business Report Hansraj Yadav
83% (12)
Data Mining Business Report Hansraj Yadav
34 pages
MSC Management Dissertation Examples
100% (2)
MSC Management Dissertation Examples
8 pages
Module 6 - NC II - Presenting Relevant Information - Final
No ratings yet
Module 6 - NC II - Presenting Relevant Information - Final
57 pages
Election Prediction Projectfinal
No ratings yet
Election Prediction Projectfinal
30 pages
Jupyter Notebook Project DM Nikita Chaturvedi 25.07.2021
100% (5)
Jupyter Notebook Project DM Nikita Chaturvedi 25.07.2021
83 pages
Lesson 1 Introduction To Quantitative Research
100% (2)
Lesson 1 Introduction To Quantitative Research
25 pages
JTSS 7th June 2017
0% (1)
JTSS 7th June 2017
108 pages
SOWBHAGYA - Interim Report - Research Project Report Template
No ratings yet
SOWBHAGYA - Interim Report - Research Project Report Template
24 pages
Data Mining - Assignment: Girish Nayak
100% (1)
Data Mining - Assignment: Girish Nayak
21 pages
DBMS-II Practical Solutions
No ratings yet
DBMS-II Practical Solutions
71 pages
(Ebook PDF) Accounting Information Systems, Global Edition 15th Edition Instant Download
100% (1)
(Ebook PDF) Accounting Information Systems, Global Edition 15th Edition Instant Download
44 pages
Data Mining Assignment: Sudhanva Saralaya
100% (1)
Data Mining Assignment: Sudhanva Saralaya
16 pages
ML Solution
No ratings yet
ML Solution
60 pages
Data Mining Project Shivani Pandey
100% (1)
Data Mining Project Shivani Pandey
40 pages
Project Explanation
No ratings yet
Project Explanation
17 pages
Project Report - Data Mining
0% (1)
Project Report - Data Mining
52 pages
PATHFINDER-DNA 2025 -125_250720_103922
No ratings yet
PATHFINDER-DNA 2025 -125_250720_103922
9 pages
Bone Suplement Market Segmentation
No ratings yet
Bone Suplement Market Segmentation
20 pages
Operating Systems Credit Activity
No ratings yet
Operating Systems Credit Activity
5 pages
ML0101EN Clus K Means Customer Seg Py v1
100% (1)
ML0101EN Clus K Means Customer Seg Py v1
8 pages
MLLab Manual
No ratings yet
MLLab Manual
24 pages
Sales Data Clustering
No ratings yet
Sales Data Clustering
15 pages
Customer Segmentation Report
No ratings yet
Customer Segmentation Report
8 pages
21AI71 Module 5 Textbook
No ratings yet
21AI71 Module 5 Textbook
25 pages
23CC554
No ratings yet
23CC554
10 pages
(Ebook PDF) Accounting Information Systems, Global Edition 15th Edition Download
No ratings yet
(Ebook PDF) Accounting Information Systems, Global Edition 15th Edition Download
54 pages
Set 2
No ratings yet
Set 2
19 pages
MDG P1
No ratings yet
MDG P1
17 pages
Java Summary
No ratings yet
Java Summary
2 pages
Financial Fraud Detection System
No ratings yet
Financial Fraud Detection System
16 pages
Hierar Scale4
No ratings yet
Hierar Scale4
51 pages
Ilovepdf Merged
No ratings yet
Ilovepdf Merged
3 pages
Artificial Intelligence Report
No ratings yet
Artificial Intelligence Report
23 pages
Assignment 3
No ratings yet
Assignment 3
1 page
Economterics Final 2024.
No ratings yet
Economterics Final 2024.
32 pages
Reading Data: #Importing Required Libraries
No ratings yet
Reading Data: #Importing Required Libraries
16 pages
ML Assignment
No ratings yet
ML Assignment
11 pages
Experiment-7: Implementation of K-Means Clustering Algorithm
No ratings yet
Experiment-7: Implementation of K-Means Clustering Algorithm
3 pages
Untitled Document-2-1-13-7-11.4
No ratings yet
Untitled Document-2-1-13-7-11.4
5 pages
DWDM Lab All
No ratings yet
DWDM Lab All
20 pages
Prac7 8 9 10
No ratings yet
Prac7 8 9 10
12 pages
S6 - Data Mining Lab Experiments (Except 1)
No ratings yet
S6 - Data Mining Lab Experiments (Except 1)
6 pages
Clustering Algorithms SciKit Learn 1705740354
No ratings yet
Clustering Algorithms SciKit Learn 1705740354
22 pages
Assignment ....
No ratings yet
Assignment ....
8 pages
Lesson 6 - Unsupervised Learning
No ratings yet
Lesson 6 - Unsupervised Learning
63 pages
Marketing Analytics Week-10 LAQ
No ratings yet
Marketing Analytics Week-10 LAQ
5 pages
Trinetra Banerjee
No ratings yet
Trinetra Banerjee
9 pages
Aiml Unit 3 4
No ratings yet
Aiml Unit 3 4
19 pages
Methods of Center Measurement: X N X X X
No ratings yet
Methods of Center Measurement: X N X X X
85 pages
LP I Assignment A4 Clustering
No ratings yet
LP I Assignment A4 Clustering
13 pages
Name: Aditya Parade Roll No: 281047 PRN: 22311577 Batch: A-2 Assignment 5
No ratings yet
Name: Aditya Parade Roll No: 281047 PRN: 22311577 Batch: A-2 Assignment 5
3 pages
Data Mining Ex1
No ratings yet
Data Mining Ex1
10 pages
Joseph Xavier J - FML
No ratings yet
Joseph Xavier J - FML
15 pages
BAFBAN1 - Week 01, Presentation Deck
No ratings yet
BAFBAN1 - Week 01, Presentation Deck
60 pages
Experiment 4 1
No ratings yet
Experiment 4 1
4 pages
Anreg - StatG - (Fara, Nada, Hanan, Rey)
No ratings yet
Anreg - StatG - (Fara, Nada, Hanan, Rey)
12 pages
Compute2
No ratings yet
Compute2
10 pages
21MIC0107 Da4
No ratings yet
21MIC0107 Da4
4 pages
ML Exp5 C36
No ratings yet
ML Exp5 C36
18 pages
Zara
No ratings yet
Zara
47 pages
Kmeansclustering Sales Dataset
No ratings yet
Kmeansclustering Sales Dataset
6 pages
Hierarchical Clustering Mall Data
No ratings yet
Hierarchical Clustering Mall Data
2 pages
Clustering Algorithms CheatSheet 1710438661
No ratings yet
Clustering Algorithms CheatSheet 1710438661
6 pages
Practical 5
No ratings yet
Practical 5
6 pages
Kman 07
No ratings yet
Kman 07
9 pages
Intro Qugates
No ratings yet
Intro Qugates
4 pages
Literature Review 24MSP3077
No ratings yet
Literature Review 24MSP3077
4 pages
Data Mining
No ratings yet
Data Mining
27 pages
Python
No ratings yet
Python
5 pages
Implement Clustering Algorithms For Unsupervised Classification
No ratings yet
Implement Clustering Algorithms For Unsupervised Classification
4 pages
4 Clustering With K-Means - Kaggle
No ratings yet
4 Clustering With K-Means - Kaggle
9 pages
Customer Segmentation With K-Means and RMF
No ratings yet
Customer Segmentation With K-Means and RMF
13 pages
Assignment
No ratings yet
Assignment
7 pages
Inductive Approach
No ratings yet
Inductive Approach
4 pages
BUS - 5030 - Milestone - 2 - Worksheet (2) (1) (Repaired)
No ratings yet
BUS - 5030 - Milestone - 2 - Worksheet (2) (1) (Repaired)
12 pages
FMLASS3Q7 - Jupyter Notebook
No ratings yet
FMLASS3Q7 - Jupyter Notebook
6 pages
Lecture - 7 - Practical - DBSCAN Clustering in Python
No ratings yet
Lecture - 7 - Practical - DBSCAN Clustering in Python
3 pages
Atelier N5 PDF
No ratings yet
Atelier N5 PDF
5 pages
A Guide To The Project Management Body of Knowledg... - (APPENDIX X6 TOOLS AND TECHNIQUES)
No ratings yet
A Guide To The Project Management Body of Knowledg... - (APPENDIX X6 TOOLS AND TECHNIQUES)
10 pages
1A. Step 1: POM-QM For Windows
No ratings yet
1A. Step 1: POM-QM For Windows
9 pages
M4 Data Mining W4 Business Report
No ratings yet
M4 Data Mining W4 Business Report
22 pages
Subject: ML Name: Priyanshu Gandhi Date: 10/4/21 Expt. No.: 9 Roll No.: C008 Title: Clustering Implementation in Python
No ratings yet
Subject: ML Name: Priyanshu Gandhi Date: 10/4/21 Expt. No.: 9 Roll No.: C008 Title: Clustering Implementation in Python
7 pages
Unsupervisd Learning Algorithm
No ratings yet
Unsupervisd Learning Algorithm
6 pages
Assignment 2 QTB
No ratings yet
Assignment 2 QTB
5 pages
What Is Descriptive Analytics
No ratings yet
What Is Descriptive Analytics
4 pages
Linear Correlation (Pearson) : Assumptions
No ratings yet
Linear Correlation (Pearson) : Assumptions
2 pages
Business Statistics: Correlation Study Alumni Giving Case
No ratings yet
Business Statistics: Correlation Study Alumni Giving Case
4 pages
Estimating A VAR - Gretl
No ratings yet
Estimating A VAR - Gretl
9 pages
Handling Non-Normal Data in Structural Equation Modeling
No ratings yet
Handling Non-Normal Data in Structural Equation Modeling
5 pages
Qlik Yes and Dont
No ratings yet
Qlik Yes and Dont
1 page
Agilent Cary 8454 UV-Visible Spectroscopy System: Good Laboratory Practice
No ratings yet
Agilent Cary 8454 UV-Visible Spectroscopy System: Good Laboratory Practice
12 pages

Ass6 (DMDS)

Uploaded by

Ass6 (DMDS)

Uploaded by

Ass6

1. Write a python program to implement k-means algorithm to build prediction model

# Load the dataset

# Display the first few rows of the dataset

# K-Means Clustering Implementation

# Assign clusters based on closest centroid

# Calculate new centroids

# If centroids do not change, break

return labels, centroids

# Specify the number of clusters

# Add cluster labels to the dataframe

# Optional: Plotting clusters (use only 2 features for visualization)

2. Write a python program to implement hierarchical Agglomerative clustering

# Load the dataset directly from the URL

# Display the first few rows of the dataset

# Check for missing values

# Drop rows with missing values (if any)

# Convert DOB to age - specify the date format

# One-hot encode the categorical 'Gender' feature

# Check the resulting DataFrame after encoding

# Standardize the features

# Perform Hierarchical Agglomerative Clustering

# Add cluster labels to the original dataframe

# Display the first few rows with cluster labels

# Check the unique labels

# Optional: Dendrogram for visualizing hierarchical clustering

# Step 1: Generate a synthetic dataset

# Generate synthetic data with 3 clusters

X, y = make_blobs(n_samples=n_samples, centers=n_clusters, n_features=n_features,

# Convert to a DataFrame for easier manipulation

# Step 2: Visualize the synthetic dataset

# Step 3: Implement K-Means algorithm

# Step 3.3: Update centroids based on mean of assigned points

# Step 3.4: Check for convergence

return labels, centroids

# Step 4: Run K-Means algorithm

# Step 5: Visualize the clustering results

2. Write a python program to implement hierarchical clustering algorithm. (Download

Wholesale customers data dataset from github.com).

# Step 1: Load the dataset

# Step 2: Preprocessing the data

# Step 3: Perform Hierarchical Clustering

# Step 4: Plotting the Dendrogram

# Step 5: Form clusters

# Step 6: Add cluster labels to the original DataFrame

# Step 7: Visualize the clusters

# Step 1: Load the Iris dataset

# Step 2: Perform Agglomerative Clustering

# Step 3: Reduce dimensions for visualization (using PCA)

# Step 4: Plot the clusters

# Optional: Compare with true labels

You might also like