BDA Experiments

The document outlines several implementations related to big data analytics, including the Apriori algorithm for association rule mining, windowing techniques with k-means clustering, and a network intrusion detection system using the Flajolet-Martin algorithm. It also details the process of scraping Twitter data for analysis and developing a system for finding frequently purchased products from market basket transaction data. Each section includes code snippets and explanations for executing the respective tasks.


BIG DATA ANALYTICS EXPERIMENTS
================================

Sachin Kuldeep
12212112
CS-B 06
1. Implementation of Apriori algorithm.
CODE:
import csv
from itertools import combinations

# Load dataset correctly
def load_transactions(file_path):
    transactions = []
    with open(file_path, "r") as file:
        reader = csv.reader(file)
        headers = next(reader)  # Read headers
        for row in reader:
            transaction = set()
            for i in range(1, len(headers)):  # Skip first column if it's an index
                value = row[i].strip().lower()
                if value in ["true", "1", "yes"]:  # Handle different truthy values
                    transaction.add(headers[i])
            transactions.append(transaction)
    return transactions

def get_frequent_itemsets(transactions, min_support):
    itemsets = {}
    # Count occurrences of single items
    for transaction in transactions:
        for item in transaction:
            itemsets[frozenset([item])] = itemsets.get(frozenset([item]), 0) + 1
    itemsets = {k: v for k, v in itemsets.items() if v >= min_support}
    k = 2
    frequent_itemsets = dict(itemsets)
    while True:
        new_itemsets = {}
        prev_itemsets = list(itemsets.keys())
        for i in range(len(prev_itemsets)):
            for j in range(i + 1, len(prev_itemsets)):
                combined = prev_itemsets[i] | prev_itemsets[j]
                if len(combined) == k:
                    count = sum(1 for transaction in transactions if combined.issubset(transaction))
                    if count >= min_support:
                        new_itemsets[combined] = count
        if not new_itemsets:
            break
        frequent_itemsets.update(new_itemsets)
        itemsets = new_itemsets
        k += 1
    return frequent_itemsets

def generate_association_rules(frequent_itemsets, min_confidence):
    rules = []
    for itemset, support in frequent_itemsets.items():
        if len(itemset) < 2:
            continue  # Rules need at least 2 items
        for i in range(1, len(itemset)):
            for left in combinations(itemset, i):
                left = frozenset(left)
                right = itemset - left
                if right:
                    left_support = frequent_itemsets.get(left, 1)
                    confidence = support / left_support
                    if confidence >= min_confidence:
                        rules.append((set(left), set(right), confidence))
    return rules

if __name__ == "__main__":
    file_path = r"C:\Users\Lenovo\OneDrive\Desktop\Image Processing Lab\Apriori_dataset2.csv"  # Adjust path as needed
    transactions = load_transactions(file_path)
    min_support = int(input("Enter minimum support count: "))
    min_confidence = float(input("Enter minimum confidence (0 to 1): "))
    frequent_itemsets = get_frequent_itemsets(transactions, min_support)
    association_rules = generate_association_rules(frequent_itemsets, min_confidence)
    print("\nAssociation Rules:")
    for left, right, confidence in association_rules:
        print(f"{left} => {right} (Confidence: {confidence:.2f})")
FIGURE:

2. Implementation of the Apriori algorithm on the DBLP co-author dataset (using mlxtend).
CODE:
import pandas as pd
from itertools import combinations
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Load dataset
file_path = "DBLP10k.csv"  # Update with actual file path
df = pd.read_csv(file_path, delimiter=";", encoding="utf-8")

# Identify the correct columns
expected_columns = ["author1", "author2"]
if not all(col in df.columns for col in expected_columns):
    raise KeyError("Dataset must contain 'author1' and 'author2' columns.")

# Group co-authors into transactions, considering all co-authors in a paper
all_transactions = []
for _, row in df.iterrows():
    authors = [row['author1'], row['author2']]
    # Remove NaN values and create a transaction
    transaction = [author.strip() for author in authors if isinstance(author, str) and author.strip()]
    if len(transaction) > 1:  # Only consider papers with multiple authors
        all_transactions.append(transaction)

# Convert transactions to a format suitable for Apriori
te = TransactionEncoder()
te_ary = te.fit(all_transactions).transform(all_transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Apply Apriori algorithm to find frequent co-author pairs
min_support = 0.001  # Adjust based on dataset size
frequent_itemsets = apriori(df_encoded, min_support=min_support, use_colnames=True)

# Generate association rules from frequent co-author pairs
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)

# Display top frequent co-author relationships
print("Frequent Coauthor Groups:")
print(frequent_itemsets.sort_values(by='support', ascending=False).head(10))
print("\nStrong Coauthor Association Rules:")
print(rules.sort_values(by=['confidence', 'lift'], ascending=[False, False]).head(10))
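Unlike the count-based threshold in experiment 1, mlxtend's apriori expects min_support as a fraction of the transactions. A minimal sketch of converting an absolute count into that fraction (min_count is an assumed, illustrative value):

# Convert an absolute support count into the fractional support expected by mlxtend.
min_count = 5  # hypothetical: require a co-author group to appear in at least 5 papers
min_support_fraction = min_count / len(all_transactions)
frequent_itemsets_by_count = apriori(df_encoded, min_support=min_support_fraction, use_colnames=True)
print(frequent_itemsets_by_count.head(10))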
FIGURE:

3. Implement windowing techniques and k-means clustering using the provided dataset.
CODE:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from mpl_toolkits.mplot3d import Axes3D

df = pd.read_csv("Data_Kopi Aroma Sweet.csv", delimiter=";")
df.info()
print(df.head())
print(df.isnull().sum())
print(df.describe())

window_sizes = [5, 10, 20]
steps = [2, 5, 10]
windowing_techniques = ['sliding', 'expanding']
num_clusters = 3  # Adjust as needed
results = []

for windowing in windowing_techniques:
    for window_size in window_sizes:
        for step in steps:
            # Extract different data windows
            if windowing == 'sliding':
                windows = [df.iloc[i:i + window_size] for i in range(0, len(df) - window_size + 1, step)]
            elif windowing == 'expanding':
                windows = [df.iloc[:i + window_size] for i in range(0, len(df) - window_size + 1, step)]
            features = np.array([window[['MQ7', 'MQ3', 'MQ8']].mean().values for window in windows])
            kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
            labels = kmeans.fit_predict(features)
            silhouette = silhouette_score(features, labels) if len(set(labels)) > 1 else None
            davies_bouldin = davies_bouldin_score(features, labels) if len(set(labels)) > 1 else None
            calinski_harabasz = calinski_harabasz_score(features, labels) if len(set(labels)) > 1 else None
            results.append((windowing, window_size, step, silhouette, davies_bouldin, calinski_harabasz))
            fig = plt.figure(figsize=(10, 8))
            ax = fig.add_subplot(111, projection='3d')
            scatter = ax.scatter(features[:, 0], features[:, 1], features[:, 2], c=labels, cmap='viridis', s=50)
            ax.set_xlabel('MQ7')
            ax.set_ylabel('MQ3')
            ax.set_zlabel('MQ8')
            ax.set_title(f'K-Means Clustering (Windowing: {windowing}, Size: {window_size}, Step: {step})')
            plt.colorbar(scatter, label='Cluster')
            plt.show()

results_df = pd.DataFrame(results, columns=['Windowing Technique', 'Window Size', 'Step', 'Silhouette Score', 'Davies-Bouldin Score', 'Calinski-Harabasz Score'])
print(results_df)
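To summarize the grid of windowing configurations above, one could rank them by a validity index. A minimal sketch using the results_df built above (higher silhouette is better; runs where the score is undefined are dropped):

# Rank configurations by silhouette score, ignoring windows with a single cluster.
ranked = results_df.dropna(subset=["Silhouette Score"]).sort_values(
    by="Silhouette Score", ascending=False
)
print("Best configuration by silhouette score:")
print(ranked.head(1))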
4. Q. Develop a system for detecting malicious activities in network traffic. For this purpose, follow the steps below:
a. Download the Network Intrusion Detection dataset from www.kaggle.com.
b. Extract the data related to the source and destination IP addresses.
c. Consider the source IP address as a data stream.
d. Apply the FM algorithm on the stream in a sliding-window fashion and detect the windows that indicate a possible attack. The hash function can be selected on your own.
e. Analyze the performance of your system in terms of accuracy of detection and accuracy of counting distinct elements.
f. Analyze the effect of window size and of different hash functions on the performance.
CODE:
import pandas as pd
import numpy as np
import hashlib
import random
from collections import deque
from sklearn.metrics import accuracy_score
import os
import matplotlib.pyplot as plt

data_path = '/content/drive/MyDrive/archive (3)'

def load_dataset(file_name):
    file_path = os.path.join(data_path, file_name)
    df = pd.read_csv(file_path)
    print("Columns in dataset:", df.columns)
    return df[['src_bytes', 'dst_bytes']]

def hash_function(value, seed):
    hash_obj = hashlib.md5((str(value) + str(seed)).encode())
    binary_hash = bin(int(hash_obj.hexdigest(), 16))[2:].zfill(128)
    return binary_hash

def trailing_zeros(binary_str):
    return len(binary_str) - len(binary_str.rstrip('0'))

def flajolet_martin(values, num_hashes=10):
    max_trailing_zeros = [0] * num_hashes
    for value in values:
        for i in range(num_hashes):
            hashed_value = hash_function(value, i)
            max_trailing_zeros[i] = max(max_trailing_zeros[i], trailing_zeros(hashed_value))
    estimates = [2 ** r for r in max_trailing_zeros]
    return int(np.median(estimates))

def sliding_window_detection(df, window_size=100, threshold=50, num_hashes=10):
    window = deque()
    detected_windows = []
    estimated_counts = []
    for index, row in df.iterrows():
        src_value = row['src_bytes']
        window.append(src_value)
        if len(window) > window_size:
            window.popleft()
        estimated_count = flajolet_martin(window, num_hashes)
        estimated_counts.append(estimated_count)
        if estimated_count > threshold:
            detected_windows.append((index, estimated_count))
    return detected_windows, estimated_counts

def evaluate_accuracy(actual_counts, estimated_counts):
    return accuracy_score(actual_counts, estimated_counts)

def analyze_performance(df, window_sizes, hash_functions, threshold=50):
    results = {}
    for w in window_sizes:
        for h_func in hash_functions:
            # Note: sliding_window_detection always uses the MD5-based hash_function above;
            # h_func is only used here to label the results and the plot.
            detected_windows, estimated_counts = sliding_window_detection(df, window_size=w, threshold=threshold, num_hashes=10)
            results[(w, h_func.__name__)] = len(detected_windows)
            plt.plot(estimated_counts, label=f'Window Size {w}, Hash {h_func.__name__}')
    plt.xlabel('Window Index')
    plt.ylabel('Estimated Distinct Elements')
    plt.legend()
    plt.title('Effect of Window Size and Hash Functions')
    plt.show()
    return results

df_train = load_dataset('Train_data.csv')
df_test = load_dataset('Test_data.csv')

detected_train, estimated_train = sliding_window_detection(df_train, window_size=100, threshold=50, num_hashes=10)
detected_test, estimated_test = sliding_window_detection(df_test, window_size=100, threshold=50, num_hashes=10)

print("Detected potential attack windows in Train Data:", detected_train)
print("Detected potential attack windows in Test Data:", detected_test)

window_sizes = [50, 100, 200]
hash_functions = [hashlib.md5, hashlib.sha1]
performance_results = analyze_performance(df_train, window_sizes, hash_functions)
print("Performance Analysis:", performance_results)
5. Q. Analyze the Twitter data. For this purpose, follow the steps below:
a. Create a developer's account on Twitter. One account is sufficient across 8 to 10 students.
b. Scrape the Twitter data using the tweepy library. It must be done using five keywords of your choice.
c. Use a word embedding such as GloVe or FastText to convert each tweet into vector form.
d. Perform K-means clustering using a cosine similarity measure.
CODE:
import tweepy
import pandas as pd
import time
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Twitter API Authentication
bearer_token = "AAAAAAAAAAAAAAAAAAAAABcI0AEAAAAAjilBsLuC%2FAY%2BmFqmEEZDw38HjXk%3D1wAiZyoCQcqAgfg8WD3Zhbpp4JZL8IddBcZMmsykKLhtJXJYKs"
client = tweepy.Client(bearer_token=bearer_token)

# Test API Connection
try:
    client.get_user(username="Twitter")  # Test authentication
    print("Authentication Successful!")
except tweepy.TweepyException as e:
    print("Authentication Failed:", e)

# Keywords for scraping tweets
keywords = ["AI", "Machine Learning", "Deep Learning", "Data Science", "NLP"]
tweets_data = []

for keyword in keywords:
    while True:
        try:
            print(f"Fetching tweets for: {keyword}")
            response = client.search_recent_tweets(query=keyword, max_results=10, tweet_fields=["text"])
            if response and response.data:  # Ensure response is not None
                for tweet in response.data:
                    tweets_data.append(tweet.text)
            else:
                print(f"No tweets found for {keyword}")
            time.sleep(10)  # Avoid rate limits
            break  # Exit loop if successful
        except tweepy.TweepyException as e:
            print(f"Error fetching tweets for {keyword}: {e}")
            if "Rate limit exceeded" in str(e):
                print("Rate limit reached! Waiting for 15 minutes...")
                time.sleep(900)  # Wait for 15 minutes
            else:
                break  # Exit loop on other errors

# Convert collected tweets to DataFrame
df = pd.DataFrame(tweets_data, columns=["Tweet"])

# Save to CSV
df.to_csv("tweets.csv", index=False)
print("Scraped Tweets Saved Successfully!")

# Load Pre-trained Word Embedding Model (Manually)
def load_glove_model(file_path):
    glove_model = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype="float32")
            glove_model[word] = vector
    return glove_model

# Load GloVe embeddings from file (ensure the correct path)
glove_model = load_glove_model("glove.6B.50d.txt")

# Function to get embeddings
def get_embedding(sentence, model):
    words = sentence.split()
    vectors = [model[word] for word in words if word in model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(50)  # 50D for GloVe

df["Vector"] = df["Tweet"].apply(lambda x: get_embedding(x, glove_model))

# Prepare data for clustering
X = np.vstack(df["Vector"].values)

# Apply K-means clustering
k = 5
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df["Cluster"] = kmeans.fit_predict(X)
print(df.head())  # Check assigned clusters

# PCA for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=df["Cluster"], cmap="viridis")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.title("Tweet Clusters")
plt.show()
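The K-means step above uses the default Euclidean distance rather than the cosine measure asked for in step d. A minimal sketch of a cosine-based variant, assuming the X matrix and df built above: L2-normalizing the embeddings first makes Euclidean K-means behave like clustering by cosine similarity, since for unit vectors ||u - v||^2 = 2(1 - cos(u, v)).

from sklearn.preprocessing import normalize

# L2-normalize the tweet vectors so Euclidean distance reflects cosine similarity.
X_unit = normalize(X, norm="l2")

kmeans_cos = KMeans(n_clusters=5, random_state=42, n_init=10)
df["Cluster_Cosine"] = kmeans_cos.fit_predict(X_unit)
print(df[["Tweet", "Cluster_Cosine"]].head())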
FIGURE:

6. Q. Develop a system for finding the frequently purchased products based on the market basket transaction data stream. For this purpose, follow the steps below:
a. Download the Groceries Market Basket Dataset from www.kaggle.com.
b. Develop an algorithm by combining the concept of data windows and the Apriori algorithm for processing the given data stream.
c. Compare the frequent itemsets generated by your algorithm deployed in the streaming environment and the traditional Apriori algorithm.
CODE:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from collections import deque

df = pd.read_csv("groceries - groceries.csv")
txns = df.iloc[:, 1:].apply(lambda x: x.dropna().tolist(), axis=1).tolist()

w_size = 100
dq = deque(maxlen=w_size)

def process_txns(txns):
    freq_sets = []
    for txn in txns:
        dq.append(txn)
        if len(dq) == w_size:
            txn_df = pd.DataFrame(dq)
            txn_enc = pd.get_dummies(txn_df.stack()).groupby(level=0).sum().astype(bool)
            freq_sets.append(apriori(txn_enc, min_support=0.01, use_colnames=True))
    return freq_sets

stream_res = process_txns(txns)

# Traditional Apriori on full dataset
full_txn_df = pd.DataFrame(txns)
full_enc = pd.get_dummies(full_txn_df.stack()).groupby(level=0).sum().astype(bool)
full_freq_sets = apriori(full_enc, min_support=0.01, use_colnames=True)

print("Streaming:")
print(stream_res[-1] if stream_res else "No frequent itemsets found in the window")
print("\nTraditional:")
print(full_freq_sets)
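Step c asks for a comparison of the two result sets. A minimal sketch, assuming stream_res and full_freq_sets from the code above (mlxtend stores each itemset as a frozenset in the "itemsets" column):

# Compare the last window's frequent itemsets against the traditional (full-dataset) ones.
stream_items = set(stream_res[-1]["itemsets"]) if stream_res else set()
full_items = set(full_freq_sets["itemsets"])

common = stream_items & full_items
only_stream = stream_items - full_items
only_full = full_items - stream_items

print("Common itemsets:", len(common))
print("Only in last streaming window:", len(only_stream))
print("Only in traditional Apriori:", len(only_full))
# Jaccard overlap between the two collections of frequent itemsets.
union = stream_items | full_items
print("Jaccard overlap:", len(common) / len(union) if union else 0.0)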
FIGURE:
7. Q. Develop a system for finding the frequently purchased products based on the market basket transaction data stream. For this purpose, follow the steps below:
a. Download the Groceries Market Basket Dataset from www.kaggle.com.
b. Develop an algorithm by combining the concept of data windows and the Apriori algorithm for processing the given data stream.
c. Compare the frequent itemsets generated by your algorithm deployed in the streaming environment and the traditional Apriori algorithm.
CODE:
import pandas as pd
from collections import defaultdict, deque
from itertools import combinations

def load_transactions(file_path):
    df = pd.read_csv(file_path)
    return df.iloc[:, 1:].apply(lambda row: row.dropna().tolist(), axis=1).tolist()

def calculate_support(transactions, itemsets, min_support):
    item_counts = defaultdict(int)
    total_transactions = len(transactions)
    for transaction in transactions:
        for itemset in itemsets:
            if set(itemset).issubset(transaction):
                item_counts[itemset] += 1
    return {item: count / total_transactions for item, count in item_counts.items() if (count / total_transactions) >= min_support}

def generate_candidates(frequent_itemsets, k):
    items = list(frequent_itemsets.keys())
    return set(combinations(set().union(*items), k))

def apriori_algorithm(transactions, min_support):
    frequent_itemsets = {}
    single_items = {(item,) for transaction in transactions for item in transaction}
    frequent_itemsets[1] = calculate_support(transactions, single_items, min_support)
    k = 2
    while frequent_itemsets.get(k - 1):
        candidates = generate_candidates(frequent_itemsets[k - 1], k)
        frequent_itemsets[k] = calculate_support(transactions, candidates, min_support)
        if not frequent_itemsets[k]:
            break
        k += 1
    result = [(itemset, support) for size, itemset_dict in frequent_itemsets.items() for itemset, support in itemset_dict.items()]
    return pd.DataFrame(result, columns=["Itemset", "Support"]).sort_values(by="Support", ascending=False)

def streaming_apriori(transactions, min_support, window_size):
    window = deque(maxlen=window_size)
    frequent_itemsets = {}
    for i, transaction in enumerate(transactions):
        window.append(transaction)
        if i >= window_size - 1:  # Start computing once the window is full
            window_transactions = list(window)
            frequent_itemsets = apriori_algorithm(window_transactions, min_support)
            print(f"Frequent itemsets at window {i - window_size + 2} to {i + 1}:")
            print(frequent_itemsets.head(5))
    return frequent_itemsets

if __name__ == "__main__":
    file_path = "groceries - groceries.csv"  # Update path as needed
    transactions = load_transactions(file_path)
    min_support = 0.02  # Adjust threshold as needed
    print("Traditional Apriori Results:")
    frequent_itemsets_traditional = apriori_algorithm(transactions, min_support)
    print(frequent_itemsets_traditional.head(10))
    print("\nStreaming Apriori Results:")
    window_size = 135  # Define the window size
    frequent_itemsets_streaming = streaming_apriori(transactions, min_support, window_size)
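As in the previous experiment, step c calls for comparing the streaming and traditional results. A minimal sketch, assuming the two DataFrames produced above (frequent_itemsets_traditional and the final-window frequent_itemsets_streaming, both with Itemset/Support columns):

# Merge the two result tables on the itemset tuples to compare supports side by side.
comparison = frequent_itemsets_traditional.merge(
    frequent_itemsets_streaming,
    on="Itemset",
    how="outer",
    suffixes=("_traditional", "_streaming"),
)
print(comparison.sort_values(by="Support_traditional", ascending=False).head(10))

# Itemsets frequent in only one of the two settings.
only_traditional = comparison[comparison["Support_streaming"].isna()]
only_streaming = comparison[comparison["Support_traditional"].isna()]
print("Frequent only in traditional Apriori:", len(only_traditional))
print("Frequent only in the last streaming window:", len(only_streaming))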
FIGURE:
8. Extract the clusters representing customer groups based on their credit card spending data. Consider the possibility that clusters may exist in different subspaces. For this purpose, follow the steps below:
a. Download the Credit Card Data from www.kaggle.com.
b. Apply the PROCLUS clustering technique for finding clusters prevailing in different subspaces.
c. Find the cluster quality in terms of the Dunn Index and the DB Index.
CODE:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import davies_bouldin_score
from scipy.spatial.distance import euclidean
from proclus import proclus  # Ensure this is correctly imported

# Load dataset
df = pd.read_csv("BankChurners.csv")

# Drop irrelevant columns
if 'CLIENTNUM' in df.columns:
    df.drop(columns=['CLIENTNUM'], inplace=True)

# Encode categorical variables
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Normalize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df)

# Apply PROCLUS clustering
num_clusters = 5  # Adjust based on dataset
initial_medoids = list(np.random.choice(len(data_scaled), num_clusters, replace=False))  # (not used below; proclus() selects its own medoids internally)
proclus_instance = proclus(data_scaled, k=num_clusters, l=5)  # Ensure correct parameters
Mcurr, Dis, cluster_labels = proclus_instance

# Evaluate Clustering Quality
def dunn_index(data, labels):
    clusters = {i: [] for i in np.unique(labels)}
    for index, label in enumerate(labels):
        clusters[label].append(index)
    intra_distances = []
    inter_distances = []
    for cluster in clusters.values():
        intra_distances.append(max([euclidean(data[i], data[j]) for i in cluster for j in cluster if i != j] or [0]))
    for i, cluster1 in enumerate(clusters.values()):
        for j, cluster2 in enumerate(clusters.values()):
            if i < j:
                inter_distances.append(min([euclidean(data[i], data[j]) for i in cluster1 for j in cluster2]))
    return min(inter_distances) / max(intra_distances)

dunn = dunn_index(data_scaled, cluster_labels)
db_index = davies_bouldin_score(data_scaled, cluster_labels)
print(f"Dunn Index: {dunn}")
print(f"Davies-Bouldin Index: {db_index}")
PROCLUS.PY (code):
# -*- encoding: utf-8 -*-
import numpy as np
from scipy.spatial.distance import pdist, squareform

def greedy(X, S, k):
    # Greedily pick k well-separated candidate medoids from the sample S.
    M = [np.random.permutation(S)[0]]
    A = np.setdiff1d(S, M)
    dists = np.zeros(len(A))
    for i in range(len(A)):
        dists[i] = np.linalg.norm(X[A[i]] - X[M[0]])
    for i in range(1, k):
        midx = np.argmax(dists)
        mi = A[midx]
        M.append(mi)
        for j in range(len(A)):
            dists[j] = min(dists[j], np.linalg.norm(X[A[j]] - X[mi]))
        A = np.delete(A, midx)
        dists = np.delete(dists, midx)
    return np.array(M)

def findDimensions(X, k, l, L, Mcurr):
    # Select, for each medoid, the dimensions in which its locality is most compact.
    N, d = X.shape
    Dis, Zis, Rem, Mselidx = [], [], [], []
    for i in range(len(Mcurr)):
        mi = Mcurr[i]
        Xij = np.abs(X[L[i]] - X[mi]).sum(axis=0) / len(L[i])
        Yi = Xij.sum() / d
        Di = []
        si = np.sqrt(((Xij - Yi) ** 2).sum() / (d - 1))
        Zij = (Xij - Yi) / si
        o = np.argsort(Zij)
        Di.append(o[0])
        Di.append(o[1])
        Dis.append(Di)
        for j in range(2, d):
            Zis.append(Zij[o[j]])
            Rem.append(o[j])
            Mselidx.append(i)
    if l != 2:
        o = np.argsort(Zis)
        nremaining = k * l - k * 2
        j = 0
        while nremaining > 0:
            midx = Mselidx[o[j]]
            Dis[midx].append(Rem[o[j]])
            j += 1
            nremaining -= 1
    return Dis

def manhattanSegmentalDist(x, y, Ds):
    return sum(np.abs(x[d] - y[d]) for d in Ds) / len(Ds)

def assignPoints(X, Mcurr, Dis):
    assigns = np.full(X.shape[0], -1)
    for i in range(X.shape[0]):
        minDist = np.inf
        best = -1
        for j in range(len(Mcurr)):
            dist = manhattanSegmentalDist(X[i], X[Mcurr[j]], Dis[j])
            if dist < minDist:
                minDist = dist
                best = Mcurr[j]
        assigns[i] = best
    return assigns

def evaluateClusters(X, assigns, Dis, Mcurr):
    upperSum = 0.0
    for i in range(len(Mcurr)):
        C = X[np.where(assigns == Mcurr[i])[0]]
        Cm = C.sum(axis=0) / C.shape[0]
        Ysum = sum(np.sum(np.abs(C[:, d] - Cm[d])) / C.shape[0] for d in Dis[i])
        wi = Ysum / len(Dis[i])
        upperSum += C.shape[0] * wi
    return upperSum / X.shape[0]

def computeBadMedoids(X, assigns, Dis, Mcurr, minDeviation):
    N, d = X.shape
    k = len(Mcurr)
    Mbad = []
    counts = [len(np.where(assigns == i)[0]) for i in Mcurr]
    cte = int(np.ceil((N / k) * minDeviation))
    Mbad.append(Mcurr[np.argsort(counts)[0]])
    for i in range(len(counts)):
        if counts[i] < cte and Mcurr[i] not in Mbad:
            Mbad.append(Mcurr[i])
    return Mbad

def proclus(X, k=2, l=3, minDeviation=0.1, A=30, B=3, niters=30, seed=1234):
    np.random.seed(seed)
    N, d = X.shape
    if B > A:
        raise Exception("B has to be smaller than A.")
    if l < 2:
        raise Exception("l must be >= 2.")
    idxs = np.arange(N)
    np.random.shuffle(idxs)
    S = idxs[:(A * k)]
    M = greedy(X, S, B * k)
    BestObjective = np.inf
    Mcurr = np.random.permutation(M)[:k]
    Mbest = None
    D = squareform(pdist(X))
    for it in range(niters):
        L = [np.where(D[mi] <= D[mi, np.setdiff1d(Mcurr, mi)].min())[0] for mi in Mcurr]
        Dis = findDimensions(X, k, l, L, Mcurr)
        assigns = assignPoints(X, Mcurr, Dis)
        ObjectiveFunction = evaluateClusters(X, assigns, Dis, Mcurr)
        badM = computeBadMedoids(X, assigns, Dis, Mcurr, minDeviation) if ObjectiveFunction < BestObjective else []
        if ObjectiveFunction < BestObjective:
            BestObjective = ObjectiveFunction
            Mbest = Mcurr.copy()
        if badM:
            Mavail = np.setdiff1d(M, Mbest)
            newSel = np.random.choice(Mavail, size=len(badM), replace=False)
            Mcurr = np.setdiff1d(Mbest, badM)
            Mcurr = np.union1d(Mcurr, newSel)
    return Mcurr, Dis, assigns
FIGURE:
