Code
Code
import pandas as pd
f="C:/MP-CHEW/CHEW/cycle_2/lca.tsv"
df=pd.read_csv(f,sep="\t")#.sample(2000) #you can just sample here for testing
purposes
df["proteins"]=df["proteins"].str.split(", ")
edf=df.explode("proteins")
edf["OX"]=edf["proteins"].str.split("_").apply(lambda x: "_".join(x[-3:])) #these
are the taxonomies (GTDB taxid instead of NCBI)
import networkx as nx
from node2vec import Node2Vec
graph=nx.from_pandas_adjacency(out)
node2vec=Node2Vec(graph,dimensions=10,walk_length=5,num_walks=20,workers=4) #not
sure what parameters I should select here
model=node2vec.fit(window=10,min_count=1)
### 2D embedding
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
# Retrieve node embeddings and corresponding subjects
node_targets = model.wv.index_to_key # list of node IDs
node_embeddings = (
model.wv.vectors
) # numpy.ndarray of size number of nodes times embeddings dimensionality
### Clustering
#plot clusters
import seaborn as sns
node_colors=np.array(sns.color_palette("Spectral",n_colors=cluster_count))
[clusters]
plt.scatter( node_embeddings_2d[:, 0],
node_embeddings_2d[:, 1],
c=node_colors,)
#plot intensity
plt.scatter( node_embeddings_2d[:, 0],
node_embeddings_2d[:, 1],
c=np.log(s.loc[node_targets].values)
,cmap='Spectral',s=0.2)