# Topic modeling: extract the dominant topic keyword from a corpus using LDA.
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
def remove_links(text):
    """Strip every http/https URL (any `http`-prefixed non-space run) from *text*."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)
def get_topic(text, n_topics=10):
    """Return the single most representative topic keyword for a corpus.

    Fits an LDA model over the (URL-stripped) documents and returns the
    top-weighted vocabulary word of the topic that holds the highest
    document-topic probability anywhere in the corpus.

    Args:
        text: iterable of raw document strings.
        n_topics: number of LDA components to fit (default 10, matching
            the original hard-coded value).

    Returns:
        str: the top word of the globally most probable topic.

    Raises:
        ValueError: from CountVectorizer when the corpus is too small for
            ``min_df=2`` / ``max_df=0.95`` to leave any vocabulary.
    """
    cleaned = [remove_links(doc) for doc in text]

    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words="english")
    word_counts = vectorizer.fit_transform(cleaned)

    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    # fit_transform both trains the model and yields the doc-topic matrix.
    doc_topics = lda.fit_transform(word_counts)

    # Hoisted out of the loop: the vocabulary is identical for every topic.
    vocab = vectorizer.get_feature_names_out()
    # Top word per topic = highest-weight term of that topic's component row.
    # (Fixes the broken two-line statement that assigned the whole vocabulary.)
    topic_word = {
        k: vocab[int(lda.components_[k].argmax())]
        for k in range(len(lda.components_))
    }

    # Globally most probable (document, topic) cell; its column index is the
    # winning topic — equivalent to the original max-tracking double loop.
    best_doc = int(doc_topics.max(axis=1).argmax())
    best_topic = int(doc_topics[best_doc].argmax())
    return topic_word[best_topic]