-
Notifications
You must be signed in to change notification settings - Fork 0
/
LDA_tsne.py
98 lines (83 loc) · 3.59 KB
/
LDA_tsne.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pickle

import gensim
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
from bokeh.io import output_notebook
from bokeh.models import Label
from bokeh.plotting import figure, output_file, show
from gensim import corpora
from matplotlib import cm
from matplotlib.colors import to_hex
from sklearn.manifold import TSNE
# %matplotlib inline
def get_top_n_words(n, keys, document_term_matrix, count_vectorizer, num_topics=None):
    '''
    Return, for every topic, a string with the n most frequent words among
    the documents assigned to that topic, most frequent first.

    n                    -- number of words to keep per topic
    keys                 -- sequence giving the predicted topic id of each document
    document_term_matrix -- 2-D count matrix whose row i belongs to keys[i];
                            it must support selecting rows with an integer
                            array and .sum(axis=0)
                            (assumed to be a scipy CSR matrix -- TODO confirm)
    count_vectorizer     -- object whose inverse_transform() maps a one-hot
                            (1, n_terms) row back to the matching word
    num_topics           -- how many topic ids to report; when omitted the
                            module-level n_topics is used

    The returned list always has num_topics entries so that position t
    describes topic t.  A topic without any document yields "".
    '''
    if num_topics is None:
        # Backward compatible default: the topic count defined by the script.
        num_topics = n_topics

    topic_of_doc = np.asarray(keys)
    vocabulary_size = document_term_matrix.shape[1]

    top_words = []
    for topic in range(num_topics):
        member_rows = np.flatnonzero(topic_of_doc == topic)
        if member_rows.size == 0:
            # Nothing to count; keep the slot so indices stay aligned
            # with topic ids instead of crashing.
            top_words.append("")
            continue

        # Sum the term counts of all member documents in one operation.
        # np.asarray(...).ravel() flattens the (1, n_terms) matrix that
        # sparse matrices return from sum(axis=0).
        term_counts = np.asarray(
            document_term_matrix[member_rows].sum(axis=0)
        ).ravel()
        # argsort is ascending: take the last n and reverse them.
        top_indices = np.flip(np.argsort(term_counts)[-n:], 0)

        topic_words = []
        for index in top_indices:
            one_hot = np.zeros((1, vocabulary_size))
            one_hot[0, index] = 1
            the_word = count_vectorizer.inverse_transform(one_hot)[0][0]
            # Keep the word as text; an ASCII round-trip would raise on
            # any non-ASCII vocabulary entry.
            topic_words.append(str(the_word))
        top_words.append(" ".join(topic_words))
    return top_words
def get_mean_topic_vectors(keys, two_dim_vectors, num_topics=None):
    '''
    Return a list with one centroid vector per topic.

    keys            -- sequence giving the predicted topic id of each document
    two_dim_vectors -- array-like with one row per document, where row i
                       belongs to keys[i]
    num_topics      -- how many topic ids to report; when omitted the
                       module-level n_topics is used

    Entry t of the result is the mean of all rows assigned to topic t.
    A topic without any document gets a vector filled with NaN, so the
    list keeps one entry per topic id.
    '''
    if num_topics is None:
        # Backward compatible default: the topic count defined by the script.
        num_topics = n_topics

    topic_of_doc = np.asarray(keys)
    vectors = np.asarray(two_dim_vectors)

    mean_topic_vectors = []
    for topic in range(num_topics):
        members = vectors[topic_of_doc == topic]
        if members.shape[0] == 0:
            # No rows to average; np.vstack([]) would raise here.
            mean_topic_vectors.append(np.full(vectors.shape[1:], np.nan))
        else:
            mean_topic_vectors.append(np.mean(members, axis=0))
    return mean_topic_vectors
def get_keys(topic_matrix):
    '''
    Return a flat list of integers: for every row of topic_matrix, the
    column index holding that row's largest value (the predicted topic).

    topic_matrix -- 2-D object providing argmax(axis=1), e.g. a NumPy
                    array, np.matrix or scipy sparse matrix
    '''
    best_column_per_row = topic_matrix.argmax(axis=1)
    # np.matrix and scipy sparse matrices return an (n, 1) matrix here;
    # flatten it so the result is a flat list rather than nested lists.
    return np.asarray(best_column_per_row).ravel().tolist()
# colormap = np.array([
# "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
# "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
# "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
# "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5" ])
# Number of LDA topics to plot.  get_top_n_words() and
# get_mean_topic_vectors() read this module-level name.
n_topics = 50

# One hex colour string per topic.  A Matplotlib Colormap is callable but
# not subscriptable, so sample it into an array that can be indexed both
# with a single topic id and with the whole list of per-document keys.
_viridis = plt.get_cmap('viridis', n_topics)
colormap = np.array([to_hex(_viridis(i)) for i in range(n_topics)])

lda = gensim.models.LdaModel.load("2021032317_Step2_clean_data_50/model5.gensim")
# NOTE(review): pickle.load executes arbitrary code from the file; only
# load a corpus file that was produced locally.
with open("2021032317_Step2_clean_data_50/corpus.pkl", 'rb') as f:
    corpus = pickle.load(f)

if lda.num_topics != n_topics:
    raise ValueError(
        "model has {} topics but n_topics is {}".format(lda.num_topics, n_topics)
    )


class _Id2WordLookup:
    '''Expose a gensim id->word mapping through inverse_transform(), the
    only vectorizer method get_top_n_words() calls.'''

    def __init__(self, id2word):
        self._id2word = id2word

    def inverse_transform(self, X):
        # One list of words per row: the words whose column is non-zero.
        return [
            [self._id2word[int(j)] for j in np.flatnonzero(row)]
            for row in np.asarray(X)
        ]


# assumes corpus is a gensim bag-of-words corpus, i.e. an iterable of
# [(token_id, count), ...] documents -- TODO confirm against the script
# that wrote corpus.pkl.
# Dense (n_documents, n_topics) matrix of per-document topic probabilities;
# this, not the model object itself, is what argmax and t-SNE operate on.
lda_topic_matrix = gensim.matutils.corpus2dense(
    (lda.get_document_topics(bow, minimum_probability=0.0) for bow in corpus),
    num_terms=n_topics,
).T
lda_keys = get_keys(lda_topic_matrix)

tsne_lda_model = TSNE(n_components=2, perplexity=50, learning_rate=100,
                      n_iter=2000, verbose=1, random_state=0, angle=0.75)
tsne_lda_vectors = tsne_lda_model.fit_transform(lda_topic_matrix)

# Sparse (n_documents, n_terms) count matrix built from the same corpus,
# so that rows can be selected and summed per topic.
document_term_matrix = gensim.matutils.corpus2csc(
    corpus, num_terms=lda.num_terms
).T.tocsr()
top_3_words_lda = get_top_n_words(
    3, lda_keys, document_term_matrix, _Id2WordLookup(lda.id2word)
)
lda_mean_topic_vectors = get_mean_topic_vectors(lda_keys, tsne_lda_vectors)

plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics), plot_width=700, plot_height=700)
plot.scatter(x=tsne_lda_vectors[:, 0], y=tsne_lda_vectors[:, 1], color=colormap[lda_keys])

for t in range(n_topics):
    centroid = lda_mean_topic_vectors[t]
    # A label needs finite coordinates; skip a topic whose centroid is undefined.
    if np.isnan(centroid).any():
        continue
    label = Label(x=centroid[0], y=centroid[1],
                  text=top_3_words_lda[t], text_color=colormap[t])
    plot.add_layout(label)

show(plot)