-
Notifications
You must be signed in to change notification settings - Fork 87
/
Copy pathfind-dynamic-topics.py
executable file
·186 lines (165 loc) · 7.6 KB
/
find-dynamic-topics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env python
"""
Tool to generate a dynamic topic model, by combining a set of time window topic models.
Sample usage:
python find-dynamic-topics.py out/month1_windowtopics_k05.pkl out/month2_windowtopics_k08.pkl out/month3_windowtopics_k08.pkl -k 4,10 -o out -m out/w2v-model.bin
"""
import os, sys, random, operator
import logging as log
from optparse import OptionParser
import numpy as np
import sklearn.preprocessing
import text.util
import unsupervised.nmf, unsupervised.rankings, unsupervised.coherence
# --------------------------------------------------------------
class TopicCollection:
    """
    Accumulates topics from several window topic models and assembles them
    into a single topic-term matrix.

    Every topic passed to add_topic_model() is stored as a sparse
    {term: weight} mapping. create_matrix() lays the stored topics out as the
    rows of a dense matrix whose columns are the union of all retained terms.
    """

    def __init__(self, top_terms=0, threshold=1e-6):
        """
        top_terms: if > 0, keep only this many highest-weighted terms of each
            topic; otherwise the threshold rule below is used.
        threshold: when top_terms is not positive, a term is kept if its share
            of the topic's total weight is at least this value.
        """
        # settings
        self.top_terms = top_terms
        self.threshold = threshold
        # state
        self.topic_ids = []     # label of every stored topic, in insertion order
        self.all_weights = []   # one {term: weight} dict per stored topic
        self.all_terms = set()  # union of the terms retained by any topic

    def add_topic_model(self, H, terms, window_topic_labels):
        """
        Add topics from a window topic model to the collection.

        H: topic-term factor; row i holds the term weights of topic i.
        terms: sequence mapping a column index of H to its term.
        window_topic_labels: sequence mapping a row index of H to the label
            recorded in self.topic_ids.
        """
        k = H.shape[0]
        for topic_index in range(k):
            row = np.asarray(H[topic_index, :]).ravel()
            # use top terms only (sparse topic representation)?
            if self.top_terms > 0:
                kept_indices = np.argsort(row)[::-1][0:self.top_terms]
            # use dense window topic vectors
            else:
                total_weight = row.sum()
                if total_weight != 0:
                    # keep terms by their relative share of the topic weight
                    kept_indices = np.flatnonzero(row / total_weight >= self.threshold)
                else:
                    # an all-zero topic has no meaningful shares: keep nothing
                    # rather than dividing by zero
                    kept_indices = []
            # note: the raw (unnormalized) weight is what gets stored
            topic_weights = {}
            for term_index in kept_indices:
                topic_weights[terms[term_index]] = row[term_index]
            self.all_terms.update(topic_weights)
            self.all_weights.append(topic_weights)
            self.topic_ids.append(window_topic_labels[topic_index])

    def create_matrix(self):
        """
        Create the topic-term matrix from all window topics that have been
        added so far.

        Returns a tuple (M, all_terms): M has one row per stored topic, scaled
        to unit L2 length, and all_terms[j] is the term of column j.
        """
        # Map terms to column indices. Iteration order of a set of strings can
        # differ between interpreter runs (hash randomization), so sort the
        # terms to get a reproducible column layout.
        all_terms = sorted(self.all_terms)
        term_col_map = {term: col for col, term in enumerate(all_terms)}
        # populate the matrix in row-order
        M = np.zeros((len(self.all_weights), len(all_terms)))
        for row, topic_weights in enumerate(self.all_weights):
            for term, weight in topic_weights.items():
                M[row, term_col_map[term]] = weight
        # normalize the matrix rows to L2 unit length; an empty matrix has
        # nothing to normalize and is returned as-is
        if M.size > 0:
            M = sklearn.preprocessing.normalize(M, norm='l2')
        return (M, all_terms)
# --------------------------------------------------------------
def _parse_k_range(krange):
    """
    Parse the value of the -k option, either "k" or "kmin,kmax".

    Returns (kmin, kmax, is_range), where is_range is True when two values
    were given. Raises ValueError with a user-facing message if the value is
    not one or two positive integers with kmin <= kmax.
    """
    kparts = [part.strip() for part in krange.split(",")]
    if len(kparts) > 2:
        raise ValueError("Invalid number of topics '%s': expected a single value or a range 'kmin,kmax'" % krange)
    try:
        bounds = [int(part) for part in kparts]
    except ValueError:
        raise ValueError("Invalid number of topics '%s': values must be integers" % krange)
    kmin, kmax = bounds[0], bounds[-1]
    if kmin < 1:
        raise ValueError("Invalid number of topics '%s': values must be at least 1" % krange)
    if kmax < kmin:
        raise ValueError("Invalid number of topics '%s': maximum is smaller than minimum" % krange)
    return kmin, kmax, len(bounds) == 2


def main():
    """
    Command-line entry point.

    Loads every window topic model named on the command line, stacks their
    topics into one topic-term matrix, applies NMF to that matrix for each
    requested number of topics k, and writes one result file per k to the
    output directory. When a range of k and a Word2Vec model are both given,
    each model is also scored for coherence and the best-scoring values of k
    are reported.
    """
    parser = OptionParser(usage="usage: %prog [options] window_topics1 window_topics2...")
    parser.add_option("--seed", action="store", type="int", dest="seed", help="initial random seed", default=1000)
    parser.add_option("-k", action="store", type="string", dest="krange", help="number of topics", default=None)
    parser.add_option("--maxiters", action="store", type="int", dest="maxiter", help="maximum number of iterations", default=200)
    parser.add_option("-o","--outdir", action="store", type="string", dest="dir_out", help="output directory (default is current directory)", default=None)
    parser.add_option("-m", "--model", action="store", type="string", dest="model_path", help="path to Word2Vec model, if performing automatic selection of number of topics", default=None)
    parser.add_option("-t", "--top", action="store", type="int", dest="top", help="number of top terms to use, if performing automatic selection of number of topics", default=20)
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose", help="display topic descriptors")
    (options, args) = parser.parse_args()
    if len(args) < 2:
        parser.error("Must specify at least two window topic files")
    log.basicConfig(level=log.INFO, format='%(message)s')

    # Parse user-specified range for number of topics K
    if options.krange is None:
        parser.error("Must specify number of topics, or a range for the number of topics")
    try:
        kmin, kmax, k_is_range = _parse_k_range(options.krange)
    except ValueError as e:
        # report malformed -k values as a usage error instead of a traceback
        parser.error(str(e))

    # Set random state; a negative seed requests a randomly chosen one
    random_seed = options.seed
    if random_seed < 0:
        random_seed = random.randint(1,100000)
    np.random.seed(random_seed)
    random.seed(random_seed)
    log.info("Using random seed %s" % random_seed )

    # Output directory for results
    if options.dir_out is None:
        dir_out = os.getcwd()
    else:
        dir_out = options.dir_out
    # create it up front so that saving results cannot fail on a missing path
    if not os.path.isdir(dir_out):
        os.makedirs(dir_out)

    # Will we use automatic model selection? Only when a range of k was given
    # and a Word2Vec model is available to score topic coherence.
    validation_measure = None
    if k_is_range and options.model_path is not None:
        log.info( "Loading Word2Vec model from %s ..." % options.model_path )
        import gensim
        model = gensim.models.Word2Vec.load(options.model_path)
        validation_measure = unsupervised.coherence.WithinTopicMeasure( unsupervised.coherence.ModelSimilarity(model) )

    # Process each specified window topic model
    log.info("- Processing individual time window topic models ...")
    collection = TopicCollection()
    for window_model_path in args:
        # Load the cached time window; only the terms, term rankings, factor H
        # and topic labels are needed here
        (_, terms, term_rankings, _, _, H, window_topic_labels) = unsupervised.nmf.load_nmf_results( window_model_path )
        log.info("Loaded %d time window topics from %s" % (len(term_rankings),window_model_path) )
        collection.add_topic_model( H, terms, window_topic_labels )

    # Create the topic-term matrix
    M, all_terms = collection.create_matrix()
    log.info( "Created topic-term matrix of size %dx%d" % M.shape )
    log.debug( "Matrix stats: range=[%.3f,%.3f] mean=%.3f" % ( np.min(M), np.max(M), np.mean(M) ) )

    # NMF implementation
    impl = unsupervised.nmf.SklNMF( max_iters = options.maxiter, init_strategy = "nndsvd", random_seed = random_seed )

    # Generate window topic model for the specified range of numbers of topics
    coherence_scores = {}
    for k in range(kmin, kmax+1):
        log.info( "Applying dynamic topic modeling to matrix for k=%d topics ..." % k )
        impl.apply( M, k )
        log.info( "Generated %dx%d factor W and %dx%d factor H" % ( impl.W.shape[0], impl.W.shape[1], impl.H.shape[0], impl.H.shape[1] ) )
        # Create a disjoint partition of documents
        partition = impl.generate_partition()
        # Create topic labels D01, D02, ...
        topic_labels = ["D%02d" % (i+1) for i in range(k)]
        # Create term rankings for each topic
        term_rankings = []
        for topic_index in range(k):
            ranked_term_indices = impl.rank_terms( topic_index )
            term_rankings.append( [all_terms[i] for i in ranked_term_indices] )
        # Print out the top terms?
        if options.verbose:
            log.info( unsupervised.rankings.format_term_rankings( term_rankings, top = options.top ) )
        # Evaluate topic coherence of this topic model?
        if validation_measure is not None:
            truncated_term_rankings = unsupervised.rankings.truncate_term_rankings( term_rankings, options.top )
            coherence_scores[k] = validation_measure.evaluate_rankings( truncated_term_rankings )
            log.info("Model coherence (k=%d) = %.4f" % (k,coherence_scores[k]) )
        # Write results
        results_out_path = os.path.join( dir_out, "dynamictopics_k%02d.pkl" % (k) )
        unsupervised.nmf.save_nmf_results( results_out_path, collection.topic_ids, all_terms, term_rankings, partition, impl.W, impl.H, topic_labels )

    # Need to select value of k? Report up to three values, best coherence first
    if len(coherence_scores) > 0:
        sx = sorted(coherence_scores.items(), key=operator.itemgetter(1))
        sx.reverse()
        top_k = [ p[0] for p in sx[0:3] ]
        log.info("- Top recommendations for number of dynamic topics: %s" % ",".join(map(str, top_k)) )
# --------------------------------------------------------------
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()