0502_READING_NEW_PAPERS_MALLET.py
import pandas as pd
import numpy as np
df_1 = pd.read_csv('data/articles1.csv')
df_2 = pd.read_csv('data/articles2.csv')
df_2.head()
# #### let's select the first 50 new papers
new_titles = df_2['title'][:50].array
new_papers = df_2['content'][:50].array
new_titles[34]
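# optional sanity check (illustrative, assumes the CSVs were read as above): confirm none of
# the selected articles are missing their content, since NaNs would break tokenisation later
assert df_2['content'][:50].isna().sum() == 0, 'some selected papers have missing content'
print(len(new_papers), 'new papers selected')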
# +
import pickle
import nltk
import gensim
dictionary = gensim.corpora.Dictionary.load('models/dictionary.gensim')
with open("lists/bow_corpus.txt", "rb") as fp: # Unpickling
bow_corpus = pickle.load(fp)
with open("lists/norm_corpus_bigrams.txt", "rb") as fp:
norm_corpus_bigrams = pickle.load(fp)
with open("lists/norm_papers.txt", "rb") as fp:
norm_papers = pickle.load(fp)
with open("lists/pre_papers.txt", "rb") as fp:
pre_papers = pickle.load(fp)
with open("lists/pre_titles.txt", "rb") as fp:
pre_titles = pickle.load(fp)
# -
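# +
# optional: a quick look at the loaded artefacts, to confirm they line up with training
print('dictionary size:', len(dictionary))
print('training documents in BoW corpus:', len(bow_corpus))
# -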
# ### PREPROCESS NEW PAPERS
# first preprocess these new papers and extract features using the same sequence of steps we followed when building the topic models.
#
# +
# %%time
import nltk
stop_words = nltk.corpus.stopwords.words('english')
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()
def normalise_corpus(papers, titles):
    norm_papers = []
    pre_papers = []
    pre_titles = []
    for i in range(len(papers)):
        paper = papers[i]
        title = titles[i]
        paper = paper.lower()
        paper_tokens = [token.strip() for token in wtk.tokenize(paper)]
        paper_tokens = [wnl.lemmatize(token) for token in paper_tokens if not token.isnumeric()]
        paper_tokens = [token for token in paper_tokens if len(token) > 1]
        paper_tokens = [token for token in paper_tokens if token not in stop_words]
        paper_tokens = list(filter(None, paper_tokens))
        if paper_tokens:
            norm_papers.append(paper_tokens)
            pre_papers.append(paper)
            pre_titles.append(title)
    return norm_papers, pre_papers, pre_titles
# we keep pre_papers and pre_titles because the normalising function drops any paper that ends up
# empty after cleaning, so the papers and titles we run LDA on stay aligned (see the toy example below)
# -
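# +
# a toy run of normalise_corpus (illustrative only): the second "paper" reduces to nothing
# after cleaning, so it is dropped together with its title, keeping papers and titles aligned
toy_norm, toy_pre, toy_titles = normalise_corpus(['The 2 cats are running!', 'a 1 2 3'],
                                                 ['cats', 'numbers'])
print(toy_norm)    # [['cat', 'running']]
print(toy_titles)  # ['cats']
# -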
# let's wrap text wrangling and feature engineering into a pipeline that matches the steps we followed when training the topic model.
#
bigram_model = gensim.models.phrases.Phraser.load('models/bigram_model.gensim')
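# +
# optional peek at the phraser (illustrative): a token pair is merged into a bigram only if
# it was learned from the training corpus, e.g. ['new', 'york'] may come back as ['new_york']
print(bigram_model[['new', 'york', 'city']])
# -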
# +
def text_preprocessing_pipeline(documents, normaliser_fn, bigram_model, titles):
    norm_docs, pre_papers, pre_titles = normaliser_fn(documents, titles)
    norm_docs_bigrams = bigram_model[norm_docs]
    return norm_docs_bigrams, pre_papers, pre_titles

def bow_features_pipeline(tokenized_docs, dictionary):
    paper_bow_features = [dictionary.doc2bow(text)
                          for text in tokenized_docs]
    return paper_bow_features

norm_new_papers, new_pre_papers, new_pre_titles = text_preprocessing_pipeline(documents=new_papers,
                                                                              normaliser_fn=normalise_corpus,
                                                                              bigram_model=bigram_model,
                                                                              titles=new_titles)
norm_bow_features = bow_features_pipeline(tokenized_docs=norm_new_papers,
                                          dictionary=dictionary)
# -
print(norm_new_papers[0][:30])
print(norm_bow_features[0][:30])
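# +
# each BoW feature is a (token_id, count) pair; mapping the ids back through the dictionary
# makes the first document's features readable
print([(dictionary[idx], count) for idx, count in norm_bow_features[0][:10]])
# -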
# ### LOAD MALLET MODEL AND MAKE PREDICTIONS
# +
TOPICS = 25
load_lda_model = gensim.models.wrappers.LdaMallet.load('models/mallet/model_'+str(TOPICS)+'.gensim')
# convert the LdaMallet wrapper (gensim < 4.0 API) to a regular LdaModel; in this setup it was the only way to get topic predictions out of the loaded Mallet model
load_lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(load_lda_model)
topics = [[(term, round(wt, 3))
           for term, wt in load_lda_model.show_topic(n, topn=20)]
          for n in range(0, load_lda_model.num_topics)]
pd.set_option('display.max_colwidth', None)  # -1 is deprecated in recent pandas; None means no limit
topics_df = pd.DataFrame([', '.join([term for term, wt in topic])
                          for topic in topics],
                         columns=['Terms per Topic'],
                         index=['Topic' + str(t) for t in range(1, load_lda_model.num_topics + 1)])
# -
topics_df
def get_topic_predictions(topic_model, corpus, topn=3):
    topic_predictions = topic_model[corpus]
    best_topics = [[(topic, round(wt, 3))
                    for topic, wt in sorted(topic_predictions[i],
                                            key=lambda row: -row[1])[:topn]]
                   for i in range(len(topic_predictions))]
    return best_topics
topic_preds = get_topic_predictions(topic_model=load_lda_model,
corpus=norm_bow_features, topn=2)
# +
results_df = pd.DataFrame()
results_df['Papers'] = range(1, len(new_pre_papers)+1)
results_df['Dominant Topics'] = [[topic_num+1 for topic_num, wt in item] for item in topic_preds]
res = results_df.set_index(['Papers'])['Dominant Topics'].apply(pd.Series).stack().reset_index(level=1, drop=True)
results_df = pd.DataFrame({'Dominant Topics': res.values}, index=res.index)
results_df['Contribution %'] = [topic_wt for topic_list in
                                [[round(wt*100, 2)
                                  for topic_num, wt in item]
                                 for item in topic_preds]
                                for topic_wt in topic_list]
results_df['Topic Desc'] = [topics_df.iloc[int(t)-1]['Terms per Topic'] for t in results_df['Dominant Topics'].values]
results_df['Title'] = [new_pre_titles[i-1][:200] for i in results_df.index.values]
results_df['Paper Desc'] = [new_pre_papers[i-1][:200] for i in results_df.index.values]
pd.set_option('display.max_colwidth', 300)
# -
results_df
results_df.sort_values(by='Contribution %', ascending=False)
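# optionally persist the predictions for later inspection (the filename here is just an example)
results_df.to_csv('new_paper_topic_predictions.csv', index_label='Papers')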