forked from JasonKessler/scattertext
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdemo_four_square.py
62 lines (56 loc) · 2.7 KB
/
demo_four_square.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import time
import pandas as pd
import scattertext as st
t0 = time.time()
reviews_df = pd.read_csv('https://github.com/JasonKessler/ICLR18ReviewVis/raw/master/iclr2018_reviews.csv.bz2')
reviews_df['parse'] = reviews_df['review'].apply(st.whitespace_nlp_with_sentences)
full_corpus = (st.CorpusFromParsedDocuments(reviews_df,
category_col='category',
parsed_col='parse',
#feats_from_spacy_doc=st.PhraseMachinePhrases()
).build())
term_ranker = st.OncePerDocFrequencyRanker
corpus = (full_corpus
.keep_only_these_categories(['Accept, Positive', 'Accept, Negative',
'Reject, Positive', 'Reject, Negative'],
False)
.get_unigram_corpus()
.select(st.ClassPercentageCompactor(term_count=5)))
print('finding priors', time.time() - t0, 's')
priors = (st.PriorFactory(full_corpus, starting_count=0.01)
.use_all_categories()
.get_priors())
print('building four square', time.time() - t0, 's')
four_square = st.FourSquare(
corpus,
category_a_list=['Accept, Positive'],
not_category_a_list=['Reject, Negative'],
category_b_list=['Accept, Negative'],
not_category_b_list=['Reject, Positive'],
term_ranker=term_ranker,
scorer=st.LogOddsRatioInformativeDirichletPrior(priors, 500, 'word'),
labels={'a': 'Positive Reviews of Accepted Papers',
'b': 'Negative Reviews of Accepted Papers',
'not_a_and_not_b': 'Rejections',
'a_and_b': 'Acceptances',
'a_and_not_b': 'Positive Reviews',
'b_and_not_a': 'Negative Reviews',
'not_a': 'Negative Reviews of Rejected Papers',
'not_b': 'Positive Reviews of Rejected Papers',
}
)
print('making html', time.time() - t0, 's')
html = st.produce_four_square_explorer(four_square=four_square,
x_label='Pos-Neg',
y_label='Accept-Reject',
num_terms_semiotic_square=5,
minimum_term_frequency=0,
pmi_threshold_coefficient=0,
term_ranker=term_ranker,
metadata=(corpus._df['category'] + ': '
+ corpus._df.rating + ', '
+ corpus._df['title']))
fn = 'demo_four_square.html'
open(fn, 'wb').write(html.encode('utf-8'))
print('Open ' + fn + ' in Chrome or Firefox.')
print('done', time.time() - t0, 's')