forked from JasonKessler/scattertext
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdemo_foreign_characteristic_frequencies.py
68 lines (58 loc) · 1.88 KB
/
demo_foreign_characteristic_frequencies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import scattertext as st
import pandas as pd
import spacy
from zipfile import ZipFile
from urllib.request import urlopen
import re
import io
# Blank Polish spaCy pipeline with a sentencizer component added; it is applied
# to every sentence below to produce the parsed documents.
nlp = spacy.blank('pl')
nlp.add_pipe('sentencizer')

# Download the KLEJ PolEmo 2.0 archive into memory. Reading inside a `with`
# block closes the HTTP response as soon as the bytes have been fetched instead
# of leaving the connection for the garbage collector.
with urlopen(
        'https://klejbenchmark.com/static/data/klej_polemo2.0-in.zip'
) as response:
    archive_bytes = response.read()

# Both the archive and the member file handle are closed on exit.
with ZipFile(io.BytesIO(archive_bytes)) as zf, zf.open('train.tsv') as train_tsv:
    train_df = pd.read_csv(train_tsv, sep='\t')[
        # Keep only rows whose target is one of the two labels compared later.
        lambda df: df.target.isin(['__label__meta_plus_m', '__label__meta_minus_m'])
    ].assign(
        # Run the pipeline over every sentence; the 'Parse' column is what the
        # corpus construction step reads as its parsed-document column.
        Parse=lambda df: df.sentence.apply(nlp)
    )
# Polish word frequencies from https://github.com/hermitdave/FrequencyWords
# (the 2016 pl_50k list), read as two space-separated columns: the word and
# its frequency.
# The result is a pandas Series of frequencies indexed by word; that Series is
# what gets passed to set_background_corpus further down.
polish_word_frequencies = pd.read_csv(
    'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/pl/pl_50k.txt',
    sep=' ',
    names=['Word', 'Frequency'],
    # Treat every token as literal text. With pandas' default NA handling,
    # words such as "nan" or "null" would be parsed as missing values and
    # end up as NaN index labels instead of real vocabulary entries.
    keep_default_na=False,
).set_index('Word')['Frequency']
# Download the Polish stopword list (one entry per line). The `with` block
# closes the HTTP response once the body has been read.
with urlopen(
        'https://raw.githubusercontent.com/bieli/stopwords/master/polish.stopwords.txt'
) as response:
    stopword_text = response.read().decode('utf-8')

# Store the stripped form of each non-blank line. Keeping the raw line would
# leave stray whitespace (e.g. a trailing '\r' from CRLF line endings) attached
# to the entry, so it would never equal a token during filtering.
polish_stopwords = {
    line.strip()
    for line in stopword_text.splitlines()
    if line.strip()
}
# Matches strings made up entirely of non-word characters.
not_a_word = re.compile(r'^\W+$')


def _is_unwanted_term(term):
    """Return True for stopwords and for terms with no word characters."""
    if term in polish_stopwords:
        return True
    return not_a_word.match(term) is not None


# Build the corpus step by step; each call consumes the previous result.
corpus = st.CorpusFromParsedDocuments(
    train_df,
    category_col='target',
    parsed_col='Parse',
).build()
corpus = corpus.get_unigram_corpus()
# Hand the predicate above to filter_out.
corpus = corpus.filter_out(_is_unwanted_term)
# Attach the Polish frequency list as the background corpus.
corpus = corpus.set_background_corpus(polish_word_frequencies)
corpus = corpus.remove_infrequent_words(minimum_term_count=20)
# Render the explorer as an HTML string. '__label__meta_plus_m' is the focus
# category (shown as 'Plus-M'); everything else is labelled 'Minus-M'.
html = st.produce_scattertext_explorer(
    corpus,
    category='__label__meta_plus_m',
    category_name='Plus-M',
    not_category_name='Minus-M',
    minimum_term_frequency=1,
    width_in_pixels=1000,
    transform=st.Scalers.dense_rank
)

fn = 'demo_foreign_characteristic_frequencies.html'
# Write with an explicit UTF-8 encoding: the page contains Polish text, and the
# platform default encoding is not guaranteed to be able to represent it.
# NOTE(review): assumes the generated HTML is meant to be served as UTF-8 - confirm.
# The context manager guarantees the file is flushed and closed.
with open(fn, 'w', encoding='utf-8') as out_file:
    out_file.write(html)
print(f'Open ./{fn}')