-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspacy_formatter.py
48 lines (36 loc) · 1.5 KB
/
spacy_formatter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import streamlit as st
import spacy
# Load the 'ru_core_news_md' spaCy pipeline once at import time; text2tokens()
# reuses this shared instance for every call.
nlp = spacy.load('ru_core_news_md')
# Entity labels whose tokens are highlighted by format_string_as_spacy().
# NOTE(review): confirm the loaded model actually emits every label listed
# here (DATE / MONEY / GPE in particular) — verify against the model card.
ENTITIES = ["PER", "ORG", "DATE", "MONEY", "GPE"]
def format_red_text(text, url=""):
    """Wrap *text* in a red-background HTML ``<span>``.

    Args:
        text: Content placed inside the span. It is inserted verbatim, so it
            must already be safe to embed in HTML.
        url: Unused; kept so callers that pass it keep working.

    Returns:
        The HTML snippet as a single-line string.
    """
    # Adjacent literals are concatenated by the parser. They must NOT sit
    # inside one triple-quoted string, or the quote/newline/"f" characters
    # between the pieces become part of the CSS and break the style attribute.
    return (
        '<span style="background: rgb(204,34,34); padding: 0.45em 0.6em; '
        'margin: 0px 0.25em; line-height: 1; border-radius: 0.35em;">'
        f'{text}</span>'
    )
def entity_info(text, eob):
    """Wrap *text* in a bordered highlight ``<span>``, optionally labelled.

    Args:
        text: Content placed inside the span. It is inserted verbatim, so it
            must already be safe to embed in HTML.
        eob: Label rendered in small bold type after the text. Any falsy
            value (``False``, ``""``, ``None``) suppresses the label.

    Returns:
        The HTML snippet as a single-line string.
    """
    label_html = f'<b style="font-size:8px"> {eob}</b>' if eob else ""
    # Adjacent literals are concatenated by the parser. They must NOT sit
    # inside one triple-quoted string, or the quote/newline/"f" characters
    # between the pieces become part of the CSS and break the style attribute.
    return (
        '<span style="background: rgb(246, 241, 234); padding: 0.2em 0.2em; '
        'margin: 0px 0.25em; line-height: 1; border-radius: 0.35em; '
        'border: 2px solid rgb(186, 0, 0);">'
        f'{text}{label_html}</span>'
    )
def text2tokens(text, ents):
    """Render *text* as an HTML string with selected tokens highlighted.

    The text is run through the module-level ``nlp`` pipeline. A token is
    wrapped with ``entity_info`` when its entity label is in *ents*, when its
    fine-grained tag is ``"PRP"``, or when its coarse part of speech is
    ``"NUM"``. Punctuation is appended without a leading space; every other
    token is preceded by a single space.

    Args:
        text: The text to analyse.
        ents: Container of entity labels (``token.ent_type_`` values) to
            highlight.

    Returns:
        The assembled HTML fragment; empty string when there are no tokens.
    """
    # Local import so this function does not depend on a file-level import.
    from html import escape

    space = " "
    pieces = []
    for token in nlp(text):
        # Token text is raw document content and the result is rendered as
        # HTML, so neutralise &, < and > before embedding it.
        safe_text = escape(token.text, quote=False)
        # `pos_` is the string label; `pos` is an integer ID and never equals
        # a string, which silently disabled the NUM and PUNCT branches.
        # NOTE(review): "PRP" looks like a Penn Treebank tag — confirm the
        # loaded model ever produces it.
        if (
            token.ent_type_ in ents
            or token.tag_ == "PRP"
            or token.pos_ == "NUM"
        ):
            pieces.append(space + entity_info(safe_text, False))
        elif token.pos_ == "PUNCT":
            pieces.append(safe_text)
        else:
            pieces.append(space + safe_text)
    return "".join(pieces)
def format_string_as_spacy(text):
    """Write *text* to the Streamlit page with default entity highlighting.

    The HTML is produced by ``text2tokens`` using the module-level
    ``ENTITIES`` labels and emitted through ``st.markdown`` with raw HTML
    enabled. Returns ``None``.
    """
    st.markdown(text2tokens(text, ents=ENTITIES), unsafe_allow_html=True)
# @st.cache
def format_for_streamlit(text, text_file=None):
    """Render highlighted text in Streamlit from a string or from a file.

    Args:
        text: The text to render. When ``None``, *text_file* is read instead.
        text_file: Path of a text file to read when *text* is ``None``.
            Ignored when *text* is given.

    Returns:
        ``None``. Does nothing when both *text* and *text_file* are ``None``.
    """
    if text is None:
        if text_file is None:
            return
        # NOTE(review): the file is opened with the platform default
        # encoding; consider passing an explicit encoding if the inputs are
        # known to be UTF-8.
        with open(text_file, "r") as reader:
            # Keep the raw string: format_string_as_spacy() runs the pipeline
            # itself, so parsing here would hand it a Doc instead of text and
            # process the content twice.
            text = reader.read()
    format_string_as_spacy(text)