first working version of pico mesh tags
ijmarshall committed Jan 7, 2020
1 parent 2b3e637 commit ac9436f
Showing 10 changed files with 335 additions and 5 deletions.
4 changes: 4 additions & 0 deletions deploy-beta.sh
@@ -0,0 +1,4 @@
#!/bin/bash

echo "building image"
docker build -t robotreviewer-beta .
3 changes: 3 additions & 0 deletions robotreviewer/data/minimap/cui_to_mh.pck
Git LFS file not shown
5 changes: 5 additions & 0 deletions robotreviewer/data/minimap/ignorelist.txt
@@ -0,0 +1,5 @@
we
a
suffer
suffering
power
98 changes: 98 additions & 0 deletions robotreviewer/data/minimap/prepositions_conjunctions.txt
@@ -0,0 +1,98 @@
aboard
about
above
across
after
against
along
alongside
amid
among
amongst
apropos
around
as
astride
at
bar
before
behind
below
beneath
beside
besides
between
beyond
but
by
circa
come
despite
down
during
except
for
from
in
inside
into
less
like
minus
near
nearer
nearest
notwithstanding
of
off
on
onto
opposite
out
outside
over
past
per
plus
post
pre
pro
re
sans
save
short
since
than
through
throughout
till
to
toward
towards
under
underneath
unlike
until
unto
up
upon
upside
versus
vs
v
via
with
within
without
worth
and
but
because
although
or
provided that
as long as
in order that
in order to
nor
but also
3 changes: 3 additions & 0 deletions robotreviewer/data/minimap/str_to_cui.pck
Git LFS file not shown
3 changes: 3 additions & 0 deletions robotreviewer/data/minimap/subtrees.pck
Git LFS file not shown
12 changes: 9 additions & 3 deletions robotreviewer/robots/pico_span_robot.py
@@ -29,6 +29,8 @@
from robotreviewer.textprocessing import tokenizer
from bert_serving.client import BertClient

from robotreviewer.textprocessing import minimap
from robotreviewer.textprocessing import schwartz_hearst
log = logging.getLogger(__name__)

from celery.contrib import rdb
@@ -74,7 +76,7 @@ def __init__(self):
        self.bert = BertClient()


    def api_annotate(self, articles, get_berts=True):
    def api_annotate(self, articles, get_berts=True, get_meshes=True):

        if not (all(('parsed_ab' in article for article in articles)) and all(('parsed_ti' in article for article in articles))):
            raise Exception('PICO span model requires a title and abstract to be able to complete annotation')
@@ -83,7 +85,7 @@ def api_annotate(self, articles, get_berts=True):
            if article.get('skip_annotation'):
                annotations.append([])
            else:
                annotations.append(self.annotate({"title": article['parsed_ti'], "abstract": article['parsed_ab']}, get_berts=get_berts))
                annotations.append(self.annotate({"title": article['parsed_ti'], "abstract": article['parsed_ab']}, get_berts=get_berts, get_meshes=get_meshes))
        return annotations


@@ -108,7 +110,7 @@ def pdf_annotate(self, data):
        return data


    def annotate(self, article, get_berts=True):
    def annotate(self, article, get_berts=True, get_meshes=True):

        """
        Annotate abstract of clinical trial report
@@ -160,6 +162,10 @@ def annotate(self, article, get_berts=True):
            else:
                out[bert_out_key] = [r.tolist() for r in self.bert.encode(bert_q)]

        if get_meshes:
            abbrev_dict = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=article['abstract'].text)
            for k in ['population', 'interventions', 'outcomes']:
                out[f"{k}_mesh"] = minimap.get_unique_terms(out[k], abbrevs=abbrev_dict)

        return out

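A minimal usage sketch of how the two new helpers compose; the abstract text and population span below are invented, but the call pattern mirrors the annotate() change above:

from robotreviewer.textprocessing import minimap, schwartz_hearst

# made-up abstract; the span is hand-written where the PICO tagger output would go
abstract = ("Patients with atrial fibrillation (AF) were randomised to "
            "warfarin or aspirin; the primary outcome was stroke.")

# abbreviation -> expansion pairs, e.g. {'AF': 'atrial fibrillation'}
abbrev_dict = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=abstract)

population_spans = ["patients with AF"]

# each result is a MeSH record from cui_to_mh, deduplicated on 'cui'
print(minimap.get_unique_terms(population_spans, abbrevs=abbrev_dict))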
204 changes: 204 additions & 0 deletions robotreviewer/textprocessing/minimap.py
@@ -0,0 +1,204 @@
#
# minimap
#

import spacy
from spacy.tokens import Doc
from itertools import chain
import os
import robotreviewer
import pickle

nlp = spacy.load("en")

# ignore list
with open(os.path.join(robotreviewer.DATA_ROOT, 'minimap', 'ignorelist.txt'), 'r') as f:
    ignores = set((l.strip() for l in f))


with open(os.path.join(robotreviewer.DATA_ROOT, 'minimap', 'str_to_cui.pck'), 'rb') as f:
    str_to_cui = pickle.load(f)


with open(os.path.join(robotreviewer.DATA_ROOT, 'minimap', 'cui_to_mh.pck'), 'rb') as f:
    cui_to_mh = pickle.load(f)

# some extra filtering rules to improve precision

drop_terms = set()

for k, v in str_to_cui.items():
    # strings which are too ambiguous (too many CUIs... 15 from experimentation)
    if len(set(v))>15:
        drop_terms.add(k)


for k, v in str_to_cui.items():
    # strings which are too short to be informative (2 chars or less tends to generate nonsense CUIs)
    if len(k)<=2:
        drop_terms.add(k)

for t in drop_terms:
    str_to_cui.pop(t)


# regular expressions and text processing functions

import re

with open(os.path.join(robotreviewer.DATA_ROOT, 'minimap','prepositions_conjunctions.txt'), 'r') as f:
    prep_conj = [l.strip() for l in f]

prep_conj_re = re.compile(r'\b({})\b'.format('|'.join(prep_conj)))
nos_ignore = re.compile(r'\bNOS\b') # note do after lowercase
pos_ignore = re.compile(r"(?<=\w)(\'s?)\b")
left_paren = re.compile(r"^\[(X|V|D|M|EDTA|SO|Q)\]")
paren = re.compile(r"[\(\[]\w+[\)\]]")
strip_space = re.compile(r"\s+")

def remove_nos(text):
    return nos_ignore.sub(' ', text)

def remove_pos(text):
    return pos_ignore.sub('', text)

def syn_uninv(text):
    try:
        inversion_point = text.index(', ')
    except ValueError:
        # not found
        return text

    if inversion_point+2 == len(text):
        # i.e. if the ', ' is at the end of the string
        return text

    if prep_conj_re.search(text[inversion_point+2:]):
        return text
    else:
        return text[inversion_point+2:] + " " + text[:inversion_point]

def ne_parentheticals(text_str):
    text_str = left_paren.sub('', text_str)
    text_str = paren.sub('', text_str)
    return text_str

def get_lemma(t):
    # note: relies on a module-level `exceptions` dict of lemma overrides
    # (not defined in this file)
    if t.text in exceptions:
        return exceptions[t.text]
    else:
        return t.lemma_

# pipelines

def minimap(text_str, chunks=False, abbrevs=None):
    return matcher(pipeline(text_str, umls_mode=False, abbrevs=abbrevs), chunks=chunks)


def pipeline(text_str, umls_mode=True, abbrevs=None):

    # sub out abbreviations if abbreviation dict given
    if abbrevs:
        for abbrev, expansion in abbrevs.items():
            try:
                text_str = re.sub(r"\b" + re.escape(abbrev) + r"\b", expansion, text_str)

            except:
                print(f"Regex error for one abstract (text string '{text_str}')")
                print(f"and abbreviation dictionary '{abbrevs}'")
                # skip abbreviations which generate invalid regex patterns
                # (these do not always raise a named exception)
                continue

    # 1. removal of parentheticals
    # if umls_mode:
    text_str = ne_parentheticals(text_str)

    # hyphens to spaces
    text_str = text_str.replace('-', ' ')
    # 3. conversion to lowercase
    # text_str = text_str.lower()
    # 2. syntactic uninversion
    if umls_mode:
        text_str = syn_uninv(text_str)
    # 4. stripping of possessives
    text_str = remove_pos(text_str)
    # strip NOS's
    if umls_mode:
        text_str = remove_nos(text_str)
    # last... remove any multiple spaces, or starting/ending with space
    text_str = strip_space.sub(' ', text_str)
    text_str = text_str.strip()
    return text_str





def matcher(text, chunks=False):
    doc = nlp(text.lower())

    if chunks:
        return list(chain.from_iterable(matcher(np.text, chunks=False) for np in doc.noun_chunks))
    tokens = [t.text.lower() for t in doc]
    lemmas = [t.lemma_ for t in doc if t.text.lower()]
    lemmas = [l for l in lemmas if l != '-PRON-']


    matches = []
    max_len = len(doc)
    window = max_len

    # greedy longest-match first: slide a window over the token sequence,
    # shrinking it by one token each pass, and look up the joined lemmas
    while window:

        for i in range(max_len - window + 1):
            window_text = ' '.join(tokens[i:i+window])
            window_lemma = ' '.join(lemmas[i:i+window])

            if window_lemma and window_lemma in str_to_cui and window_lemma not in ignores and window_text \
                not in nlp.Defaults.stop_words:

                for entry in str_to_cui[window_lemma]:
                    mh = cui_to_mh[entry].copy()
                    mh['start_idx'] = i
                    mh['end_idx'] = i+window
                    mh['source_text'] = doc[mh['start_idx']:mh['end_idx']].text
                    matches.append(mh)

        window -= 1

    # keep the leftmost, longest, non-overlapping matches
    matches.sort(key=lambda x: (x['start_idx'], -x['end_idx']))

    filtered_terms = []

    right_border = 0
    for match in matches:
        if match['start_idx'] >= right_border:
            filtered_terms.append(match)
            right_border = match['end_idx']

    return filtered_terms


def get_unique_terms(l, abbrevs=None):
    # map each string to candidate MeSH terms, then deduplicate on CUI
    terms = [minimap(s, abbrevs=abbrevs) for s in l]
    flat_terms = [item for sublist in terms for item in sublist]
    encountered_terms = set()
    unique_terms = []
    for term in flat_terms:
        if term['cui'] not in encountered_terms:
            term.pop('start_idx')
            term.pop('end_idx')
            term.pop('source_text')
            unique_terms.append(term)
            encountered_terms.add(term['cui'])
    return unique_terms

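A quick sketch of what the normalisation steps in pipeline() do to a made-up, UMLS-style inverted string:

from robotreviewer.textprocessing import minimap

# invented inverted term with a bracketed code, a parenthetical, and a NOS marker
print(minimap.pipeline("Infarction, myocardial [D] (acute) NOS"))
# -> 'myocardial Infarction' (inversion undone, parentheticals and NOS stripped)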
4 changes: 2 additions & 2 deletions robotreviewer_env.yml
@@ -26,8 +26,8 @@ dependencies:
- celery==4.2.1
- flask-wtf==0.14.2
- fuzzywuzzy==0.17.0
# - keras==2.1.5
- keras==2.3.0
- keras==2.1.5
# - keras==2.3.0
- pyyaml==3.13
- wtforms==2.2.1
- h5py==2.8.0
4 changes: 4 additions & 0 deletions start-beta.sh
@@ -0,0 +1,4 @@
#!/bin/bash

MODEL_PATH="$(pwd)/robotreviewer/data"
docker run --name "robotreviewer-beta" --volume ${MODEL_PATH}:/var/lib/deploy/robotreviewer/data --env ROBOTREVIEWER_REST_API=true -d --restart="always" -p 127.0.0.1:5055:5000 robotreviewer-beta
