From ac9436f33362648e64a5d9fe0a32e0bcece74947 Mon Sep 17 00:00:00 2001 From: Iain Marshall Date: Tue, 7 Jan 2020 16:31:46 +0000 Subject: [PATCH] first working version of pico mesh tags --- deploy-beta.sh | 4 + robotreviewer/data/minimap/cui_to_mh.pck | 3 + robotreviewer/data/minimap/ignorelist.txt | 5 + .../minimap/prepositions_conjunctions.txt | 98 +++++++++ robotreviewer/data/minimap/str_to_cui.pck | 3 + robotreviewer/data/minimap/subtrees.pck | 3 + robotreviewer/robots/pico_span_robot.py | 12 +- robotreviewer/textprocessing/minimap.py | 204 ++++++++++++++++++ robotreviewer_env.yml | 4 +- start-beta.sh | 4 + 10 files changed, 335 insertions(+), 5 deletions(-) create mode 100755 deploy-beta.sh create mode 100644 robotreviewer/data/minimap/cui_to_mh.pck create mode 100644 robotreviewer/data/minimap/ignorelist.txt create mode 100644 robotreviewer/data/minimap/prepositions_conjunctions.txt create mode 100644 robotreviewer/data/minimap/str_to_cui.pck create mode 100644 robotreviewer/data/minimap/subtrees.pck create mode 100644 robotreviewer/textprocessing/minimap.py create mode 100755 start-beta.sh diff --git a/deploy-beta.sh b/deploy-beta.sh new file mode 100755 index 0000000..801f627 --- /dev/null +++ b/deploy-beta.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +echo "building image" +docker build -t robotreviewer-beta . 
diff --git a/robotreviewer/data/minimap/cui_to_mh.pck b/robotreviewer/data/minimap/cui_to_mh.pck new file mode 100644 index 0000000..0071306 --- /dev/null +++ b/robotreviewer/data/minimap/cui_to_mh.pck @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35011c62d475be4814f824edcedfc3999c1add6eec34f3f669451c3d9fc0ed7c +size 33833302 diff --git a/robotreviewer/data/minimap/ignorelist.txt b/robotreviewer/data/minimap/ignorelist.txt new file mode 100644 index 0000000..36efb33 --- /dev/null +++ b/robotreviewer/data/minimap/ignorelist.txt @@ -0,0 +1,5 @@ +we +a +suffer +suffering +power \ No newline at end of file diff --git a/robotreviewer/data/minimap/prepositions_conjunctions.txt b/robotreviewer/data/minimap/prepositions_conjunctions.txt new file mode 100644 index 0000000..f7817eb --- /dev/null +++ b/robotreviewer/data/minimap/prepositions_conjunctions.txt @@ -0,0 +1,98 @@ +aboard +about +above +across +after +against +along +alongside +amid +among +amongst +apropos +around +as +astride +at +bar +before +behind +below +beneath +beside +besides +between +beyond +but +by +circa +come +despite +down +during +except +for +from +in +inside +into +less +like +minus +near +nearer +nearest +notwithstanding +of +off +on +onto +opposite +out +outside +over +past +per +plus +post +pre +pro +re +sans +save +short +since +than +through +throughout +till +to +toward +towards +under +underneath +unlike +until +unto +up +upon +upside +versus +vs +v +via +with +within +without +worth +and +but +because +although +or +provided that +as long as +in order that +in order to +nor +but also \ No newline at end of file diff --git a/robotreviewer/data/minimap/str_to_cui.pck b/robotreviewer/data/minimap/str_to_cui.pck new file mode 100644 index 0000000..6dcfc7f --- /dev/null +++ b/robotreviewer/data/minimap/str_to_cui.pck @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b85990f7bda4b12e6dcb4dbea5668c8c7457cc43232867609c45b3b8ab97810e +size 47499818 
diff --git a/robotreviewer/data/minimap/subtrees.pck b/robotreviewer/data/minimap/subtrees.pck new file mode 100644 index 0000000..9594eaf --- /dev/null +++ b/robotreviewer/data/minimap/subtrees.pck @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bc67848a03e193f45f417750d27f5d3daf25afb4a419af5f99499a85cb364d0 +size 1866171 diff --git a/robotreviewer/robots/pico_span_robot.py b/robotreviewer/robots/pico_span_robot.py index 726003c..a9ec93d 100644 --- a/robotreviewer/robots/pico_span_robot.py +++ b/robotreviewer/robots/pico_span_robot.py @@ -29,6 +29,8 @@ from robotreviewer.textprocessing import tokenizer from bert_serving.client import BertClient +from robotreviewer.textprocessing import minimap +from robotreviewer.textprocessing import schwartz_hearst log = logging.getLogger(__name__) from celery.contrib import rdb @@ -74,7 +76,7 @@ def __init__(self): self.bert = BertClient() - def api_annotate(self, articles, get_berts=True): + def api_annotate(self, articles, get_berts=True, get_meshes=True): if not (all(('parsed_ab' in article for article in articles)) and all(('parsed_ti' in article for article in articles))): raise Exception('PICO span model requires a title and abstract to be able to complete annotation') @@ -83,7 +85,7 @@ if article.get('skip_annotation'): annotations.append([]) else: - annotations.append(self.annotate({"title": article['parsed_ti'], "abstract": article['parsed_ab']}, get_berts=get_berts)) + annotations.append(self.annotate({"title": article['parsed_ti'], "abstract": article['parsed_ab']}, get_berts=get_berts, get_meshes=get_meshes)) return annotations @@ -108,7 +110,7 @@ return data - def annotate(self, article, get_berts=True): + def annotate(self, article, get_berts=True, get_meshes=True): """ Annotate abstract of clinical trial report @@ -160,6 +162,10 @@ else: out[bert_out_key] = [r.tolist() 
for r in self.bert.encode(bert_q)] + if get_meshes: + abbrev_dict = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=article['abstract'].text) + for k in ['population', 'interventions', 'outcomes']: + out[f"{k}_mesh"] = minimap.get_unique_terms(out[k], abbrevs=abbrev_dict) return out diff --git a/robotreviewer/textprocessing/minimap.py b/robotreviewer/textprocessing/minimap.py new file mode 100644 index 0000000..7a4f7d5 --- /dev/null +++ b/robotreviewer/textprocessing/minimap.py @@ -0,0 +1,204 @@ +# +# minimap +# + +import spacy +from spacy.tokens import Doc +from itertools import chain +import os +import robotreviewer +import pickle + +nlp = spacy.load("en") + +# ignore list +with open(os.path.join(robotreviewer.DATA_ROOT, 'minimap', 'ignorelist.txt'), 'r') as f: + ignores = set((l.strip() for l in f)) + + +with open(os.path.join(robotreviewer.DATA_ROOT, 'minimap', 'str_to_cui.pck'), 'rb') as f: + str_to_cui = pickle.load(f) + + +with open(os.path.join(robotreviewer.DATA_ROOT, 'minimap', 'cui_to_mh.pck'), 'rb') as f: + cui_to_mh = pickle.load(f) + +# some extra filtering rules to improve precision + +drop_terms = set() + +for k, v in str_to_cui.items(): + # strings which are too ambiguous (too many CUIs... 
15 from experimentation) + if len(set(v))>15: + drop_terms.add(k) + + +for k, v in str_to_cui.items(): + # strings which are too short to be informative (2 chars or less tends to generate nonsense CUIs) + if len(k)<=2: + drop_terms.add(k) + +for t in drop_terms: + str_to_cui.pop(t) + + +# regular expressions and text processing functions + +import re + +with open(os.path.join(robotreviewer.DATA_ROOT, 'minimap','prepositions_conjunctions.txt'), 'r') as f: + prep_conj = [l.strip() for l in f] + +prep_conj_re = re.compile(r'\b({})\b'.format('|'.join(prep_conj))) +nos_ignore = re.compile(r'\bNOS\b') # note do after lowercase +pos_ignore = re.compile(r"(?<=\w)(\'s?)\b") +left_paren = re.compile(r"^\[(X|V|D|M|EDTA|SO|Q)\]") +paren = re.compile(r"[\(\[]\w+[\)\]]") +strip_space = re.compile(r"\s+") + +def remove_nos(text): + return nos_ignore.sub(' ', text) + +def remove_pos(text): + return pos_ignore.sub('', text) + +def syn_uninv(text): + try: + inversion_point = text.index(', ') + except ValueError: + # not found + return text + + if inversion_point+2 == len(text): + # i.e. if the ', ' is at the end of the string + return text + + if prep_conj_re.search(text[inversion_point+2:]): + return text + else: + return text[inversion_point+2:] + " " + text[:inversion_point] + +def ne_parentheticals(text_str): + text_str = left_paren.sub('', text_str) + text_str = paren.sub('', text_str) + return text_str + +def get_lemma(t): + if t.text in exceptions: + return exceptions[t.text] + else: + return t.lemma_ + +# pipelines + +def minimap(text_str, chunks=False, abbrevs=None): + return matcher(pipeline(text_str, umls_mode=False, abbrevs=abbrevs), chunks=chunks) + + +def pipeline(text_str, umls_mode=True, abbrevs=None): + + # sub out abbreviations if abbreviation dict given + if abbrevs: + for abbrev, expansion in abbrevs.items(): + try: + text_str = re.sub(r"\b" + re.escape(abbrev) + r"\b", expansion, text_str) + + except: + print(f"Regex error caused for one abstract! 
(for text string '{text_str}')") + print(f"and abbreviation dictionary '{abbrevs}'") + # to avoid weird errors in abbreviations generating error causing regex strings (which are not causing a named exception) + continue + + # 1. removal of parentheticals + # if umls_mode: + text_str = ne_parentheticals(text_str) + + # hyphens to spaces + text_str = text_str.replace('-', ' ') + # 3. conversion to lowercase + # text_str = text_str.lower() + # 2. syntactic uninverstion + if umls_mode: + text_str = syn_uninv(text_str) + # 4. stripping of possessives + text_str = remove_pos(text_str) + # strip NOS's + if umls_mode: + text_str = remove_nos(text_str) + # last... remove any multiple spaces, or starting/ending with space + text_str = strip_space.sub(' ', text_str) + text_str = text_str.strip() + return text_str + + + +from itertools import chain + + +def matcher(text, chunks=False): + doc = nlp(text.lower()) + + if chunks: + return list(chain.from_iterable(matcher(np.text, chunks=False) for np in doc.noun_chunks)) + tokens = [t.text.lower() for t in doc] + lemmas = [t.lemma_ for t in doc if t.text.lower()] + lemmas = [l for l in lemmas if l != '-PRON-'] + + + matches = [] + max_len = len(doc) + window = max_len + + + while window: + + for i in range(max_len - window + 1): + window_text = ' '.join(tokens[i:i+window]) + window_lemma = ' '.join(lemmas[i:i+window]) + + + if window_lemma and window_lemma in str_to_cui and window_lemma not in ignores and window_text \ + not in nlp.Defaults.stop_words: + + + for entry in str_to_cui[window_lemma]: + mh = cui_to_mh[entry].copy() + mh['start_idx'] = i + mh['end_idx'] = i+window + mh['source_text'] = doc[mh['start_idx']:mh['end_idx']].text + matches.append(mh) + + window -= 1 + + + + matches.sort(key=lambda x: (x['start_idx'], -x['end_idx'])) + + + + filtered_terms = [] + + right_border = 0 + for match in matches: + if match['start_idx'] >= right_border: + filtered_terms.append(match) + right_border = match['end_idx'] + + return 
filtered_terms + + +def get_unique_terms(l, abbrevs=None): + + terms = [minimap(s, abbrevs=abbrevs) for s in l] + flat_terms = [item for sublist in terms for item in sublist] + encountered_terms = set() + unique_terms = [] + for term in flat_terms: + if term['cui'] not in encountered_terms: + term.pop('start_idx') + term.pop('end_idx') + term.pop('source_text') + unique_terms.append(term) + encountered_terms.add(term['cui']) + return unique_terms + diff --git a/robotreviewer_env.yml b/robotreviewer_env.yml index 0075a46..ded8102 100644 --- a/robotreviewer_env.yml +++ b/robotreviewer_env.yml @@ -26,8 +26,8 @@ dependencies: - celery==4.2.1 - flask-wtf==0.14.2 - fuzzywuzzy==0.17.0 - # - keras==2.1.5 - - keras==2.3.0 + - keras==2.1.5 + # - keras==2.3.0 - pyyaml==3.13 - wtforms==2.2.1 - h5py==2.8.0 diff --git a/start-beta.sh b/start-beta.sh new file mode 100755 index 0000000..02f8368 --- /dev/null +++ b/start-beta.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +MODEL_PATH="$(pwd)/robotreviewer/data" +docker run --name "robotreviewer-beta" --volume ${MODEL_PATH}:/var/lib/deploy/robotreviewer/data --env ROBOTREVIEWER_REST_API=true -d --restart="always" -p 127.0.0.1:5055:5000 robotreviewer-beta