From ac9436f33362648e64a5d9fe0a32e0bcece74947 Mon Sep 17 00:00:00 2001 From: Iain Marshall Date: Tue, 7 Jan 2020 16:31:46 +0000 Subject: [PATCH] first working version of pico mesh tags --- deploy-beta.sh | 4 + robotreviewer/data/minimap/cui_to_mh.pck | 3 + robotreviewer/data/minimap/ignorelist.txt | 5 + .../minimap/prepositions_conjunctions.txt | 98 +++++++++ robotreviewer/data/minimap/str_to_cui.pck | 3 + robotreviewer/data/minimap/subtrees.pck | 3 + robotreviewer/robots/pico_span_robot.py | 12 +- robotreviewer/textprocessing/minimap.py | 204 ++++++++++++++++++ robotreviewer_env.yml | 4 +- start-beta.sh | 4 + 10 files changed, 335 insertions(+), 5 deletions(-) create mode 100755 deploy-beta.sh create mode 100644 robotreviewer/data/minimap/cui_to_mh.pck create mode 100644 robotreviewer/data/minimap/ignorelist.txt create mode 100644 robotreviewer/data/minimap/prepositions_conjunctions.txt create mode 100644 robotreviewer/data/minimap/str_to_cui.pck create mode 100644 robotreviewer/data/minimap/subtrees.pck create mode 100644 robotreviewer/textprocessing/minimap.py create mode 100755 start-beta.sh diff --git a/deploy-beta.sh b/deploy-beta.sh new file mode 100755 index 0000000..801f627 --- /dev/null +++ b/deploy-beta.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +echo "building image" +docker build -t robotreviewer-beta . 
diff --git a/robotreviewer/data/minimap/cui_to_mh.pck b/robotreviewer/data/minimap/cui_to_mh.pck new file mode 100644 index 0000000..0071306 --- /dev/null +++ b/robotreviewer/data/minimap/cui_to_mh.pck @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35011c62d475be4814f824edcedfc3999c1add6eec34f3f669451c3d9fc0ed7c +size 33833302 diff --git a/robotreviewer/data/minimap/ignorelist.txt b/robotreviewer/data/minimap/ignorelist.txt new file mode 100644 index 0000000..36efb33 --- /dev/null +++ b/robotreviewer/data/minimap/ignorelist.txt @@ -0,0 +1,5 @@ +we +a +suffer +suffering +power \ No newline at end of file diff --git a/robotreviewer/data/minimap/prepositions_conjunctions.txt b/robotreviewer/data/minimap/prepositions_conjunctions.txt new file mode 100644 index 0000000..f7817eb --- /dev/null +++ b/robotreviewer/data/minimap/prepositions_conjunctions.txt @@ -0,0 +1,98 @@ +aboard +about +above +across +after +against +along +alongside +amid +among +amongst +apropos +around +as +astride +at +bar +before +behind +below +beneath +beside +besides +between +beyond +but +by +circa +come +despite +down +during +except +for +from +in +inside +into +less +like +minus +near +nearer +nearest +notwithstanding +of +off +on +onto +opposite +out +outside +over +past +per +plus +post +pre +pro +re +sans +save +short +since +than +through +throughout +till +to +toward +towards +under +underneath +unlike +until +unto +up +upon +upside +versus +vs +v +via +with +within +without +worth +and +but +because +although +or +provided that +as long as +in order that +in order to +nor +but also \ No newline at end of file diff --git a/robotreviewer/data/minimap/str_to_cui.pck b/robotreviewer/data/minimap/str_to_cui.pck new file mode 100644 index 0000000..6dcfc7f --- /dev/null +++ b/robotreviewer/data/minimap/str_to_cui.pck @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b85990f7bda4b12e6dcb4dbea5668c8c7457cc43232867609c45b3b8ab97810e +size 47499818 
diff --git a/robotreviewer/data/minimap/subtrees.pck b/robotreviewer/data/minimap/subtrees.pck new file mode 100644 index 0000000..9594eaf --- /dev/null +++ b/robotreviewer/data/minimap/subtrees.pck @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bc67848a03e193f45f417750d27f5d3daf25afb4a419af5f99499a85cb364d0 +size 1866171 diff --git a/robotreviewer/robots/pico_span_robot.py b/robotreviewer/robots/pico_span_robot.py index 726003c..a9ec93d 100644 --- a/robotreviewer/robots/pico_span_robot.py +++ b/robotreviewer/robots/pico_span_robot.py @@ -29,6 +29,8 @@ from robotreviewer.textprocessing import tokenizer from bert_serving.client import BertClient +from robotreviewer.textprocessing import minimap +from robotreviewer.textprocessing import schwartz_hearst log = logging.getLogger(__name__) from celery.contrib import rdb @@ -74,7 +76,7 @@ def __init__(self): self.bert = BertClient() - def api_annotate(self, articles, get_berts=True): + def api_annotate(self, articles, get_berts=True, get_meshes=True): if not (all(('parsed_ab' in article for article in articles)) and all(('parsed_ti' in article for article in articles))): raise Exception('PICO span model requires a title and abstract to be able to complete annotation') @@ -83,7 +85,7 @@ if article.get('skip_annotation'): annotations.append([]) else: - annotations.append(self.annotate({"title": article['parsed_ti'], "abstract": article['parsed_ab']}, get_berts=get_berts)) + annotations.append(self.annotate({"title": article['parsed_ti'], "abstract": article['parsed_ab']}, get_berts=get_berts, get_meshes=get_meshes)) return annotations @@ -108,7 +110,7 @@ return data - def annotate(self, article, get_berts=True): + def annotate(self, article, get_berts=True, get_meshes=True): """ Annotate abstract of clinical trial report @@ -160,6 +162,10 @@ else: out[bert_out_key] = [r.tolist() 
for r in self.bert.encode(bert_q)] + if get_meshes: + abbrev_dict = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=article['abstract'].text) + for k in ['population', 'interventions', 'outcomes']: + out[f"{k}_mesh"] = minimap.get_unique_terms(out[k], abbrevs=abbrev_dict) return out diff --git a/robotreviewer/textprocessing/minimap.py b/robotreviewer/textprocessing/minimap.py new file mode 100644 index 0000000..7a4f7d5 --- /dev/null +++ b/robotreviewer/textprocessing/minimap.py @@ -0,0 +1,204 @@ +# +# minimap +# + +import spacy +from spacy.tokens import Doc +from itertools import chain +import os +import robotreviewer +import pickle + +nlp = spacy.load("en") + +# ignore list +with open(os.path.join(robotreviewer.DATA_ROOT, 'minimap', 'ignorelist.txt'), 'r') as f: + ignores = set((l.strip() for l in f)) + + +with open(os.path.join(robotreviewer.DATA_ROOT, 'minimap', 'str_to_cui.pck'), 'rb') as f: + str_to_cui = pickle.load(f) + + +with open(os.path.join(robotreviewer.DATA_ROOT, 'minimap', 'cui_to_mh.pck'), 'rb') as f: + cui_to_mh = pickle.load(f) + +# some extra filtering rules to improve precision + +drop_terms = set() + +for k, v in str_to_cui.items(): + # strings which are too ambiguous (too many CUIs... 
15 from experimentation) + if len(set(v))>15: + drop_terms.add(k) + + +for k, v in str_to_cui.items(): + # strings which are too short to be informative (2 chars or less tends to generate nonsense CUIs) + if len(k)<=2: + drop_terms.add(k) + +for t in drop_terms: + str_to_cui.pop(t) + + +# regular expressions and text processing functions + +import re + +with open(os.path.join(robotreviewer.DATA_ROOT, 'minimap','prepositions_conjunctions.txt'), 'r') as f: + prep_conj = [l.strip() for l in f] + +prep_conj_re = re.compile(r'\b({})\b'.format('|'.join(prep_conj))) +nos_ignore = re.compile(r'\bNOS\b') # note do after lowercase +pos_ignore = re.compile(r"(?<=\w)(\'s?)\b") +left_paren = re.compile(r"^\[(X|V|D|M|EDTA|SO|Q)\]") +paren = re.compile(r"[\(\[]\w+[\)\]]") +strip_space = re.compile(r"\s+") + +def remove_nos(text): + return nos_ignore.sub(' ', text) + +def remove_pos(text): + return pos_ignore.sub('', text) + +def syn_uninv(text): + try: + inversion_point = text.index(', ') + except ValueError: + # not found + return text + + if inversion_point+2 == len(text): + # i.e. if the ', ' is at the end of the string + return text + + if prep_conj_re.search(text[inversion_point+2:]): + return text + else: + return text[inversion_point+2:] + " " + text[:inversion_point] + +def ne_parentheticals(text_str): + text_str = left_paren.sub('', text_str) + text_str = paren.sub('', text_str) + return text_str + +def get_lemma(t): + if t.text in exceptions: + return exceptions[t.text] + else: + return t.lemma_ + +# pipelines + +def minimap(text_str, chunks=False, abbrevs=None): + return matcher(pipeline(text_str, umls_mode=False, abbrevs=abbrevs), chunks=chunks) + + +def pipeline(text_str, umls_mode=True, abbrevs=None): + + # sub out abbreviations if abbreviation dict given + if abbrevs: + for abbrev, expansion in abbrevs.items(): + try: + text_str = re.sub(r"\b" + re.escape(abbrev) + r"\b", expansion, text_str) + + except: + print(f"Regex error caused for one abstract! 
(for text string '{text_str}')") + print(f"and abbreviation dictionary '{abbrevs}'") + # to avoid weird errors in abbreviations generating error causing regex strings (which are not causing a named exception) + continue + + # 1. removal of parentheticals + # if umls_mode: + text_str = ne_parentheticals(text_str) + + # hyphens to spaces + text_str = text_str.replace('-', ' ') + # 3. conversion to lowercase + # text_str = text_str.lower() + # 2. syntactic uninverstion + if umls_mode: + text_str = syn_uninv(text_str) + # 4. stripping of possessives + text_str = remove_pos(text_str) + # strip NOS's + if umls_mode: + text_str = remove_nos(text_str) + # last... remove any multiple spaces, or starting/ending with space + text_str = strip_space.sub(' ', text_str) + text_str = text_str.strip() + return text_str + + + +from itertools import chain + + +def matcher(text, chunks=False): + doc = nlp(text.lower()) + + if chunks: + return list(chain.from_iterable(matcher(np.text, chunks=False) for np in doc.noun_chunks)) + tokens = [t.text.lower() for t in doc] + lemmas = [t.lemma_ for t in doc if t.text.lower()] + lemmas = [l for l in lemmas if l != '-PRON-'] + + + matches = [] + max_len = len(doc) + window = max_len + + + while window: + + for i in range(max_len - window + 1): + window_text = ' '.join(tokens[i:i+window]) + window_lemma = ' '.join(lemmas[i:i+window]) + + + if window_lemma and window_lemma in str_to_cui and window_lemma not in ignores and window_text \ + not in nlp.Defaults.stop_words: + + + for entry in str_to_cui[window_lemma]: + mh = cui_to_mh[entry].copy() + mh['start_idx'] = i + mh['end_idx'] = i+window + mh['source_text'] = doc[mh['start_idx']:mh['end_idx']].text + matches.append(mh) + + window -= 1 + + + + matches.sort(key=lambda x: (x['start_idx'], -x['end_idx'])) + + + + filtered_terms = [] + + right_border = 0 + for match in matches: + if match['start_idx'] >= right_border: + filtered_terms.append(match) + right_border = match['end_idx'] + + return 
filtered_terms + + +def get_unique_terms(l, abbrevs=None): + + terms = [minimap(s, abbrevs=abbrevs) for s in l] + flat_terms = [item for sublist in terms for item in sublist] + encountered_terms = set() + unique_terms = [] + for term in flat_terms: + if term['cui'] not in encountered_terms: + term.pop('start_idx') + term.pop('end_idx') + term.pop('source_text') + unique_terms.append(term) + encountered_terms.add(term['cui']) + return unique_terms + diff --git a/robotreviewer_env.yml b/robotreviewer_env.yml index 0075a46..ded8102 100644 --- a/robotreviewer_env.yml +++ b/robotreviewer_env.yml @@ -26,8 +26,8 @@ dependencies: - celery==4.2.1 - flask-wtf==0.14.2 - fuzzywuzzy==0.17.0 - # - keras==2.1.5 - - keras==2.3.0 + - keras==2.1.5 + # - keras==2.3.0 - pyyaml==3.13 - wtforms==2.2.1 - h5py==2.8.0 diff --git a/start-beta.sh b/start-beta.sh new file mode 100755 index 0000000..02f8368 --- /dev/null +++ b/start-beta.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +MODEL_PATH="$(pwd)/robotreviewer/data" +docker run --name "robotreviewer-beta" --volume ${MODEL_PATH}:/var/lib/deploy/robotreviewer/data --env ROBOTREVIEWER_REST_API=true -d --restart="always" -p 127.0.0.1:5055:5000 robotreviewer-beta