
Commit

gather all output into one dict
khuangaf committed Jul 2, 2020
1 parent 6f3119e commit c6e4d65
Showing 2 changed files with 22 additions and 19 deletions.
33 changes: 18 additions & 15 deletions predict.py
@@ -217,7 +217,7 @@ def create_json_output(tmp_file_dir, doc_id):
def same_event(event1, event2):
return event1['triggers'] == event2['triggers'] and event1['event_type'] == event2['event_type'] and event1['arguments'] == event2['arguments']

- res = []
+ res = [{}]
with open(f'{tmp_file_dir}/{doc_id}.a1','r') as f:
lines = [re.split('\s', line.strip()) for line in f.readlines()]
try:
@@ -236,20 +236,23 @@ def same_event(event1, event2):

tokens = preprocess_result['tokens']
ner = preprocess_result['ner']
- char2senttoken_map = preprocess_result['char2senttoken_map']
+ char2doctoken_map = preprocess_result['char2doctoken_map']
sentence_offsets = preprocess_result['sentence_offsets']

# create a list of character offsets for the first token of each sentence, to determine which sentence the event belongs to


# construct output
- for tok, ne in zip(tokens, ner):
-     cur = {
-         'tokens':tok,
-         'events':[],
-         'ner':ne
-     }
-     res.append(cur)
+ # for tok, ne in zip(tokens, ner):
+ #     cur = {
+ #         'tokens':tok,
+ #         'events':[],
+ #         'ner':ne
+ #     }
+ #     res.append(cur)
+ res[0]['tokens'] = tokens
+ res[0]['events'] = []
+ res[0]['ner'] = ner

# construct event
for line in lines:
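
Editor's note: after this hunk, res no longer holds one record per sentence; everything is gathered into a single document-level dict at res[0], with a flat token list and document-level NER spans. A rough sketch of the intended shape (the token and span values below are made-up, not real model output):

res = [{
    'tokens': ['BMP-6', 'rapidly', 'induced', 'phosphorylation', 'of', 'Smad1'],  # flat, document-level
    'events': [],                                   # filled by the event-construction loop below
    'ner': [[0, 0, 'PROTEIN'], [5, 5, 'PROTEIN']],  # [start_token, end_token, entity_type]
}]
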
@@ -272,15 +275,15 @@ def same_event(event1, event2):
trigger_start_char_offset = entity_map[entity_id][0]

# find which sentence this event belongs to.
- sentence_idx = bisect_right(sentence_offsets, trigger_start_char_offset) - 1
+ sentence_idx = 0

current_event['event_type'] = event_type
current_event['triggers'] = [{'event_type':event_type,
'text': entity_map[entity_id][2],
- 'start_token': char2senttoken_map[list(char2senttoken_map.keys())[bisect_right(list(char2senttoken_map.keys()), entity_map[entity_id][0]) - 1]],
+ 'start_token': char2doctoken_map[list(char2doctoken_map.keys())[bisect_right(list(char2doctoken_map.keys()), entity_map[entity_id][0]) - 1]],

# the first -1 goes back one character because GENIA annotation at the character level is exclusive at the end, i.e. [start, end)
- 'end_token': char2senttoken_map[list(char2senttoken_map.keys())[bisect_right(list(char2senttoken_map.keys()), entity_map[entity_id][1] -1) -1]]
+ 'end_token': char2doctoken_map[list(char2doctoken_map.keys())[bisect_right(list(char2doctoken_map.keys()), entity_map[entity_id][1] -1) -1]]
}]

current_event['arguments'] = []
Expand All @@ -291,9 +294,9 @@ def same_event(event1, event2):
current_argumet = {
'role':role,
'text': entity_map[entity_id][2],
- 'start_token': char2senttoken_map[list(char2senttoken_map.keys())[bisect_right(list(char2senttoken_map.keys()), entity_map[entity_id][0]) - 1]],
+ 'start_token': char2doctoken_map[list(char2doctoken_map.keys())[bisect_right(list(char2doctoken_map.keys()), entity_map[entity_id][0]) - 1]],
# the first -1 goes back one character because GENIA annotation at the character level is exclusive at the end, i.e. [start, end)
- 'end_token': char2senttoken_map[list(char2senttoken_map.keys())[bisect_right(list(char2senttoken_map.keys()), entity_map[entity_id][1] -1) -1]]
+ 'end_token': char2doctoken_map[list(char2doctoken_map.keys())[bisect_right(list(char2doctoken_map.keys()), entity_map[entity_id][1] -1) -1]]
}
current_event['arguments'].append(current_argumet)

@@ -383,4 +386,4 @@ def biomedical_evet_extraction(user_input):

##
# print(torch.cuda.is_available())
biomedical_evet_extraction("This region is termed the CK-1 or CD28RE and appears to bind specific members of the NF-kappa B family of transcription factors. Human T leukemia virus type 1 (HTLV-1) infects T cells and can lead to increase GM-CSF expression.")
biomedical_evet_extraction("The B cells were found to express BMP type I and type II receptors and BMP-6 rapidly induced phosphorylation of Smad1/5/8.")
8 changes: 4 additions & 4 deletions utils.py
@@ -236,18 +236,18 @@ def process_document(document, doc_id, output_dir, corpus_proteinOrigIdBySpan):
document_entities = filter_protein_entities(document.ents, document)

# a list of list of tokens
- tokens = [[str(tok) for tok in sent] for sent in document.sents]
+ tokens = [str(tok) for tok in document]

sentence_offsets = [sent[0].idx for sent in document.sents]
# token index 2 entity map
tokidx2ent_map = {token_idx: token.ent_type_ for token_idx, token in enumerate(document)}

# map each token's starting character offset to a token index
- char2senttoken_map = {tok.idx:sent_tok_id for sent in document.sents for sent_tok_id, tok in enumerate(sent)}
+ char2doctoken_map = {tok.idx:doc_tok_id for doc_tok_id, tok in enumerate(document)}

# ner for each sentence.
# -sent.start to convert to sentence-level annotation
- ner = [[[ent.start - sent.start, ent.end-1-sent.start, tokidx2ent_map[ent.start]] for ent in sent.ents] for sent in document.sents]
+ ner = [[ent.start , ent.end-1, tokidx2ent_map[ent.start]] for ent in document.ents]

# create mapping from starting character position to entity id
ent_char2_id_map = {ent.start_char:f'T{idx+1}' for idx, ent in enumerate(document_entities)}
@@ -262,7 +262,7 @@ def process_document(document, doc_id, output_dir, corpus_proteinOrigIdBySpan):
preprocess_result = {
'ner':ner,
'tokens':tokens,
- 'char2senttoken_map':char2senttoken_map,
+ 'char2doctoken_map':char2doctoken_map,
'sentence_offsets':sentence_offsets
}
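
Editor's note: taken together, the utils.py side of the commit replaces all sentence-level indexing with document-level indexing. A simplified sketch of the new preprocessing, assuming spaCy with the en_core_web_sm pipeline as a stand-in (the repo's actual model and its protein-entity filtering are omitted):

import spacy

nlp = spacy.load("en_core_web_sm")  # illustrative pipeline; any parser+NER model works
document = nlp("BMP-6 rapidly induced phosphorylation of Smad1.")

# flat, document-level token list (previously one list per sentence)
tokens = [str(tok) for tok in document]

# character offset of each sentence's first token, kept for later bisecting
sentence_offsets = [sent[0].idx for sent in document.sents]

# token index -> entity type
tokidx2ent_map = {token_idx: token.ent_type_ for token_idx, token in enumerate(document)}

# token start character offset -> document-level token index (keys ascend)
char2doctoken_map = {tok.idx: doc_tok_id for doc_tok_id, tok in enumerate(document)}

# document-level NER spans as [start_token, end_token, entity_type], end inclusive
ner = [[ent.start, ent.end - 1, tokidx2ent_map[ent.start]] for ent in document.ents]

preprocess_result = {
    'ner': ner,
    'tokens': tokens,
    'char2doctoken_map': char2doctoken_map,
    'sentence_offsets': sentence_offsets,
}
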

