Skip to content

Commit

Permalink
use paragraphs instead of sentences
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Dec 18, 2023
1 parent 9d4be7c commit 5fd26bb
Show file tree
Hide file tree
Showing 9 changed files with 5,357 additions and 42 deletions.
2 changes: 1 addition & 1 deletion document_qa/document_qa_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def __init__(self,
grobid_client = GrobidClient(
grobid_server=self.grobid_url,
batch_size=1000,
coordinates=["s"],
coordinates=["p"],
sleep_time=5,
timeout=60,
check_server=True
Expand Down
123 changes: 86 additions & 37 deletions document_qa/grobid_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def process_structure(self, input_path, coordinates=False):
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=True,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
Expand Down Expand Up @@ -188,7 +188,7 @@ def parse_grobid_xml(self, text, coordinates=False):
# "passage_id": "title0"
# })

passage_type = "sentence" if coordinates else "paragraph"
passage_type = "paragraph"

if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
passages.append({
Expand All @@ -201,42 +201,74 @@ def parse_grobid_xml(self, text, coordinates=False):
})

soup = BeautifulSoup(text, 'xml')
text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=False)

passages.extend([
{
"text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
text.parent.name != "ref" or (
text.parent.name == "ref" and text.parent.attrs[
'type'] != 'bibr'))),
"type": passage_type,
"section": "<body>",
"subSection": "<sentence>",
"passage_id": str(paragraph_id) + str(sentence_id),
# "coordinates": sentence['coords'].split(";") if coordinates else []
"coordinates": sentence['coords'] if coordinates else ""
}
for paragraph_id, paragraph in enumerate(text_blocks_body) for
sentence_id, sentence in enumerate(paragraph)
])
text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=True)

use_paragraphs = True
if not use_paragraphs:
passages.extend([
{
"text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
text.parent.name != "ref" or (
text.parent.name == "ref" and text.parent.attrs[
'type'] != 'bibr'))),
"type": passage_type,
"section": "<body>",
"subSection": "<paragraph>",
"passage_id": str(paragraph_id),
"coordinates": paragraph['coords'] if coordinates and sentence.has_attr('coords') else ""
}
for paragraph_id, paragraph in enumerate(text_blocks_body) for
sentence_id, sentence in enumerate(paragraph)
])
else:
passages.extend([
{
"text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
text.parent.name != "ref" or (
text.parent.name == "ref" and text.parent.attrs[
'type'] != 'bibr'))),
"type": passage_type,
"section": "<body>",
"subSection": "<paragraph>",
"passage_id": str(paragraph_id),
"coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
}
for paragraph_id, paragraph in enumerate(text_blocks_body)
])

text_blocks_figures = get_children_figures(soup, verbose=False)

passages.extend([
{
"text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
text.parent.name != "ref" or (
text.parent.name == "ref" and text.parent.attrs[
'type'] != 'bibr'))),
"type": passage_type,
"section": "<body>",
"subSection": "<figure>",
"passage_id": str(paragraph_id) + str(sentence_id),
"coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
}
for paragraph_id, paragraph in enumerate(text_blocks_figures) for
sentence_id, sentence in enumerate(paragraph)
])
if not use_paragraphs:
passages.extend([
{
"text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
text.parent.name != "ref" or (
text.parent.name == "ref" and text.parent.attrs[
'type'] != 'bibr'))),
"type": passage_type,
"section": "<body>",
"subSection": "<figure>",
"passage_id": str(paragraph_id) + str(sentence_id),
"coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
}
for paragraph_id, paragraph in enumerate(text_blocks_figures) for
sentence_id, sentence in enumerate(paragraph)
])
else:
passages.extend([
{
"text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
text.parent.name != "ref" or (
text.parent.name == "ref" and text.parent.attrs[
'type'] != 'bibr'))),
"type": passage_type,
"section": "<body>",
"subSection": "<figure>",
"passage_id": str(paragraph_id),
"coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
}
for paragraph_id, paragraph in enumerate(text_blocks_figures)
])

return output_data

Expand Down Expand Up @@ -532,6 +564,21 @@ def extract_quantities(self, text):
def extract_materials(self, text):
    """Extract material mentions from *text* by delegating to the GMP processor."""
    gmp_processor = self.gmp
    return gmp_processor.extract_materials(text)

@staticmethod
def box_to_dict(box, color=None, type=None):

if box is None or box == "" or len(box) < 5:
return {}

item = {"page": box[0], "x": box[1], "y": box[2], "width": box[3], "height": box[4]}
if color is not None:
item['color'] = color

if type:
item['type'] = type

return item

@staticmethod
def prune_overlapping_annotations(entities: list) -> list:
# Sorting by offsets
Expand Down Expand Up @@ -742,7 +789,8 @@ def get_children_body(soup: object, use_paragraphs: object = True, verbose: obje
child_name = "p" if use_paragraphs else "s"
for child in soup.TEI.children:
if child.name == 'text':
children.extend([subchild.find_all(child_name) for subchild in child.find_all("body")])
children.extend(
[subchild for subchild in child.find_all("body") for subchild in subchild.find_all(child_name)])

if verbose:
print(str(children))
Expand All @@ -755,7 +803,8 @@ def get_children_figures(soup: object, use_paragraphs: object = True, verbose: o
child_name = "p" if use_paragraphs else "s"
for child in soup.TEI.children:
if child.name == 'text':
children.extend([subchild.find_all("figDesc") for subchild in child.find_all("body")])
children.extend(
[subchild for subchilds in child.find_all("body") for subchild in subchilds.find_all("figDesc")])

if verbose:
print(str(children))
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ grobid_tei_xml==0.1.3
tqdm
pyyaml==6.0
pytest
streamlit==1.27.2
streamlit==1.29.0
lxml
Beautifulsoup4
python-dotenv
Expand Down
7 changes: 4 additions & 3 deletions streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def play_old_messages():
mode = st.radio("Query mode", ("LLM", "Embeddings"), disabled=not uploaded_file, index=0, horizontal=True,
help="LLM will respond the question, Embedding will show the "
"paragraphs relevant to the question in the paper.")
chunk_size = st.slider("Chunks size", -1, 2000, value=250,
chunk_size = st.slider("Chunks size", -1, 2000, value=-1,
help="Size of chunks in which the document is partitioned",
disabled=uploaded_file is not None)
context_size = st.slider("Context size", 3, 10, value=4,
Expand Down Expand Up @@ -410,8 +410,9 @@ def generate_color_gradient(num_elements):
st.session_state.doc_id,
context_size=context_size)
annotations = [
{"page": coo[0], "x": coo[1], "y": coo[2], "width": coo[3], "height": coo[4], "color": "grey"} for coo in [c.split(",") for coord in
coordinates for c in coord]]
GrobidAggregationProcessor.box_to_dict(coo) for coo in [c.split(",") for coord in
coordinates for c in coord]
]
gradients = generate_color_gradient(len(annotations))
for i, color in enumerate(gradients):
annotations[i]['color'] = color
Expand Down
Empty file added tests/__init__.py
Empty file.
37 changes: 37 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import logging
from pathlib import Path
from unittest.mock import MagicMock

import pytest
from _pytest._py.path import LocalPath

# derived from https://github.com/elifesciences/sciencebeam-trainer-delft/tree/develop/tests

LOGGER = logging.getLogger(__name__)


@pytest.fixture(scope='session', autouse=True)
def setup_logging():
    """Session-wide logging setup: reset root handlers, INFO globally, DEBUG for tests."""
    # Drop any handlers installed before the test session started so that
    # basicConfig below is guaranteed to take effect.
    logging.root.handlers = []
    logging.basicConfig(level=logging.INFO)
    logging.getLogger('tests').setLevel(logging.DEBUG)
    # logging.getLogger('sciencebeam_trainer_delft').setLevel('DEBUG')


def _backport_assert_called(mock: MagicMock) -> None:
    """Stand-in for ``MagicMock.assert_called``: fail unless *mock* was invoked."""
    assert mock.called


@pytest.fixture(scope='session', autouse=True)
def patch_magicmock():
    """Ensure ``MagicMock.assert_called`` exists, backporting it when missing."""
    # hasattr swallows the same AttributeError the original try/except caught.
    if not hasattr(MagicMock, 'assert_called'):
        MagicMock.assert_called = _backport_assert_called


@pytest.fixture
def temp_dir(tmpdir: LocalPath):
    """Expose pytest's legacy ``tmpdir`` fixture as a standard :class:`pathlib.Path`."""
    raw_path = str(tmpdir)
    return Path(raw_path)

Loading

0 comments on commit 5fd26bb

Please sign in to comment.