Skip to content

Commit

Permalink
use paragraphs instead of sentences
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Dec 18, 2023
1 parent 9d4be7c commit 5fd26bb
Show file tree
Hide file tree
Showing 9 changed files with 5,357 additions and 42 deletions.
2 changes: 1 addition & 1 deletion document_qa/document_qa_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def __init__(self,
grobid_client = GrobidClient(
grobid_server=self.grobid_url,
batch_size=1000,
coordinates=["s"],
coordinates=["p"],
sleep_time=5,
timeout=60,
check_server=True
Expand Down
123 changes: 86 additions & 37 deletions document_qa/grobid_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def process_structure(self, input_path, coordinates=False):
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=True,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
Expand Down Expand Up @@ -188,7 +188,7 @@ def parse_grobid_xml(self, text, coordinates=False):
# "passage_id": "title0"
# })

passage_type = "sentence" if coordinates else "paragraph"
passage_type = "paragraph"

if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
passages.append({
Expand All @@ -201,42 +201,74 @@ def parse_grobid_xml(self, text, coordinates=False):
})

soup = BeautifulSoup(text, 'xml')
text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=False)

passages.extend([
{
"text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
text.parent.name != "ref" or (
text.parent.name == "ref" and text.parent.attrs[
'type'] != 'bibr'))),
"type": passage_type,
"section": "<body>",
"subSection": "<sentence>",
"passage_id": str(paragraph_id) + str(sentence_id),
# "coordinates": sentence['coords'].split(";") if coordinates else []
"coordinates": sentence['coords'] if coordinates else ""
}
for paragraph_id, paragraph in enumerate(text_blocks_body) for
sentence_id, sentence in enumerate(paragraph)
])
text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=True)

use_paragraphs = True
if not use_paragraphs:
passages.extend([
{
"text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
text.parent.name != "ref" or (
text.parent.name == "ref" and text.parent.attrs[
'type'] != 'bibr'))),
"type": passage_type,
"section": "<body>",
"subSection": "<paragraph>",
"passage_id": str(paragraph_id),
"coordinates": paragraph['coords'] if coordinates and sentence.has_attr('coords') else ""
}
for paragraph_id, paragraph in enumerate(text_blocks_body) for
sentence_id, sentence in enumerate(paragraph)
])
else:
passages.extend([
{
"text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
text.parent.name != "ref" or (
text.parent.name == "ref" and text.parent.attrs[
'type'] != 'bibr'))),
"type": passage_type,
"section": "<body>",
"subSection": "<paragraph>",
"passage_id": str(paragraph_id),
"coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
}
for paragraph_id, paragraph in enumerate(text_blocks_body)
])

text_blocks_figures = get_children_figures(soup, verbose=False)

passages.extend([
{
"text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
text.parent.name != "ref" or (
text.parent.name == "ref" and text.parent.attrs[
'type'] != 'bibr'))),
"type": passage_type,
"section": "<body>",
"subSection": "<figure>",
"passage_id": str(paragraph_id) + str(sentence_id),
"coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
}
for paragraph_id, paragraph in enumerate(text_blocks_figures) for
sentence_id, sentence in enumerate(paragraph)
])
if not use_paragraphs:
passages.extend([
{
"text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
text.parent.name != "ref" or (
text.parent.name == "ref" and text.parent.attrs[
'type'] != 'bibr'))),
"type": passage_type,
"section": "<body>",
"subSection": "<figure>",
"passage_id": str(paragraph_id) + str(sentence_id),
"coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
}
for paragraph_id, paragraph in enumerate(text_blocks_figures) for
sentence_id, sentence in enumerate(paragraph)
])
else:
passages.extend([
{
"text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
text.parent.name != "ref" or (
text.parent.name == "ref" and text.parent.attrs[
'type'] != 'bibr'))),
"type": passage_type,
"section": "<body>",
"subSection": "<figure>",
"passage_id": str(paragraph_id),
"coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
}
for paragraph_id, paragraph in enumerate(text_blocks_figures)
])

return output_data

Expand Down Expand Up @@ -532,6 +564,21 @@ def extract_quantities(self, text):
def extract_materials(self, text):
    """Extract material mentions from *text* by delegating to the GMP processor."""
    gmp_processor = self.gmp
    return gmp_processor.extract_materials(text)

@staticmethod
def box_to_dict(box, color=None, type=None):

if box is None or box == "" or len(box) < 5:
return {}

item = {"page": box[0], "x": box[1], "y": box[2], "width": box[3], "height": box[4]}
if color is not None:
item['color'] = color

if type:
item['type'] = type

return item

@staticmethod
def prune_overlapping_annotations(entities: list) -> list:
# Sorting by offsets
Expand Down Expand Up @@ -742,7 +789,8 @@ def get_children_body(soup: object, use_paragraphs: object = True, verbose: obje
child_name = "p" if use_paragraphs else "s"
for child in soup.TEI.children:
if child.name == 'text':
children.extend([subchild.find_all(child_name) for subchild in child.find_all("body")])
children.extend(
[subchild for subchild in child.find_all("body") for subchild in subchild.find_all(child_name)])

if verbose:
print(str(children))
Expand All @@ -755,7 +803,8 @@ def get_children_figures(soup: object, use_paragraphs: object = True, verbose: o
child_name = "p" if use_paragraphs else "s"
for child in soup.TEI.children:
if child.name == 'text':
children.extend([subchild.find_all("figDesc") for subchild in child.find_all("body")])
children.extend(
[subchild for subchilds in child.find_all("body") for subchild in subchilds.find_all("figDesc")])

if verbose:
print(str(children))
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ grobid_tei_xml==0.1.3
tqdm
pyyaml==6.0
pytest
streamlit==1.27.2
streamlit==1.29.0
lxml
Beautifulsoup4
python-dotenv
Expand Down
7 changes: 4 additions & 3 deletions streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def play_old_messages():
mode = st.radio("Query mode", ("LLM", "Embeddings"), disabled=not uploaded_file, index=0, horizontal=True,
help="LLM will respond the question, Embedding will show the "
"paragraphs relevant to the question in the paper.")
chunk_size = st.slider("Chunks size", -1, 2000, value=250,
chunk_size = st.slider("Chunks size", -1, 2000, value=-1,
help="Size of chunks in which the document is partitioned",
disabled=uploaded_file is not None)
context_size = st.slider("Context size", 3, 10, value=4,
Expand Down Expand Up @@ -410,8 +410,9 @@ def generate_color_gradient(num_elements):
st.session_state.doc_id,
context_size=context_size)
annotations = [
{"page": coo[0], "x": coo[1], "y": coo[2], "width": coo[3], "height": coo[4], "color": "grey"} for coo in [c.split(",") for coord in
coordinates for c in coord]]
GrobidAggregationProcessor.box_to_dict(coo) for coo in [c.split(",") for coord in
coordinates for c in coord]
]
gradients = generate_color_gradient(len(annotations))
for i, color in enumerate(gradients):
annotations[i]['color'] = color
Expand Down
Empty file added tests/__init__.py
Empty file.
37 changes: 37 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import logging
from pathlib import Path
from unittest.mock import MagicMock

import pytest
from _pytest._py.path import LocalPath

# derived from https://github.com/elifesciences/sciencebeam-trainer-delft/tree/develop/tests

LOGGER = logging.getLogger(__name__)


@pytest.fixture(scope='session', autouse=True)
def setup_logging():
    """Session-wide logging setup: reset root handlers, INFO globally, DEBUG for tests."""
    # Drop any handlers installed before the test session started so that
    # basicConfig below is guaranteed to take effect.
    logging.root.handlers = []
    logging.basicConfig(level=logging.INFO)
    logging.getLogger('tests').setLevel(logging.DEBUG)
    # logging.getLogger('sciencebeam_trainer_delft').setLevel('DEBUG')


def _backport_assert_called(mock: MagicMock) -> None:
    """Stand-in for ``MagicMock.assert_called``: fail unless *mock* was invoked."""
    assert mock.called


@pytest.fixture(scope='session', autouse=True)
def patch_magicmock():
    """Ensure ``MagicMock.assert_called`` exists, backporting it when missing."""
    # hasattr swallows the same AttributeError the original try/except caught.
    if not hasattr(MagicMock, 'assert_called'):
        MagicMock.assert_called = _backport_assert_called


@pytest.fixture
def temp_dir(tmpdir: LocalPath):
    """Expose pytest's legacy ``tmpdir`` fixture as a standard :class:`pathlib.Path`."""
    raw_path = str(tmpdir)
    return Path(raw_path)

Loading

0 comments on commit 5fd26bb

Please sign in to comment.