Skip to content

Commit

Permalink
Fixed an issue with out-of-order headers and added test cases
Browse files Browse the repository at this point in the history
  • Loading branch information
kiran-nlmatics authored and ansukla committed Oct 21, 2023
1 parent 6f571c4 commit 8846995
Show file tree
Hide file tree
Showing 4 changed files with 199 additions and 3 deletions.
2 changes: 1 addition & 1 deletion llmsherpa/readers/file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ def main():
# Using the special variable
# __name__
# Script entry point: main() runs only when this file is executed directly,
# not when it is imported as a module.
# NOTE(review): the two identical main() lines below look like the removed and
# added sides of the diff view (the page reports "1 addition & 1 deletion" for
# this file), not two intentional calls -- confirm against the real file.
if __name__=="__main__":
main()
main()
8 changes: 6 additions & 2 deletions llmsherpa/readers/layout_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@ def __init__(self, block_json=None):
self.sentences = block_json['sentences'] if block_json and 'sentences' in block_json else []
self.children = []
self.parent = None

def add_child(self, node):
    """
    Attach ``node`` as the last child of this block.

    The child's ``parent`` attribute is pointed back at this block and the
    child is appended to ``self.children``. Nothing is returned.
    """
    node.parent = self
    siblings = self.children
    siblings.append(node)

def to_html(self, include_children=False, recurse=False):
    """
    Placeholder for HTML rendering; this implementation does nothing.

    Both flags are accepted but ignored here, and the call returns None.
    NOTE(review): presumably concrete block types override this -- confirm.
    """
    return None

def to_text(self, include_children=False, recurse=False):
    """
    Placeholder for plain-text rendering; this implementation does nothing.

    Both flags are accepted but ignored here, and the call returns None.
    NOTE(review): presumably concrete block types override this -- confirm.
    """
    return None

def parent_chain(self):
chain = []
parent = self.parent
Expand Down Expand Up @@ -306,7 +310,7 @@ def read(self, blocks_json):
parent_stack.append(node)
parent.add_child(node)
else:
while len(parent_stack) > 0 and parent_stack.pop().level > node.level:
while len(parent_stack) > 1 and parent_stack.pop().level > node.level:
pass
parent_stack[-1].add_child(node)
parent_stack.append(node)
Expand All @@ -325,4 +329,4 @@ def chunks(self):
def tables(self):
    """Return whatever ``tables()`` on the document's root node returns."""
    root = self.root_node
    return root.tables()
def sections(self):
    """Return whatever ``sections()`` on the document's root node returns."""
    # The original text carried this return statement twice (the removed and
    # added sides of the diff view); the second copy was unreachable, so a
    # single return keeps the behaviour unchanged.
    return self.root_node.sections()
182 changes: 182 additions & 0 deletions llmsherpa/readers/tests/ooo_header_test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
[
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"OutOfOrder Header 1"
],
"block_idx": 0,
"level": 1
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"OutOfOrder Header 2"
],
"block_idx": 0,
"level": 1
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"Article I"
],
"block_idx": 0,
"level": 0
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"Section 1"
],
"block_idx": 0,
"level": 1
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"1.1 One point one"
],
"block_idx": 0,
"level": 2
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"1.2 One point two"
],
"block_idx": 0,
"level": 2
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"Section 2"
],
"block_idx": 0,
"level": 1
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"2.1 Two point one"
],
"block_idx": 0,
"level": 2
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"2.2 Two point two"
],
"block_idx": 0,
"level": 2
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"Article II"
],
"block_idx": 0,
"level": 0
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"Section 1"
],
"block_idx": 0,
"level": 1
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"1.1 One point one"
],
"block_idx": 0,
"level": 2
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"1.2 One point two"
],
"block_idx": 0,
"level": 2
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"1.2.1 One point two point one"
],
"block_idx": 0,
"level": 3
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"1.2.2 One point two point two"
],
"block_idx": 0,
"level": 3
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"Section 2"
],
"block_idx": 0,
"level": 1
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"2.1 Two point one"
],
"block_idx": 0,
"level": 2
},
{
"tag": "header",
"page_idx": 0,
"block_class": "cls_0",
"sentences": [
"2.2 Two point two"
],
"block_idx": 0,
"level": 2
}
]
10 changes: 10 additions & 0 deletions llmsherpa/readers/tests/test_layout_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,16 @@ def test_nested_headers(self):
self.assertEqual(len(doc.children[0].children[0].children), 2)
self.assertEqual(len(doc.children[1].children[0].children[1].children), 2)
self.assertEqual(doc.children[1].children[0].children[1].parent_text(), "Article II > Section 1")

def test_ooo_nested_headers(self):
    """
    Check the tree built from ``ooo_header_test.json``.

    The fixture opens with two level-1 headers that come before the first
    level-0 header ("Article I"), i.e. headers that arrive out of order.
    """
    doc = self.read_layout("ooo_header_test.json")
    top_level = doc.children

    # The two leading out-of-order headers are expected to have no children.
    for position in (0, 1):
        self.assertEqual(len(top_level[position].children), 0)

    # Third top-level entry: two children, the first of which has two children.
    article_one = top_level[2]
    self.assertEqual(len(article_one.children), 2)
    self.assertEqual(len(article_one.children[0].children), 2)

    # Fourth top-level entry: second child of its first child has two children
    # and reports its ancestry as "Article II > Section 1".
    nested = top_level[3].children[0].children[1]
    self.assertEqual(len(nested.children), 2)
    self.assertEqual(nested.parent_text(), "Article II > Section 1")

def test_table(self):
doc = self.read_layout("table_test.json")
Expand Down

0 comments on commit 8846995

Please sign in to comment.