Skip to content

Commit

Permalink
Added test case for to_text and to_html methods
Browse files Browse the repository at this point in the history
  • Loading branch information
AaryanTR authored and ansukla committed Jun 13, 2024
1 parent 89e4bc7 commit 0e6007f
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 0 deletions.
19 changes: 19 additions & 0 deletions llmsherpa/readers/tests/test_layout_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
from llmsherpa.readers import LayoutReader
from llmsherpa.readers import Document


class TestLayoutReader(unittest.TestCase):
Expand All @@ -20,6 +21,12 @@ def read_layout(self, file_name):
doc = reader.read(doc_data)
# reader.debug(pdf)
return doc

def get_document(self, file_name):
    """Load a JSON fixture stored beside this test module into a Document.

    Args:
        file_name: Name of the fixture file, resolved relative to the
            directory that contains this test module.

    Returns:
        The ``Document`` built directly from the parsed JSON content.
    """
    fixture_path = os.path.join(os.path.dirname(__file__), file_name)
    # Read the fixture as UTF-8 explicitly so parsing does not depend on
    # the platform's default locale encoding.
    with open(fixture_path, encoding="utf-8") as f:
        doc_data = json.load(f)
    return Document(doc_data)

def test_list_child_of_header(self):
pdf = self.read_layout("list_test.json")
Expand Down Expand Up @@ -128,5 +135,17 @@ def test_meta_data(self):
self.assertEqual(chunks[0].top, 64.8)
self.assertEqual(chunks[0].left, 130.05)

def test_to_text(self):
    # The fixture holds three header blocks; to_text() is expected to
    # emit each block's sentence on its own line, newline-terminated.
    expected_text = "Lecture notes\nCS229\nPart VI\n"
    document = self.get_document("to_text_test.json")
    self.assertEqual(document.to_text(), expected_text)

def test_to_html(self):
    # The fixture holds one level-0 header followed by two level-1
    # headers; to_html() is expected to render them as <h1>/<h2>/<h2>.
    expected_html = "<html><h1>Heading 1</h1><h2>Heading 2</h2><h2>Heading 3</h2></html>"
    document = self.get_document("to_html_test.json")
    self.assertEqual(document.to_html(), expected_html)

# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
unittest.main()
32 changes: 32 additions & 0 deletions llmsherpa/readers/tests/to_html_test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[
{
"block_class": "cls_0",
"block_idx": 0,
"level": 0,
"page_idx": 0,
"sentences": [
"Heading 1"
],
"tag": "header"
},
{
"block_class": "cls_1",
"block_idx": 1,
"level": 1,
"page_idx": 0,
"sentences": [
"Heading 2"
],
"tag": "header"
},
{
"block_class": "cls_2",
"block_idx": 2,
"level": 1,
"page_idx": 0,
"sentences": [
"Heading 3"
],
"tag": "header"
}
]
32 changes: 32 additions & 0 deletions llmsherpa/readers/tests/to_text_test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[
{
"block_class": "cls_0",
"block_idx": 0,
"level": 0,
"page_idx": 0,
"sentences": [
"Lecture notes"
],
"tag": "header"
},
{
"block_class": "cls_1",
"block_idx": 1,
"level": 1,
"page_idx": 0,
"sentences": [
"CS229"
],
"tag": "header"
},
{
"block_class": "cls_2",
"block_idx": 2,
"level": 1,
"page_idx": 0,
"sentences": [
"Part VI"
],
"tag": "header"
}
]

0 comments on commit 0e6007f

Please sign in to comment.