diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..a744c0e Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index 6769e21..2ac7656 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ __pycache__/ # Distribution / packaging .Python +_build/ +_static/ +_templates/ build/ develop-eggs/ dist/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..124910a --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,35 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +# python: +# install: +# - requirements: docs/requirements.txt \ No newline at end of file diff --git a/README.md b/README.md index 974cbde..979c832 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,9 @@ You can experiment with the library directly in Google Colab [here](https://cola Here's a [writeup](https://open.substack.com/pub/ambikasukla/p/efficient-rag-with-document-layout?r=ft8uc&utm_campaign=post&utm_medium=web) explaining the problem and our approach. 
-Here'a another [blog](https://medium.com/@kirankurup/mastering-pdfs-extracting-sections-headings-paragraphs-and-tables-with-cutting-edge-parser-faea18870125) explaining the solution. +Here's a LlamaIndex [blog](https://medium.com/@kirankurup/mastering-pdfs-extracting-sections-headings-paragraphs-and-tables-with-cutting-edge-parser-faea18870125) explaining the need for smart chunking. + +API Reference: [https://llmsherpa.readthedocs.io/](https://llmsherpa.readthedocs.io/) ### Installation @@ -64,6 +66,53 @@ pip install llama-index import openai openai.api_key = # ``` + +### Vector search and Retrieval Augmented Generation with Smart Chunking + +LayoutPDFReader does smart chunking, keeping text that is related by document structure together: + +* All list items are together including the paragraph that precedes the list. +* Items in a table are chunked together +* Contextual information from section headers and nested section headers is included + +The following code creates a LlamaIndex query engine from LayoutPDFReader document chunks + +```python +from llama_index.readers.schema.base import Document +from llama_index import VectorStoreIndex + +index = VectorStoreIndex([]) +for chunk in doc.chunks(): + index.insert(Document(text=chunk.to_context_text(), extra_info={})) +query_engine = index.as_query_engine() +``` + +Let's run one query: + +```python +response = query_engine.query("list all the tasks that work with bart") +print(response) +``` + +We get the following response: + +``` +BART works well for text generation, comprehension tasks, abstractive dialogue, question answering, and summarization tasks. +``` + +Let's try another query that needs an answer from a table: + +```python +response = query_engine.query("what is the bart performance score on squad") +print(response) +``` + +Here's the response we get: + +``` +The BART performance score on SQuAD is 88.8 for EM and 94.6 for F1. 
+``` + ### Summarize a Section using prompts LayoutPDFReader offers powerful ways to pick sections and subsections from a large document and use LLMs to extract insights from a section. @@ -179,51 +228,6 @@ R1 of BART for different datasets: - For the XSum dataset, the R1 score of BART is 45.14. ``` -### Vector search and Retrieval Augmented Generation with Smart Chunking - -LayoutPDFReader does smart chunking keeping the integrity of related text together: - -* All list items are together including the paragraph that precedes the list. -* Items in a table are chuncked together -* Contextual information from section headers and nested section headers is included - -The following code creates a LlamaIndex query engine from LayoutPDFReader document chunks - -```python -from llama_index.readers.schema.base import Document -from llama_index import VectorStoreIndex - -index = VectorStoreIndex([]) -for chunk in doc.chunks(): - index.insert(Document(text=chunk.to_context_text(), extra_info={})) -query_engine = index.as_query_engine() -``` - -Let's run one query: - -```python -response = query_engine.query("list all the tasks that work with bart") -print(response) -``` - -We get the following response: - -``` -BART works well for text generation, comprehension tasks, abstractive dialogue, question answering, and summarization tasks. -``` - -Let's try another query that needs answer from a table: - -```python -response = query_engine.query("what is the bart performance score on squad") -print(response) -``` - -Here's the response we get: - -``` -The BART performance score on SQuAD is 88.8 for EM and 94.6 for F1. -``` ### Get the Raw JSON diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..474d92a --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,36 @@ +import os +import sys +sys.path.insert(0, os.path.abspath('../')) +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'LLM Sherpa' +copyright = '2023, Ambika Sukla' +author = 'Ambika Sukla' +release = '0.1.3' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + 'sphinx.ext.doctest', + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.napoleon', +] + +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'alabaster' +html_static_path = ['_static'] diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..60b6467 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,20 @@ +.. LLM Sherpa documentation master file, created by + sphinx-quickstart on Wed Nov 1 09:09:16 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to LLM Sherpa's documentation! +====================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + API reference + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/llmsherpa.readers.rst b/docs/llmsherpa.readers.rst new file mode 100644 index 0000000..3825dfb --- /dev/null +++ b/docs/llmsherpa.readers.rst @@ -0,0 +1,29 @@ +llmsherpa.readers package +========================= + +Submodules +---------- + +llmsherpa.readers.file\_reader module +------------------------------------- + +.. 
automodule:: llmsherpa.readers.file_reader + :members: + :undoc-members: + :show-inheritance: + +llmsherpa.readers.layout\_reader module +--------------------------------------- + +.. automodule:: llmsherpa.readers.layout_reader + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: llmsherpa.readers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/llmsherpa.rst b/docs/llmsherpa.rst new file mode 100644 index 0000000..a812297 --- /dev/null +++ b/docs/llmsherpa.rst @@ -0,0 +1,18 @@ +llmsherpa package +================= + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + llmsherpa.readers + +Module contents +--------------- + +.. automodule:: llmsherpa + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules.rst b/docs/modules.rst new file mode 100644 index 0000000..f91605d --- /dev/null +++ b/docs/modules.rst @@ -0,0 +1,7 @@ +llmsherpa +========= + +.. toctree:: + :maxdepth: 4 + + llmsherpa diff --git a/llmsherpa/.DS_Store b/llmsherpa/.DS_Store new file mode 100644 index 0000000..06a5668 Binary files /dev/null and b/llmsherpa/.DS_Store differ diff --git a/llmsherpa/__init__.py b/llmsherpa/__init__.py index 3fa7b91..ecc2de2 100644 --- a/llmsherpa/__init__.py +++ b/llmsherpa/__init__.py @@ -4,6 +4,6 @@ APIs to accelerate LLM use cases. """ -__version__ = "0.1.2" +__version__ = "0.1.3" __author__ = 'Ambika Sukla' __credits__ = 'NLMATICS CORP.' 
\ No newline at end of file diff --git a/llmsherpa/readers/file_reader.py b/llmsherpa/readers/file_reader.py index 5abc97f..ff18347 100644 --- a/llmsherpa/readers/file_reader.py +++ b/llmsherpa/readers/file_reader.py @@ -5,12 +5,30 @@ from llmsherpa.readers import Document class LayoutPDFReader: + """ + Reads PDF content and understands hierarchical layout of the document sections and structural components such as paragraphs, sentences, tables, lists, sublists + + Parameters + ---------- + parser_api_url: str + API url for LLM Sherpa. Use customer url for your private instance here + + """ def __init__(self, parser_api_url): + """ + Constructs a LayoutPDFReader from a parser endpoint. + + Parameters + ---------- + parser_api_url: str + API url for LLM Sherpa. Use customer url for your private instance here + """ self.parser_api_url = parser_api_url self.download_connection = urllib3.PoolManager() self.api_connection = urllib3.PoolManager() def _download_pdf(self, pdf_url): + # some servers only allow browers user_agent to download user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36" # add authorization headers if using external API (see upload_pdf for an example) @@ -28,6 +46,16 @@ def _parse_pdf(self, pdf_file): return parser_response def read_pdf(self, path_or_url, contents=None): + """ + Reads pdf from a url or path + + Parameters + ---------- + path_or_url: str + path or url to the pdf file e.g. https://someexapmple.com/myfile.pdf or /home/user/myfile.pdf + contents: bytes + contents of the pdf file. If contents is given, path_or_url is ignored. This is useful when you already have the pdf file contents in memory such as if you are using streamlit or flask. 
+ """ # file contents were given if contents is not None: pdf_file = (path_or_url, contents, 'application/pdf') @@ -43,18 +71,4 @@ def read_pdf(self, path_or_url, contents=None): parser_response = self._parse_pdf(pdf_file) response_json = json.loads(parser_response.data.decode("utf-8")) blocks = response_json['return_dict']['result']['blocks'] - return Document(blocks) - # def read_file(file_path): - -def main(): - llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all" - pdf_url = "https://arxiv.org/pdf/1910.13461.pdf" - pdf_url = "/Users/ambikasukla/Documents/1910.13461.pdf" - pdf_reader = LayoutPDFReader(llmsherpa_api_url) - doc = pdf_reader.read_pdf(pdf_url) - print(doc.sections()[5].to_html(include_children=True, recurse=True)) - -# Using the special variable -# __name__ -if __name__=="__main__": - main() + return Document(blocks) \ No newline at end of file diff --git a/llmsherpa/readers/layout_reader.py b/llmsherpa/readers/layout_reader.py index 82c82da..82e64a3 100644 --- a/llmsherpa/readers/layout_reader.py +++ b/llmsherpa/readers/layout_reader.py @@ -1,23 +1,67 @@ class Block: + """ + A block is a node in the layout tree. It can be a paragraph, a list item, a table, or a section header. + This is the base class for all blocks such as Paragraph, ListItem, Table, Section. + + Attributes + ---------- + tag: str + tag of the block e.g. para, list_item, table, header + level: int + level of the block in the layout tree + page_idx: int + page index of the block in the document. It starts from 0 and is -1 if the page number is not available + block_idx: int + id of the block as returned from the server. 
It starts from 0 and is -1 if the id is not available + top: float + top position of the block in the page and it is -1 if the position is not available + left: float + left position of the block in the page and it is -1 if the position is not available + sentences: list + list of sentences in the block + children: list + list of immediate child blocks, but not the children of the children + parent: Block + parent of the block + block_json: dict + json returned by the parser API for the block + """ tag: str def __init__(self, block_json=None): self.tag = block_json['tag'] if block_json and 'tag' in block_json else None self.level = block_json['level'] if block_json and 'level' in block_json else -1 + self.page_idx = block_json['page_idx'] if block_json and 'page_idx' in block_json else -1 + self.block_idx = block_json['block_idx'] if block_json and 'block_idx' in block_json else -1 + self.top = block_json['top'] if block_json and 'top' in block_json else -1 + self.left = block_json['left'] if block_json and 'left' in block_json else -1 self.sentences = block_json['sentences'] if block_json and 'sentences' in block_json else [] self.children = [] self.parent = None + self.block_json = block_json def add_child(self, node): + """ + Adds a child to the block. Sets the parent of the child to self. + """ self.children.append(node) node.parent = self def to_html(self, include_children=False, recurse=False): + """ + Converts the block to html. This is a virtual method and should be implemented by the derived classes. + """ pass def to_text(self, include_children=False, recurse=False): + """ + Converts the block to text. This is a virtual method and should be implemented by the derived classes. + """ pass def parent_chain(self): + """ + Returns the parent chain of the block consisting of all the parents of the block until the root. 
+ """ chain = [] parent = self.parent while parent: @@ -27,6 +71,9 @@ def parent_chain(self): return chain def parent_text(self): + """ + Returns the text of the parent chain of the block. This is useful for adding section information to the text. + """ parent_chain = self.parent_chain() header_texts = [] para_texts = [] @@ -41,6 +88,9 @@ def parent_text(self): return text def to_context_text(self, include_section_info=True): + """ + Returns the text of the block with section information. This provides context to the text. + """ text = "" if include_section_info: text += self.parent_text() + "\n" @@ -51,6 +101,9 @@ def to_context_text(self, include_section_info=True): return text def iter_children(self, node, level, node_visitor): + """ + Iterates over all the children of the node and calls the node_visitor function on each child. + """ for child in node.children: node_visitor(child) # print("-"*level, child.tag, f"({len(child.children)})", child.to_text()) @@ -58,6 +111,9 @@ def iter_children(self, node, level, node_visitor): self.iter_children(child, level + 1, node_visitor) def paragraphs(self): + """ + Returns all the paragraphs in the block. This is useful for getting all the paragraphs in a section. + """ paragraphs = [] def para_collector(node): if node.tag == 'para': @@ -66,6 +122,9 @@ def para_collector(node): return paragraphs def chunks(self): + """ + Returns all the chunks in the block. Chunking automatically splits the document into paragraphs, lists, and tables without any prior knowledge of the document structure. + """ chunks = [] def chunk_collector(node): if node.tag in ['para', 'list_item', 'table']: @@ -74,14 +133,20 @@ def chunk_collector(node): return chunks def tables(self): + """ + Returns all the tables in the block. This is useful for getting all the tables in a section. 
+ """ tables = [] def chunk_collector(node): if node.tag in ['table']: tables.append(node) - self.iter_children(self, 0, chunk_collector) + self.iter_children(self, 0, chunk_collector) return tables def sections(self): + """ + Returns all the sections in the block. This is useful for getting all the sections in a document. + """ sections = [] def chunk_collector(node): if node.tag in ['header']: @@ -90,15 +155,38 @@ def chunk_collector(node): return sections class Paragraph(Block): + """ + A paragraph is a block of text. It can have children such as lists. A paragraph has tag 'para'. + """ def __init__(self, para_json): super().__init__(para_json) def to_text(self, include_children=False, recurse=False): + """ + Converts the paragraph to text. If include_children is True, then the text of the children is also included. If recurse is True, then the text of the children's children are also included. + + Parameters + ---------- + include_children: bool + If True, then the text of the children are also included + recurse: bool + If True, then the text of the children's children are also included + """ para_text = "\n".join(self.sentences) if include_children: for child in self.children: para_text += "\n" + child.to_text(include_children=recurse, recurse=recurse) return para_text def to_html(self, include_children=False, recurse=False): + """ + Converts the paragraph to html. If include_children is True, then the html of the children is also included. If recurse is True, then the html of the children's children are also included. + + Parameters + ---------- + include_children: bool + If True, then the html of the children are also included + recurse: bool + If True, then the html of the children's children are also included + """ html_str = "

" html_str = html_str + "\n".join(self.sentences) if include_children: @@ -111,10 +199,28 @@ def to_html(self, include_children=False, recurse=False): return html_str class Section(Block): + """ + A section is a block of text. It can have children such as paragraphs, lists, and tables. A section has tag 'header'. + + Attributes + ---------- + title: str + title of the section + """ def __init__(self, section_json): super().__init__(section_json) self.title = "\n".join(self.sentences) def to_text(self, include_children=False, recurse=False): + """ + Converts the section to text. If include_children is True, then the text of the children is also included. If recurse is True, then the text of the children's children are also included. + + Parameters + ---------- + include_children: bool + If True, then the text of the children are also included + recurse: bool + If True, then the text of the children's children are also included + """ text = self.title if include_children: for child in self.children: @@ -122,6 +228,16 @@ def to_text(self, include_children=False, recurse=False): return text def to_html(self, include_children=False, recurse=False): + """ + Converts the section to html. If include_children is True, then the html of the children is also included. If recurse is True, then the html of the children's children are also included. + + Parameters + ---------- + include_children: bool + If True, then the html of the children are also included + recurse: bool + If True, then the html of the children's children are also included + """ html_str = f"" html_str = html_str + self.title html_str = html_str + f"" @@ -131,10 +247,23 @@ def to_html(self, include_children=False, recurse=False): return html_str class ListItem(Block): + """ + A list item is a block of text. It can have child list items. A list item has tag 'list_item'. 
+ """ def __init__(self, list_json): super().__init__(list_json) def to_text(self, include_children=False, recurse=False): + """ + Converts the list item to text. If include_children is True, then the text of the children is also included. If recurse is True, then the text of the children's children are also included. + + Parameters + ---------- + include_children: bool + If True, then the text of the children are also included + recurse: bool + If True, then the text of the children's children are also included + """ text = "\n".join(self.sentences) if include_children: for child in self.children: @@ -142,6 +271,16 @@ def to_text(self, include_children=False, recurse=False): return text def to_html(self, include_children=False, recurse=False): + """ + Converts the list item to html. If include_children is True, then the html of the children is also included. If recurse is True, then the html of the children's children are also included. + + Parameters + ---------- + include_children: bool + If True, then the html of the children are also included + recurse: bool + If True, then the html of the children's children are also included + """ html_str = f"

  • " html_str = html_str + "\n".join(self.sentences) if include_children: @@ -153,11 +292,12 @@ def to_html(self, include_children=False, recurse=False): html_str = html_str + f"
  • " return html_str -class List(Block): - def __init__(self, list_json): - self.x = 0 class TableCell(Block): + """ + A table cell is a block of text. It can have child paragraphs. A table cell has tag 'table_cell'. + A table cell is contained within table rows. + """ def __init__(self, cell_json): super().__init__(cell_json) self.col_span = cell_json['col_span'] if 'col_span' in cell_json else 1 @@ -167,11 +307,17 @@ def __init__(self, cell_json): else: self.cell_node = None def to_text(self): + """ + Returns the cell value of the text. If the cell value is a paragraph node, then the text of the node is returned. + """ cell_text = self.cell_value if self.cell_node: cell_text = self.cell_node.to_text() return cell_text def to_html(self): + """ + Returns the cell value ashtml. If the cell value is a paragraph node, then the html of the node is returned. + """ cell_html = self.cell_value if self.cell_node: cell_html = self.cell_node.to_html() @@ -182,6 +328,9 @@ def to_html(self): return html_str class TableRow(Block): + """ + A table row is a block of text. It can have child table cells. + """ def __init__(self, row_json): self.cells = [] if row_json['type'] == 'full_row': @@ -192,11 +341,17 @@ def __init__(self, row_json): cell = TableCell(cell_json) self.cells.append(cell) def to_text(self, include_children=False, recurse=False): + """ + Returns text of a row with text from all the cells in the row delimited by '|' + """ cell_text = "" for cell in self.cells: cell_text = cell_text + " | " + cell.to_text() return cell_text def to_html(self, include_children=False, recurse=False): + """ + Returns html for a with html from all the cells in the row as + """ html_str = "" for cell in self.cells: html_str = html_str + cell.to_html() @@ -204,6 +359,9 @@ def to_html(self, include_children=False, recurse=False): return html_str class TableHeader(Block): + """ + A table header is a block of text. It can have child table cells. 
+ """ def __init__(self, row_json): super().__init__(row_json) self.cells = [] @@ -211,6 +369,10 @@ def __init__(self, row_json): cell = TableCell(cell_json) self.cells.append(cell) def to_text(self, include_children=False, recurse=False): + """ + Returns text of a row with text from all the cells in the row delimited by '|' and the header row is delimited by '---' + Text is returned in markdown format. + """ cell_text = "" for cell in self.cells: cell_text = cell_text + " | " + cell.to_text() @@ -219,13 +381,19 @@ def to_text(self, include_children=False, recurse=False): cell_text = cell_text + " | " + "---" return cell_text def to_html(self, include_children=False, recurse=False): - html_str = "" - for cell in self.cells: - html_str = html_str + cell.to_html() - html_str = html_str + "" - return html_str + """ + Returns html for a with html from all the cells in the row as + """ + html_str = "" + for cell in self.cells: + html_str = html_str + cell.to_html() + html_str = html_str + "" + return html_str class Table(Block): + """ + A table is a block of text. It can have child table rows. A table has tag 'table'. + """ def __init__(self, table_json, parent): # self.title = parent.name super().__init__(table_json) @@ -241,6 +409,9 @@ def __init__(self, table_json, parent): row = TableRow(row_json) self.rows.append(row) def to_text(self, include_children=False, recurse=False): + """ + Returns text of a table with text from all the rows in the table delimited by '\n' + """ text = "" for header in self.headers: text = text + header.to_text() + "\n" @@ -249,6 +420,9 @@ def to_text(self, include_children=False, recurse=False): return text def to_html(self, include_children=False, recurse=False): + """ + Returns html for a with html from all the rows in the table as + """ html_str = "
    " for header in self.headers: html_str = html_str + header.to_html() @@ -258,6 +432,9 @@ def to_html(self, include_children=False, recurse=False): return html_str class LayoutReader: + """ + Reads the layout tree from the json returned by the parser API. + """ def debug(self, pdf_root): def iter_children(node, level): for child in node.children: @@ -266,12 +443,11 @@ def iter_children(node, level): iter_children(pdf_root, 0) def read(self, blocks_json): + """ + Reads the layout tree from the json returned by the parser API. Constructs a tree of Block objects. + """ root = Block() parent = None - # table_node = None - table_nodes = [] - sections = [] - # prev_list = None parent_stack = [root] prev_node = root parent = root @@ -320,13 +496,25 @@ def read(self, blocks_json): return root class Document: + """ + A document is a tree of blocks. It is the root node of the layout tree. + """ def __init__(self, blocks_json): self.reader = LayoutReader() self.root_node = self.reader.read(blocks_json) self.json = blocks_json def chunks(self): + """ + Returns all the chunks in the document. Chunking automatically splits the document into paragraphs, lists, and tables without any prior knowledge of the document structure. + """ return self.root_node.chunks() def tables(self): + """ + Returns all the tables in the document. This is useful for getting all the tables in a document. + """ return self.root_node.tables() def sections(self): + """ + Returns all the sections in the document. This is useful for getting all the sections in a document. 
+ """ return self.root_node.sections() diff --git a/llmsherpa/readers/tests/test_layout_reader.py b/llmsherpa/readers/tests/test_layout_reader.py index 92f959d..e2f34f5 100644 --- a/llmsherpa/readers/tests/test_layout_reader.py +++ b/llmsherpa/readers/tests/test_layout_reader.py @@ -119,6 +119,14 @@ def test_chunk_iterator(self): correct_text = self.clean_text(correct_text) self.assertEqual(chunks[3].to_context_text(), correct_text) + def test_meta_data(self): + doc = self.read_layout("table_test.json") + chunks = doc.chunks() + + self.assertEqual(chunks[0].page_idx, 5) + self.assertEqual(chunks[0].block_idx, 112) + self.assertEqual(chunks[0].top, 64.8) + self.assertEqual(chunks[0].left, 130.05) if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/setup.py b/setup.py index d7c3ac5..e86089f 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='llmsherpa', - version='0.1.2', + version='0.1.3', description='Strategic APIs to Accelerate LLM Use Cases', long_description=open('README.md').read(), long_description_content_type='text/markdown',