Skip to content

Commit

Permalink
Added new PDF reader (fitz) (#126)
Browse files Browse the repository at this point in the history
  • Loading branch information
whitead authored May 28, 2023
1 parent 6a6c4f6 commit 1dc6ad0
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 7 deletions.
25 changes: 22 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ pip install paper-qa

Make sure you have set your OPENAI_API_KEY environment variable to your [openai api key](https://platform.openai.com/account/api-keys)

To use paper-qa, you need to have a list of paths (valid extensions include: .pdf, .txt) and a list of citations (strings) that correspond to the paths. You can then use the `Docs` class to add the documents and then query them.
To use paper-qa, you need to have a list of paths (valid extensions include: .pdf, .txt) and a list of citations (strings) that correspond to the paths. You can then use the `Docs` class to add the documents and then query them. If you don't have citations, `Docs` will try to guess them from the first page of your docs.

```python

Expand All @@ -60,7 +60,7 @@ from paperqa import Docs
docs = Docs()
for d in my_docs:
docs.add(d)

answer = docs.query("What manufacturing challenges are unique to bispecific antibodies?")
print(answer.formatted_answer)
```
Expand All @@ -74,6 +74,17 @@ By default, it uses a hybrid of `gpt-3.5-turbo` and `gpt-4`. If you don't have g
```py
docs = Docs(llm='gpt-3.5-turbo')
```

or you can use any other model available in [langchain](https://github.com/hwchase17/langchain):

```py
from langchain.llms import Anthropic, OpenAIChat
model = OpenAIChat(model='gpt-4')
summary_model = Anthropic(model="claude-instant-v1-100k", anthropic_api_key="my-api-key")
docs = Docs(llm=model, summary_llm=summary_model)
```


#### Locally Hosted

You can also use any other models (or embeddings) available in [langchain](https://github.com/hwchase17/langchain). Here's an example of using `llama.cpp` to have locally hosted paper-qa:
Expand Down Expand Up @@ -210,7 +221,7 @@ We can also do specific queries of our Zotero library and iterate over the resul
```py
for item in zotero.iterate(
q="large language models",
qmode="everything",
qmode="everything",
sort="date",
direction="desc",
limit=100,
Expand Down Expand Up @@ -278,6 +289,14 @@ with open("my_docs.pkl", "rb") as f:
docs = pickle.load(f)
```

### PDF Reading Options

By default [PyPDF](https://pypi.org/project/pypdf/) is used since it's pure python and easy to install. For faster PDF reading, paper-qa will detect and use [PyMuPDF (fitz)](https://pymupdf.readthedocs.io/en/latest/):

```sh
pip install pymupdf
```

### Callbacks

TODO
3 changes: 2 additions & 1 deletion dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ pre-commit
requests
paper-scraper@git+https://github.com/blackadad/paper-scraper.git
pyzotero
python-dotenv
python-dotenv
pymupdf
70 changes: 68 additions & 2 deletions paperqa/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,57 @@ def _get_ocr_cache() -> SQLiteCache:
TextSplitter = TokenTextSplitter


def clear_cache():
    """Clear the OCR cache.

    Resets the in-memory cache handle and deletes the on-disk SQLite
    cache file, so the next call to ``_get_ocr_cache()`` starts fresh.
    """
    # TODO: upstream broken
    # _get_ocr_cache().clear()
    global OCR_CACHE
    OCR_CACHE = None
    try:
        os.unlink(OCR_CACHE_PATH)
    except FileNotFoundError:
        # No cache file yet (first run, or already cleared) -- nothing to do.
        pass


def parse_pdf_fitz(path, citation, key, chunk_chars, overlap):
    """Parse a PDF into overlapping text chunks using PyMuPDF (fitz).

    Args:
        path: Path to the PDF file.
        citation: Citation string stored in each chunk's metadata.
        key: Document key; chunk keys are derived as "{key} pages {range}".
        chunk_chars: Maximum number of characters per chunk.
        overlap: Number of characters repeated between consecutive chunks.

    Returns:
        A (splits, metadatas) pair: the list of text chunks and a parallel
        list of metadata dicts (citation, dockey, key).
    """
    import fitz

    def _page_range(pages):
        # Pretty page labels: "4" for a single page, "1-3" for a span.
        return pages[0] if pages[0] == pages[-1] else f"{pages[0]}-{pages[-1]}"

    doc = fitz.open(path)
    try:
        splits = []
        split = ""
        pages = []
        metadatas = []
        for i in range(doc.page_count):
            page = doc.load_page(i)
            split += page.get_text("text", sort=True)
            pages.append(str(i + 1))
            # split could be so long it needs to be split
            # into multiple chunks. Or it could be so short
            # that it needs to be combined with the next chunk.
            while len(split) > chunk_chars:
                splits.append(split[:chunk_chars])
                metadatas.append(
                    dict(
                        citation=citation,
                        dockey=key,
                        key=f"{key} pages {_page_range(pages)}",
                    )
                )
                split = split[chunk_chars - overlap :]
                pages = [str(i + 1)]
        # Keep the tail only when it is longer than the overlap; a shorter
        # remainder is already contained in the previous chunk's trailing
        # overlap. NOTE(review): a document whose entire text is shorter
        # than `overlap` yields no chunks at all -- confirm this matches
        # parse_pdf's behavior.
        if len(split) > overlap:
            splits.append(split[:chunk_chars])
            metadatas.append(
                dict(
                    citation=citation,
                    dockey=key,
                    key=f"{key} pages {_page_range(pages)}",
                )
            )
    finally:
        # Ensure the fitz document handle is released even if parsing fails.
        doc.close()
    return splits, metadatas


def parse_pdf(path, citation, key, chunk_chars=2000, overlap=50):
import pypdf

Expand Down Expand Up @@ -211,12 +262,27 @@ def read_doc(path, citation, key, chunk_chars=3000, overlap=100, disable_check=F
return out


def _read_doc(path, citation, key, chunk_chars=3000, overlap=100, disable_check=False):
def _read_doc(
path,
citation,
key,
chunk_chars=3000,
overlap=100,
disable_check=False,
force_pypdf=False,
):
"""Parse a document into chunks."""
if isinstance(path, Path):
path = str(path)
if path.endswith(".pdf"):
return parse_pdf(path, citation, key, chunk_chars, overlap)
if force_pypdf:
return parse_pdf(path, citation, key, chunk_chars, overlap)
try:
import fitz

return parse_pdf_fitz(path, citation, key, chunk_chars, overlap)
except ImportError:
return parse_pdf(path, citation, key, chunk_chars, overlap)
elif path.endswith(".txt"):
return parse_txt(path, citation, key, chunk_chars, overlap)
elif path.endswith(".html"):
Expand Down
2 changes: 1 addition & 1 deletion paperqa/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.10.0"
__version__ = "1.11.0"
18 changes: 18 additions & 0 deletions tests/test_paperqa.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import os
import pickle
import sys
from typing import Any
from unittest import IsolatedAsyncioTestCase
from unittest import mock
from importlib import reload
from importlib import import_module

import requests
from langchain.callbacks.base import AsyncCallbackHandler
Expand All @@ -10,6 +14,7 @@

import paperqa
from paperqa.utils import strings_similarity
from paperqa.readers import clear_cache


class TestHandler(AsyncCallbackHandler):
Expand Down Expand Up @@ -241,6 +246,7 @@ def test_repeat_keys():


def test_pdf_reader():
clear_cache()
tests_dir = os.path.dirname(os.path.abspath(__file__))
doc_path = os.path.join(tests_dir, "paper.pdf")
docs = paperqa.Docs(llm=OpenAI(temperature=0.0, model_name="text-curie-001"))
Expand All @@ -249,6 +255,18 @@ def test_pdf_reader():
assert "yes" in answer.answer or "Yes" in answer.answer


def test_pdf_pypdf_reader():
    """The pypdf and fitz readers should chunk the same PDF similarly."""
    doc_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "paper.pdf"
    )
    forced_splits, _ = paperqa.readers._read_doc(
        doc_path, "foo te al", "bar", force_pypdf=True
    )
    auto_splits, _ = paperqa.readers._read_doc(
        doc_path, "foo te al", "bar", force_pypdf=False
    )
    similarity = strings_similarity(
        forced_splits[0].casefold(), auto_splits[0].casefold()
    )
    assert similarity > 0.85


def test_prompt_length():
doc_path = "example.txt"
with open(doc_path, "w", encoding="utf-8") as f:
Expand Down

0 comments on commit 1dc6ad0

Please sign in to comment.