Skip to content

Commit

Permalink
Added new PDF reader (fitz) (#126)
Browse files Browse the repository at this point in the history
  • Loading branch information
whitead authored May 28, 2023
1 parent 6a6c4f6 commit 1dc6ad0
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 7 deletions.
25 changes: 22 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ pip install paper-qa

Make sure you have set your OPENAI_API_KEY environment variable to your [openai api key](https://platform.openai.com/account/api-keys)

To use paper-qa, you need to have a list of paths (valid extensions include: .pdf, .txt) and a list of citations (strings) that correspond to the paths. You can then use the `Docs` class to add the documents and then query them.
To use paper-qa, you need to have a list of paths (valid extensions include: .pdf, .txt) and a list of citations (strings) that correspond to the paths. You can then use the `Docs` class to add the documents and then query them. If you don't have citations, `Docs` will try to guess them from the first page of your docs.

```python

Expand All @@ -60,7 +60,7 @@ from paperqa import Docs
docs = Docs()
for d in my_docs:
docs.add(d)

answer = docs.query("What manufacturing challenges are unique to bispecific antibodies?")
print(answer.formatted_answer)
```
Expand All @@ -74,6 +74,17 @@ By default, it uses a hybrid of `gpt-3.5-turbo` and `gpt-4`. If you don't have g
```py
docs = Docs(llm='gpt-3.5-turbo')
```

or you can use any other model available in [langchain](https://github.com/hwchase17/langchain):

```py
from langchain.llms import Anthropic, OpenAIChat
model = OpenAIChat(model='gpt-4')
summary_model = Anthropic(model="claude-instant-v1-100k", anthropic_api_key="my-api-key")
docs = Docs(llm=model, summary_llm=summary_model)
```


#### Locally Hosted

You can also use any other models (or embeddings) available in [langchain](https://github.com/hwchase17/langchain). Here's an example of using `llama.cpp` to have locally hosted paper-qa:
Expand Down Expand Up @@ -210,7 +221,7 @@ We can also do specific queries of our Zotero library and iterate over the resul
```py
for item in zotero.iterate(
q="large language models",
qmode="everything",
qmode="everything",
sort="date",
direction="desc",
limit=100,
Expand Down Expand Up @@ -278,6 +289,14 @@ with open("my_docs.pkl", "rb") as f:
docs = pickle.load(f)
```

### PDF Reading Options

By default [PyPDF](https://pypi.org/project/pypdf/) is used since it's pure python and easy to install. For faster PDF reading, paper-qa will detect and use [PyMuPDF (fitz)](https://pymupdf.readthedocs.io/en/latest/):

```sh
pip install pymupdf
```

### Callbacks

TODO
3 changes: 2 additions & 1 deletion dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ pre-commit
requests
paper-scraper@git+https://github.com/blackadad/paper-scraper.git
pyzotero
python-dotenv
python-dotenv
pymupdf
70 changes: 68 additions & 2 deletions paperqa/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,57 @@ def _get_ocr_cache() -> SQLiteCache:
TextSplitter = TokenTextSplitter


def clear_cache():
    """Clear the OCR cache.

    Resets the in-memory cache handle and deletes the on-disk SQLite
    cache file, so the next call to ``_get_ocr_cache()`` starts fresh.
    """
    # TODO: upstream broken
    # _get_ocr_cache().clear()
    global OCR_CACHE
    OCR_CACHE = None
    try:
        os.unlink(OCR_CACHE_PATH)
    except FileNotFoundError:
        # No cache file yet (first run, or already cleared) -- nothing to do.
        pass


def parse_pdf_fitz(path, citation, key, chunk_chars, overlap):
    """Parse a PDF into overlapping text chunks using PyMuPDF (fitz).

    Args:
        path: Path to the PDF file.
        citation: Citation string stored in each chunk's metadata.
        key: Document key; chunk keys are derived as "{key} pages {range}".
        chunk_chars: Maximum number of characters per chunk.
        overlap: Number of characters repeated between consecutive chunks.

    Returns:
        A (splits, metadatas) pair: the list of text chunks and a parallel
        list of metadata dicts (citation, dockey, key).
    """
    import fitz

    def _page_range(pages):
        # Pretty page labels: "4" for a single page, "1-3" for a span.
        return pages[0] if pages[0] == pages[-1] else f"{pages[0]}-{pages[-1]}"

    doc = fitz.open(path)
    try:
        splits = []
        split = ""
        pages = []
        metadatas = []
        for i in range(doc.page_count):
            page = doc.load_page(i)
            split += page.get_text("text", sort=True)
            pages.append(str(i + 1))
            # split could be so long it needs to be split
            # into multiple chunks. Or it could be so short
            # that it needs to be combined with the next chunk.
            while len(split) > chunk_chars:
                splits.append(split[:chunk_chars])
                metadatas.append(
                    dict(
                        citation=citation,
                        dockey=key,
                        key=f"{key} pages {_page_range(pages)}",
                    )
                )
                split = split[chunk_chars - overlap :]
                pages = [str(i + 1)]
        # Keep the tail only when it is longer than the overlap; a shorter
        # remainder is already contained in the previous chunk's trailing
        # overlap. NOTE(review): a document whose entire text is shorter
        # than `overlap` yields no chunks at all -- confirm this matches
        # parse_pdf's behavior.
        if len(split) > overlap:
            splits.append(split[:chunk_chars])
            metadatas.append(
                dict(
                    citation=citation,
                    dockey=key,
                    key=f"{key} pages {_page_range(pages)}",
                )
            )
    finally:
        # Ensure the fitz document handle is released even if parsing fails.
        doc.close()
    return splits, metadatas


def parse_pdf(path, citation, key, chunk_chars=2000, overlap=50):
import pypdf

Expand Down Expand Up @@ -211,12 +262,27 @@ def read_doc(path, citation, key, chunk_chars=3000, overlap=100, disable_check=F
return out


def _read_doc(path, citation, key, chunk_chars=3000, overlap=100, disable_check=False):
def _read_doc(
path,
citation,
key,
chunk_chars=3000,
overlap=100,
disable_check=False,
force_pypdf=False,
):
"""Parse a document into chunks."""
if isinstance(path, Path):
path = str(path)
if path.endswith(".pdf"):
return parse_pdf(path, citation, key, chunk_chars, overlap)
if force_pypdf:
return parse_pdf(path, citation, key, chunk_chars, overlap)
try:
import fitz

return parse_pdf_fitz(path, citation, key, chunk_chars, overlap)
except ImportError:
return parse_pdf(path, citation, key, chunk_chars, overlap)
elif path.endswith(".txt"):
return parse_txt(path, citation, key, chunk_chars, overlap)
elif path.endswith(".html"):
Expand Down
2 changes: 1 addition & 1 deletion paperqa/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.10.0"
__version__ = "1.11.0"
18 changes: 18 additions & 0 deletions tests/test_paperqa.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import os
import pickle
import sys
from typing import Any
from unittest import IsolatedAsyncioTestCase
from unittest import mock
from importlib import reload
from importlib import import_module

import requests
from langchain.callbacks.base import AsyncCallbackHandler
Expand All @@ -10,6 +14,7 @@

import paperqa
from paperqa.utils import strings_similarity
from paperqa.readers import clear_cache


class TestHandler(AsyncCallbackHandler):
Expand Down Expand Up @@ -241,6 +246,7 @@ def test_repeat_keys():


def test_pdf_reader():
clear_cache()
tests_dir = os.path.dirname(os.path.abspath(__file__))
doc_path = os.path.join(tests_dir, "paper.pdf")
docs = paperqa.Docs(llm=OpenAI(temperature=0.0, model_name="text-curie-001"))
Expand All @@ -249,6 +255,18 @@ def test_pdf_reader():
assert "yes" in answer.answer or "Yes" in answer.answer


def test_pdf_pypdf_reader():
    """The pypdf and fitz readers should chunk the same PDF similarly."""
    doc_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "paper.pdf"
    )
    forced_splits, _ = paperqa.readers._read_doc(
        doc_path, "foo te al", "bar", force_pypdf=True
    )
    auto_splits, _ = paperqa.readers._read_doc(
        doc_path, "foo te al", "bar", force_pypdf=False
    )
    similarity = strings_similarity(
        forced_splits[0].casefold(), auto_splits[0].casefold()
    )
    assert similarity > 0.85


def test_prompt_length():
doc_path = "example.txt"
with open(doc_path, "w", encoding="utf-8") as f:
Expand Down

0 comments on commit 1dc6ad0

Please sign in to comment.