Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Did one revision at README #344

Merged
merged 10 commits into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 116 additions & 62 deletions README.md

Large diffs are not rendered by default.

41 changes: 3 additions & 38 deletions paperqa/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from typing import Literal, overload

import fitz
import tiktoken

try:
Expand All @@ -17,8 +18,7 @@
from paperqa.version import __version__ as pqa_version


def parse_pdf_fitz_to_pages(path: Path) -> ParsedText:
import fitz
def parse_pdf_to_pages(path: Path) -> ParsedText:

with fitz.open(path) as file:
pages: dict[str, str] = {}
Expand All @@ -38,29 +38,6 @@ def parse_pdf_fitz_to_pages(path: Path) -> ParsedText:
return ParsedText(content=pages, metadata=metadata)


def parse_pdf_to_pages(path: Path) -> ParsedText:
    """Parse a PDF into per-page text using pypdf.

    Args:
        path: Location of the PDF file on disk.

    Returns:
        ParsedText whose ``content`` maps 1-based page numbers (as strings)
        to that page's extracted text, with parsing metadata recording the
        pypdf version, paperqa version, and total extracted text length.
    """
    import pypdf  # local import keeps pypdf an optional dependency

    with path.open("rb") as pdf_file:
        reader = pypdf.PdfReader(pdf_file)
        pages: dict[str, str] = {}
        total_length = 0

        # Keys are 1-based page numbers to match human-facing numbering.
        for page_num, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            pages[str(page_num)] = page_text
            total_length += len(page_text)

    return ParsedText(
        content=pages,
        metadata=ParsedMetadata(
            parsing_libraries=[f"pypdf ({pypdf.__version__})"],
            paperqa_version=pqa_version,
            total_parsed_text_length=total_length,
            parse_type="pdf",
        ),
    )


def chunk_pdf(
parsed_text: ParsedText, doc: Doc, chunk_chars: int, overlap: int
) -> list[Text]:
Expand Down Expand Up @@ -232,7 +209,6 @@ def read_doc(
include_metadata: Literal[False],
chunk_chars: int = ...,
overlap: int = ...,
force_pypdf: bool = ...,
) -> list[Text]: ...


Expand All @@ -244,7 +220,6 @@ def read_doc(
include_metadata: Literal[False] = ...,
chunk_chars: int = ...,
overlap: int = ...,
force_pypdf: bool = ...,
) -> list[Text]: ...


Expand All @@ -256,7 +231,6 @@ def read_doc(
include_metadata: bool = ...,
chunk_chars: int = ...,
overlap: int = ...,
force_pypdf: bool = ...,
) -> ParsedText: ...


Expand All @@ -268,7 +242,6 @@ def read_doc(
include_metadata: Literal[True],
chunk_chars: int = ...,
overlap: int = ...,
force_pypdf: bool = ...,
) -> tuple[list[Text], ParsedMetadata]: ...


Expand All @@ -279,7 +252,6 @@ def read_doc(
include_metadata: bool = False,
chunk_chars: int = 3000,
overlap: int = 100,
force_pypdf: bool = False,
) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
"""Parse a document and split into chunks.

Expand All @@ -290,7 +262,6 @@ def read_doc(
doc: object with document metadata
chunk_chars: size of chunks
overlap: size of overlap between chunks
force_pypdf: flag to force use of pypdf in parsing
parsed_text_only: return parsed text without chunking
include_metadata: return a tuple
"""
Expand All @@ -299,13 +270,7 @@ def read_doc(

# start with parsing -- users may want to store this separately
if str_path.endswith(".pdf"):
if force_pypdf:
parsed_text = parse_pdf_to_pages(path)
else:
try:
parsed_text = parse_pdf_fitz_to_pages(path)
except ImportError:
parsed_text = parse_pdf_to_pages(path)
parsed_text = parse_pdf_to_pages(path)

elif str_path.endswith(".txt"):
parsed_text = parse_text(path)
Expand Down
14 changes: 3 additions & 11 deletions paperqa/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
from uuid import UUID

import aiohttp
import fitz
import httpx
import litellm
import pypdf
from pybtex.database import Person, parse_string
from pybtex.database.input.bibtex import Parser
from pybtex.style.formatting import unsrtalpha
Expand Down Expand Up @@ -82,16 +82,8 @@ def strings_similarity(s1: str, s2: str) -> float:


def count_pdf_pages(file_path: StrPath) -> int:
    """Return the number of pages in a PDF document.

    Args:
        file_path: Path to the PDF file.

    Returns:
        Total page count of the document.
    """
    # fitz.Document supports len() for its page count, and the context
    # manager guarantees the underlying file handle is closed.
    with fitz.open(file_path) as doc:
        return len(doc)


def hexdigest(data: str | bytes) -> str:
Expand Down
4 changes: 1 addition & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dependencies = [
"pybtex",
"pydantic-settings",
"pydantic~=2.0",
"pypdf",
"pymupdf",
"setuptools", # TODO: remove after release of https://bitbucket.org/pybtex-devs/pybtex/pull-requests/46/replace-pkg_resources-with-importlib
"tenacity",
"tiktoken>=0.4.0",
Expand All @@ -50,8 +50,6 @@ agents = [
"langchain-community",
"langchain-core",
"langchain-openai",
"pymupdf",
"pymupdf",
"tantivy",
"typer",
"typing_extensions",
Expand Down
23 changes: 1 addition & 22 deletions tests/test_paperqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
maybe_is_html,
maybe_is_text,
name_in_text,
strings_similarity,
strip_citations,
)

Expand Down Expand Up @@ -861,33 +860,14 @@ def test_fileio_reader_txt(stub_data_dir: Path) -> None:
assert "United States" in answer.answer


def test_pdf_pypdf_reader(stub_data_dir: Path) -> None:
    """Verify pypdf- and default-parsed text for the same PDF closely agree."""
    doc_path = stub_data_dir / "paper.pdf"
    doc_kwargs = {"docname": "foo", "citation": "Foo et al, 2002", "dockey": "1"}
    # Parse the same document twice: once forcing pypdf, once with the default parser.
    pypdf_splits = read_doc(Path(doc_path), Doc(**doc_kwargs), force_pypdf=True)
    default_splits = read_doc(Path(doc_path), Doc(**doc_kwargs))
    similarity = strings_similarity(
        pypdf_splits[0].text.casefold(), default_splits[0].text.casefold()
    )
    assert similarity > 0.85


def test_parser_only_reader(stub_data_dir: Path) -> None:
def test_parser_only_reader(stub_data_dir: Path):
doc_path = stub_data_dir / "paper.pdf"
parsed_text = read_doc(
Path(doc_path),
Doc(docname="foo", citation="Foo et al, 2002", dockey="1"),
force_pypdf=True,
parsed_text_only=True,
)
assert parsed_text.metadata.parse_type == "pdf"
assert any("pypdf" in t for t in parsed_text.metadata.parsing_libraries)
assert parsed_text.metadata.chunk_metadata is None
assert parsed_text.metadata.total_parsed_text_length == sum(
len(t) for t in parsed_text.content.values() # type: ignore[misc,union-attr]
Expand All @@ -899,7 +879,6 @@ def test_chunk_metadata_reader(stub_data_dir: Path) -> None:
chunk_text, metadata = read_doc(
Path(doc_path),
Doc(docname="foo", citation="Foo et al, 2002", dockey="1"),
force_pypdf=True,
parsed_text_only=False, # noqa: FURB120
include_metadata=True,
)
Expand Down
Loading