From 72ef41547aed158f6e5fe25ffc503ff17aded4ab Mon Sep 17 00:00:00 2001 From: Andrew White Date: Tue, 13 Jun 2023 12:46:17 -0400 Subject: [PATCH] More flexible types (#139) * Added future possible types * Added back defaults to read_doc too * Exported more types * Added more typing hints --- paperqa/__init__.py | 4 ++-- paperqa/docs.py | 7 ++----- paperqa/prompts.py | 7 ++++--- paperqa/readers.py | 4 ++-- paperqa/types.py | 7 ++++++- paperqa/version.py | 2 +- 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/paperqa/__init__.py b/paperqa/__init__.py index 35692897..f2595c09 100644 --- a/paperqa/__init__.py +++ b/paperqa/__init__.py @@ -1,4 +1,4 @@ -from .docs import Answer, Docs, PromptCollection +from .docs import Answer, Docs, PromptCollection, Doc, Text from .version import __version__ -__all__ = ["Docs", "Answer", "PromptCollection", "__version__"] +__all__ = ["Docs", "Answer", "PromptCollection", "__version__", "Doc", "Text"] diff --git a/paperqa/docs.py b/paperqa/docs.py index 83f26b38..f2b81144 100644 --- a/paperqa/docs.py +++ b/paperqa/docs.py @@ -202,7 +202,7 @@ def add_texts( ): """Add chunked texts to the collection. This is useful if you have already chunked the texts yourself.""" if doc.dockey in self.docs: - raise ValueError("Document already in collection.") + raise ValueError(f"Document {doc.dockey} already in collection.") if len(texts) == 0: raise ValueError("No texts to add.") if doc.docname in self.docnames: @@ -261,9 +261,7 @@ async def adoc_match( query, k=k + len(self.deleted_dockeys) ) matched_docs = [self.docs[m.metadata["dockey"]] for m in matches] - chain = make_chain( - self.prompts.select, cast(BaseLanguageModel, self.summary_llm) - ) + chain = make_chain(self.prompts.select, cast(BaseLanguageModel, self.llm)) papers = [f"{d.docname}: {d.citation}" for d in matched_docs] result = await chain.arun( # type: ignore question=query, papers="\n".join(papers), callbacks=get_callbacks("filter") @@ -507,7 +505,6 @@ async def aquery( else: callbacks = get_callbacks("answer") qa_chain = make_chain(self.prompts.qa, self.llm) - print(self.prompts.qa) answer_text = await qa_chain.arun( context=answer.context, answer_length=answer.answer_length, diff --git a/paperqa/prompts.py b/paperqa/prompts.py index 8c4e492e..55596802 100644 --- a/paperqa/prompts.py +++ b/paperqa/prompts.py @@ -25,7 +25,7 @@ 'reply "I cannot answer". ' "For each part of your answer, indicate which sources most support it " "via valid citation markers at the end of sentences, like (Example2012). " - "Answer in an unbiased, comp rehensive, and scholarly tone. " + "Answer in an unbiased, comprehensive, and scholarly tone. " "If the question is subjective, provide an opinionated answer in the concluding 1-2 sentences. \n\n" "{context}\n" "Question: {question}\n" @@ -34,11 +34,12 @@ select_paper_prompt = PromptTemplate( input_variables=["question", "papers"], - template="Select papers to help answer the question below. " + template="Select papers that may help answer the question below. " "Papers are listed as $KEY: $PAPER_INFO. " "Return a list of keys, separated by commas. " 'Return "None", if no papers are applicable. ' - "Choose papers that are relevant, from reputable sources, and timely. \n\n" + "Choose papers that are relevant, from reputable sources, and timely " + "(if the question requires timely information). \n\n" "Question: {question}\n\n" "{papers}\n\n" "Selected keys:", diff --git a/paperqa/readers.py b/paperqa/readers.py index b188c9bd..1b977306 100644 --- a/paperqa/readers.py +++ b/paperqa/readers.py @@ -128,8 +128,8 @@ def parse_code_txt(path: Path, doc: Doc, chunk_chars: int, overlap: int) -> List def read_doc( path: Path, doc: Doc, - chunk_chars: int, - overlap: int, + chunk_chars: int = 3000, + overlap: int = 100, force_pypdf: bool = False, ) -> List[Text]: """Parse a document into chunks.""" diff --git a/paperqa/types.py b/paperqa/types.py index c5743167..a456354f 100644 --- a/paperqa/types.py +++ b/paperqa/types.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Any, Callable, List, Optional, Set, Union +from typing import Any, Callable, Dict, List, Optional, Set, Union from langchain.callbacks.base import BaseCallbackHandler from langchain.callbacks.manager import ( @@ -106,6 +106,11 @@ class Answer(BaseModel): dockey_filter: Optional[Set[DocKey]] = None summary_length: str = "about 100 words" answer_length: str = "about 100 words" + # these two below are for convenience + # and are not set. But you can set them + # if you want to use them. + cost: Optional[float] = None + token_counts: Optional[Dict[str, List[int]]] = None def __str__(self) -> str: """Return the answer as a string.""" diff --git a/paperqa/version.py b/paperqa/version.py index cfa8101f..6b2e3791 100644 --- a/paperqa/version.py +++ b/paperqa/version.py @@ -1 +1 @@ -__version__ = "3.0.0.dev1" +__version__ = "3.0.0.dev2"