From 72ef41547aed158f6e5fe25ffc503ff17aded4ab Mon Sep 17 00:00:00 2001
From: Andrew White <white.d.andrew@gmail.com>
Date: Tue, 13 Jun 2023 12:46:17 -0400
Subject: [PATCH] More flexible types (#139)

* Added optional `cost` and `token_counts` fields to `Answer` for future use

* Restored default values for `chunk_chars` and `overlap` in `read_doc`

* Exported the `Doc` and `Text` types from the package top level

* Added more typing hints
---
 paperqa/__init__.py | 4 ++--
 paperqa/docs.py     | 7 ++-----
 paperqa/prompts.py  | 7 ++++---
 paperqa/readers.py  | 4 ++--
 paperqa/types.py    | 7 ++++++-
 paperqa/version.py  | 2 +-
 6 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/paperqa/__init__.py b/paperqa/__init__.py
index 35692897..f2595c09 100644
--- a/paperqa/__init__.py
+++ b/paperqa/__init__.py
@@ -1,4 +1,4 @@
-from .docs import Answer, Docs, PromptCollection
+from .docs import Answer, Docs, PromptCollection, Doc, Text
 from .version import __version__
 
-__all__ = ["Docs", "Answer", "PromptCollection", "__version__"]
+__all__ = ["Docs", "Answer", "PromptCollection", "__version__", "Doc", "Text"]
diff --git a/paperqa/docs.py b/paperqa/docs.py
index 83f26b38..f2b81144 100644
--- a/paperqa/docs.py
+++ b/paperqa/docs.py
@@ -202,7 +202,7 @@ def add_texts(
     ):
         """Add chunked texts to the collection. This is useful if you have already chunked the texts yourself."""
         if doc.dockey in self.docs:
-            raise ValueError("Document already in collection.")
+            raise ValueError(f"Document {doc.dockey} already in collection.")
         if len(texts) == 0:
             raise ValueError("No texts to add.")
         if doc.docname in self.docnames:
@@ -261,9 +261,7 @@ async def adoc_match(
             query, k=k + len(self.deleted_dockeys)
         )
         matched_docs = [self.docs[m.metadata["dockey"]] for m in matches]
-        chain = make_chain(
-            self.prompts.select, cast(BaseLanguageModel, self.summary_llm)
-        )
+        chain = make_chain(self.prompts.select, cast(BaseLanguageModel, self.llm))
         papers = [f"{d.docname}: {d.citation}" for d in matched_docs]
         result = await chain.arun(  # type: ignore
             question=query, papers="\n".join(papers), callbacks=get_callbacks("filter")
@@ -507,7 +505,6 @@ async def aquery(
         else:
             callbacks = get_callbacks("answer")
             qa_chain = make_chain(self.prompts.qa, self.llm)
-            print(self.prompts.qa)
             answer_text = await qa_chain.arun(
                 context=answer.context,
                 answer_length=answer.answer_length,
diff --git a/paperqa/prompts.py b/paperqa/prompts.py
index 8c4e492e..55596802 100644
--- a/paperqa/prompts.py
+++ b/paperqa/prompts.py
@@ -25,7 +25,7 @@
     'reply "I cannot answer". '
     "For each part of your answer, indicate which sources most support it "
     "via valid citation markers at the end of sentences, like (Example2012). "
-    "Answer in an unbiased, comp rehensive, and scholarly tone. "
+    "Answer in an unbiased, comprehensive, and scholarly tone. "
     "If the question is subjective, provide an opinionated answer in the concluding 1-2 sentences. \n\n"
     "{context}\n"
     "Question: {question}\n"
@@ -34,11 +34,12 @@
 
 select_paper_prompt = PromptTemplate(
     input_variables=["question", "papers"],
-    template="Select papers to help answer the question below. "
+    template="Select papers that may help answer the question below. "
     "Papers are listed as $KEY: $PAPER_INFO. "
     "Return a list of keys, separated by commas. "
     'Return "None", if no papers are applicable. '
-    "Choose papers that are relevant, from reputable sources, and timely. \n\n"
+    "Choose papers that are relevant, from reputable sources, and timely "
+    "(if the question requires timely information). \n\n"
     "Question: {question}\n\n"
     "{papers}\n\n"
     "Selected keys:",
diff --git a/paperqa/readers.py b/paperqa/readers.py
index b188c9bd..1b977306 100644
--- a/paperqa/readers.py
+++ b/paperqa/readers.py
@@ -128,8 +128,8 @@ def parse_code_txt(path: Path, doc: Doc, chunk_chars: int, overlap: int) -> List
 def read_doc(
     path: Path,
     doc: Doc,
-    chunk_chars: int,
-    overlap: int,
+    chunk_chars: int = 3000,
+    overlap: int = 100,
     force_pypdf: bool = False,
 ) -> List[Text]:
     """Parse a document into chunks."""
diff --git a/paperqa/types.py b/paperqa/types.py
index c5743167..a456354f 100644
--- a/paperqa/types.py
+++ b/paperqa/types.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Any, Callable, List, Optional, Set, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Union
 
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.callbacks.manager import (
@@ -106,6 +106,11 @@ class Answer(BaseModel):
     dockey_filter: Optional[Set[DocKey]] = None
     summary_length: str = "about 100 words"
     answer_length: str = "about 100 words"
+    # The two fields below are provided for convenience.
+    # They are not set by default, but you can
+    # populate them yourself if you want to use them.
+    cost: Optional[float] = None
+    token_counts: Optional[Dict[str, List[int]]] = None
 
     def __str__(self) -> str:
         """Return the answer as a string."""
diff --git a/paperqa/version.py b/paperqa/version.py
index cfa8101f..6b2e3791 100644
--- a/paperqa/version.py
+++ b/paperqa/version.py
@@ -1 +1 @@
-__version__ = "3.0.0.dev1"
+__version__ = "3.0.0.dev2"