From 1c4bfb8c5f8e429b2409762b2c7049cd0004b3b0 Mon Sep 17 00:00:00 2001 From: Chad Norvell Date: Wed, 29 Nov 2023 02:12:32 -0800 Subject: [PATCH] langchain[patch]: Mathpix PDF loader supports arbitrary extra params (#13950) - **Description:** Support providing whatever extra parameters you want to the Mathpix PDF loader API request. - **Issue:** #12773 - **Dependencies:** None --------- Co-authored-by: Bagatur --- libs/langchain/langchain/document_loaders/pdf.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py index e013fc0e446a0..ceb7d292957d8 100644 --- a/libs/langchain/langchain/document_loaders/pdf.py +++ b/libs/langchain/langchain/document_loaders/pdf.py @@ -372,6 +372,7 @@ def __init__( processed_file_format: str = "md", max_wait_time_seconds: int = 500, should_clean_pdf: bool = False, + extra_request_data: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> None: """Initialize with a file path. @@ -382,6 +383,7 @@ def __init__( max_wait_time_seconds: a maximum time to wait for the response from the server. Default is 500. should_clean_pdf: a flag to clean the PDF file. Default is False. + extra_request_data: Additional request data. **kwargs: additional keyword arguments. """ self.mathpix_api_key = get_from_dict_or_env( @@ -392,6 +394,9 @@ def __init__( ) super().__init__(file_path, **kwargs) self.processed_file_format = processed_file_format + self.extra_request_data = ( + extra_request_data if extra_request_data is not None else {} + ) self.max_wait_time_seconds = max_wait_time_seconds self.should_clean_pdf = should_clean_pdf @@ -405,7 +410,10 @@ def url(self) -> str: @property def data(self) -> dict: - options = {"conversion_formats": {self.processed_file_format: True}} + options = { + "conversion_formats": {self.processed_file_format: True}, + **self.extra_request_data, + } return {"options_json": json.dumps(options)} def send_pdf(self) -> str: