diff --git a/autollm/__init__.py b/autollm/__init__.py index d9402ef9..9af96527 100644 --- a/autollm/__init__.py +++ b/autollm/__init__.py @@ -4,7 +4,7 @@ and vector databases, along with various utility functions. """ -__version__ = '0.0.10' +__version__ = '0.0.11' __author__ = 'safevideo' __license__ = 'AGPL-3.0' diff --git a/autollm/utils/document_reading.py b/autollm/utils/document_reading.py index aff91137..ca58f2c2 100644 --- a/autollm/utils/document_reading.py +++ b/autollm/utils/document_reading.py @@ -17,6 +17,7 @@ def read_files_as_documents( input_files: Optional[List] = None, filename_as_id: bool = True, recursive: bool = True, + required_exts: Optional[List[str]] = None, read_as_single_doc: bool = True, **kwargs) -> Sequence[Document]: """ @@ -27,6 +28,7 @@ def read_files_as_documents( input_files (List): List of file paths. filename_as_id (bool): Whether to use the filename as the document id. recursive (bool): Whether to recursively search for files in the input directory. + required_exts (Optional[List[str]]): List of required extensions. read_as_single_doc (bool): If True, read each markdown as a single document. Returns: @@ -42,6 +44,7 @@ def read_files_as_documents( input_files=input_files, filename_as_id=filename_as_id, recursive=recursive, + required_exts=required_exts, **kwargs) # Read and process the documents @@ -51,14 +54,17 @@ def read_files_as_documents( return documents -def read_github_repo_as_documents(git_repo_url: str, - relative_folder_path: Optional[str] = None) -> Sequence[Document]: +def read_github_repo_as_documents( + git_repo_url: str, + relative_folder_path: Optional[str] = None, + required_exts: Optional[List[str]] = None) -> Sequence[Document]: """ A document provider that fetches documents from a specific folder within a GitHub repository. Parameters: git_repo_url (str): The URL of the GitHub repository. relative_folder_path (str, optional): The relative path from the repo root to the folder containing documents. + required_exts (Optional[List[str]]): List of required extensions. Returns: Sequence[Document]: A sequence of Document objects. @@ -78,7 +84,7 @@ def read_github_repo_as_documents(git_repo_url: str, docs_path = temp_dir if relative_folder_path is None else (temp_dir / Path(relative_folder_path)) # Read and process the documents - documents = read_files_as_documents(input_dir=str(docs_path)) + documents = read_files_as_documents(input_dir=str(docs_path), required_exts=required_exts) # Logging (assuming logger is configured) logger.info(f"Operations complete, deleting temporary directory {temp_dir}..") finally: