Skip to content
This repository has been archived by the owner on Sep 12, 2024. It is now read-only.

Commit

Permalink
add required extensions option to document readers (#50)
Browse files (browse the repository at this point in the history)
* add required exts option to document readers

* update autollm version
  • Loading branch information
SeeknnDestroy authored Oct 28, 2023
1 parent 9fea5f3 commit 0783d75
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 4 deletions.
2 changes: 1 addition & 1 deletion autollm/__init__.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,7 +4,7 @@
and vector databases, along with various utility functions.
"""

__version__ = '0.0.10'
__version__ = '0.0.11'
__author__ = 'safevideo'
__license__ = 'AGPL-3.0'

Expand Down
12 changes: 9 additions & 3 deletions autollm/utils/document_reading.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -17,6 +17,7 @@ def read_files_as_documents(
input_files: Optional[List] = None,
filename_as_id: bool = True,
recursive: bool = True,
required_exts: Optional[List[str]] = None,
read_as_single_doc: bool = True,
**kwargs) -> Sequence[Document]:
"""
Expand All @@ -27,6 +28,7 @@ def read_files_as_documents(
input_files (List): List of file paths.
filename_as_id (bool): Whether to use the filename as the document id.
recursive (bool): Whether to recursively search for files in the input directory.
required_exts (Optional[List[str]]): List of required extensions.
read_as_single_doc (bool): If True, read each markdown as a single document.
Returns:
Expand All @@ -42,6 +44,7 @@ def read_files_as_documents(
input_files=input_files,
filename_as_id=filename_as_id,
recursive=recursive,
required_exts=required_exts,
**kwargs)

# Read and process the documents
Expand All @@ -51,14 +54,17 @@ def read_files_as_documents(
return documents


def read_github_repo_as_documents(git_repo_url: str,
relative_folder_path: Optional[str] = None) -> Sequence[Document]:
def read_github_repo_as_documents(
git_repo_url: str,
relative_folder_path: Optional[str] = None,
required_exts: Optional[List[str]] = None) -> Sequence[Document]:
"""
A document provider that fetches documents from a specific folder within a GitHub repository.
Parameters:
git_repo_url (str): The URL of the GitHub repository.
relative_folder_path (str, optional): The relative path from the repo root to the folder containing documents.
required_exts (Optional[List[str]]): List of required extensions.
Returns:
Sequence[Document]: A sequence of Document objects.
Expand All @@ -78,7 +84,7 @@ def read_github_repo_as_documents(git_repo_url: str,
docs_path = temp_dir if relative_folder_path is None else (temp_dir / Path(relative_folder_path))

# Read and process the documents
documents = read_files_as_documents(input_dir=str(docs_path))
documents = read_files_as_documents(input_dir=str(docs_path), required_exts=required_exts)
# Logging (assuming logger is configured)
logger.info(f"Operations complete, deleting temporary directory {temp_dir}..")
finally:
Expand Down

0 comments on commit 0783d75

Please sign in to comment.