Skip to content

Commit

Permalink
Made variable names more intuitive, and added type hints
Browse files: browse the repository at this point in the history
  • Loading branch information
jamesbraza committed Sep 25, 2024
1 parent d512860 commit 5ed0a39
Showing 1 changed file with 18 additions and 15 deletions.
33 changes: 18 additions & 15 deletions paperqa/agents/search.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -433,8 +433,8 @@ async def get_directory_index(
Args:
index_name: Override on the name of the index. If unspecified, the default
behavior is to generate the name from the input settings.
sync_index_w_directory: Sync the index with the directory (i.e. delete index
files if the file isn't in the source directory).
sync_index_w_directory: Sync the index (add or delete index files) with the
source paper directory.
settings: Application settings.
"""
_settings = get_settings(settings)
Expand All @@ -460,36 +460,39 @@ async def get_directory_index(
manifest_file = directory / manifest_file

metadata = await maybe_get_manifest(manifest_file)
valid_files = [
valid_paper_dir_files = [
file
async for file in (
directory.rglob("*") if _settings.index_recursively else directory.iterdir()
)
if file.suffix in {".txt", ".pdf", ".html"}
]
if len(valid_files) > WARN_IF_INDEXING_MORE_THAN:
if len(valid_paper_dir_files) > WARN_IF_INDEXING_MORE_THAN:
logger.warning(
f"Indexing {len(valid_files)} files. This may take a few minutes."
f"Indexing {len(valid_paper_dir_files)} files. This may take a few minutes."
)
index_files = await search_index.index_files
# NOTE: if the index was not previously built, this will be empty.
# Otherwise, it will not be empty
index_unique_file_paths: set[str] = set((await search_index.index_files).keys())

if missing := (set(index_files.keys()) - {str(f) for f in valid_files}):
if extra_index_files := (
index_unique_file_paths - {str(f) for f in valid_paper_dir_files}
):
if sync_index_w_directory:
for missing_file in missing:
for extra_file in extra_index_files:
logger.warning(
f"[bold red]Removing {missing_file} from index.[/bold red]"
f"[bold red]Removing {extra_file} from index.[/bold red]"
)
await search_index.remove_from_index(missing_file)
await search_index.remove_from_index(extra_file)
logger.warning("[bold red]Files removed![/bold red]")
else:
logger.warning(
"[bold red]Indexed files are missing from index folder"
f" ({directory}).[/bold red]"
f"[bold red]Indexed files {extra_index_files} are missing from paper"
f" folder ({directory}).[/bold red]"
)
logger.warning(f"[bold red]files: {missing}[/bold red]")

async with anyio.create_task_group() as tg:
for file_path in valid_files:
for file_path in valid_paper_dir_files:
if sync_index_w_directory:
tg.start_soon(
process_file,
Expand All @@ -500,7 +503,7 @@ async def get_directory_index(
_settings,
)
else:
logger.debug(f"New file {file_path.name} found in directory.")
logger.debug(f"File {file_path.name} found in paper directory.")

if search_index.changed:
await search_index.save_index()
Expand Down

0 comments on commit 5ed0a39

Please sign in to comment.