Cleaning up and testing get_directory_index #483

Merged: 5 commits, Sep 25, 2024
35 changes: 19 additions & 16 deletions paperqa/agents/search.py
@@ -343,7 +343,7 @@ async def maybe_get_manifest(
     async with await anyio.open_file(filename, mode="r") as file:
         content = await file.read()
     records = [DocDetails(**row) for row in csv.DictReader(StringIO(content))]
-    logger.info(
+    logger.debug(
         f"Found manifest file at {filename} and read {len(records)} records"
         " from it."
     )

Collaborator (author) commented on the logger.debug change:
This was a bit noisy, so downgraded its level.
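Note on this change: once the message is at debug level it will no longer show up with default logging configuration. If you still want it while troubleshooting indexing, raising the verbosity of the relevant logger should work; a minimal sketch, assuming paperqa's loggers follow the usual `logging.getLogger(__name__)` convention (so this module's logger would be named `paperqa.agents.search`):

```python
import logging

# Assumption: the module logger is named after its import path.
logging.basicConfig(level=logging.INFO)
logging.getLogger("paperqa.agents.search").setLevel(logging.DEBUG)
```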
@@ -433,8 +433,8 @@ async def get_directory_index(
     Args:
         index_name: Override on the name of the index. If unspecified, the default
             behavior is to generate the name from the input settings.
-        sync_index_w_directory: Sync the index with the directory (i.e. delete index
-            files if the file isn't in the source directory).
+        sync_index_w_directory: Sync the index (add or delete index files) with the
+            source paper directory.
         settings: Application settings.
     """
     _settings = get_settings(settings)
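For context on the reworded docstring, here is a minimal call sketch of `get_directory_index` with `sync_index_w_directory` passed explicitly. The `Settings` import path and the `Settings(paper_directory=...)` construction are assumptions for illustration, not verified against a released paperqa version:

```python
import asyncio

from paperqa.agents.search import get_directory_index  # module per this diff
from paperqa.settings import Settings  # assumed import path


async def build_index() -> None:
    settings = Settings(paper_directory="my_papers")  # hypothetical directory
    # Per the new docstring: syncing both adds newly found files and removes
    # index entries whose source file has disappeared from the paper directory.
    index = await get_directory_index(sync_index_w_directory=True, settings=settings)
    print(await index.index_files)  # mapping of file path -> document id


asyncio.run(build_index())
```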
@@ -460,36 +460,39 @@
         manifest_file = directory / manifest_file

     metadata = await maybe_get_manifest(manifest_file)
-    valid_files = [
+    valid_paper_dir_files = [
         file
         async for file in (
             directory.rglob("*") if _settings.index_recursively else directory.iterdir()
         )
         if file.suffix in {".txt", ".pdf", ".html"}
     ]
-    if len(valid_files) > WARN_IF_INDEXING_MORE_THAN:
+    if len(valid_paper_dir_files) > WARN_IF_INDEXING_MORE_THAN:
         logger.warning(
-            f"Indexing {len(valid_files)} files. This may take a few minutes."
+            f"Indexing {len(valid_paper_dir_files)} files. This may take a few minutes."
         )
-    index_files = await search_index.index_files
+    # NOTE: if the index was not previously built, this will be empty.
+    # Otherwise, it will not be empty
+    index_unique_file_paths: set[str] = set((await search_index.index_files).keys())

-    if missing := (set(index_files.keys()) - {str(f) for f in valid_files}):
+    if extra_index_files := (
+        index_unique_file_paths - {str(f) for f in valid_paper_dir_files}
+    ):
         if sync_index_w_directory:
-            for missing_file in missing:
+            for extra_file in extra_index_files:
                 logger.warning(
-                    f"[bold red]Removing {missing_file} from index.[/bold red]"
+                    f"[bold red]Removing {extra_file} from index.[/bold red]"
                 )
-                await search_index.remove_from_index(missing_file)
+                await search_index.remove_from_index(extra_file)
             logger.warning("[bold red]Files removed![/bold red]")
         else:
             logger.warning(
-                "[bold red]Indexed files are missing from index folder"
-                f" ({directory}).[/bold red]"
+                f"[bold red]Indexed files {extra_index_files} are missing from paper"
+                f" folder ({directory}).[/bold red]"
             )
-            logger.warning(f"[bold red]files: {missing}[/bold red]")

     async with anyio.create_task_group() as tg:
-        for file_path in valid_files:
+        for file_path in valid_paper_dir_files:
             if sync_index_w_directory:
                 tg.start_soon(
                     process_file,

@@ -500,7 +503,7 @@
                     _settings,
                 )
             else:
-                logger.debug(f"New file {file_path.name} found in directory.")
+                logger.debug(f"File {file_path.name} found in paper directory.")

     if search_index.changed:
         await search_index.save_index()
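The renaming from `missing`/`valid_files` to `extra_index_files`/`valid_paper_dir_files` makes the direction of the comparison explicit: entries present in the index but no longer on disk are stale, and syncing removes them. A stripped-down sketch of that set arithmetic (names here are illustrative, not paperqa's API):

```python
from pathlib import Path


def find_stale_index_entries(
    indexed_paths: set[str], paper_dir_files: list[Path]
) -> set[str]:
    """Return index entries whose source file is no longer in the paper directory."""
    return indexed_paths - {str(f) for f in paper_dir_files}


# "old.pdf" was indexed earlier but has since been deleted from disk.
indexed = {"/papers/bates.txt", "/papers/old.pdf"}
on_disk = [Path("/papers/bates.txt"), Path("/papers/new.pdf")]
assert find_stale_index_entries(indexed, on_disk) == {"/papers/old.pdf"}
```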
69 changes: 53 additions & 16 deletions tests/test_agents.py
@@ -3,10 +3,13 @@
 import itertools
 import json
 import re
+import shutil
+import tempfile
 import time
 from pathlib import Path
 from typing import Any, cast
 from unittest.mock import patch
+from uuid import uuid4

 import ldp.agent
 import pytest
@@ -38,22 +41,56 @@
 @pytest.mark.asyncio
 async def test_get_directory_index(agent_test_settings: Settings) -> None:
-    index = await get_directory_index(settings=agent_test_settings)
-    assert index.fields == [
-        "file_location",
-        "body",
-        "title",
-        "year",
-    ], "Incorrect fields in index"
-    # paper.pdf + empty.txt + flag_day.html + bates.txt + obama.txt,
-    # but empty.txt fails to be added
-    path_to_id = await index.index_files
-    assert (
-        sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 4
-    ), "Incorrect number of parsed index files"
-    results = await index.query(query="who is Frederick Bates?")
-    paper_dir = cast(Path, agent_test_settings.paper_directory)
-    assert results[0].docs.keys() == {md5sum((paper_dir / "bates.txt").absolute())}
+    # Since agent_test_settings is used by other tests, and thus uses the same
+    # paper_directory as other tests, we use a tempdir so we can delete files
+    # without affecting concurrent tests
+    with tempfile.TemporaryDirectory() as tempdir:
+        shutil.copytree(
+            agent_test_settings.paper_directory, tempdir, dirs_exist_ok=True
+        )
+        paper_dir = agent_test_settings.paper_directory = Path(tempdir)
+
+        index_name = f"stub{uuid4()}"  # Unique across test invocations
+        index = await get_directory_index(
+            index_name=index_name, settings=agent_test_settings
+        )
+        assert (
+            index.index_name == index_name
+        ), "Index name should match its specification"
+        assert index.fields == [
+            "file_location",
+            "body",
+            "title",
+            "year",
+        ], "Incorrect fields in index"
+        # paper.pdf + empty.txt + flag_day.html + bates.txt + obama.txt,
+        # but empty.txt fails to be added
+        path_to_id = await index.index_files
+        assert (
+            sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 4
+        ), "Incorrect number of parsed index files"
+        results = await index.query(query="who is Frederick Bates?")
+        assert results[0].docs.keys() == {md5sum((paper_dir / "bates.txt").absolute())}
+
+        # Check getting the same index name will not reprocess files
+        with patch.object(Docs, "aadd") as mock_aadd:
+            index = await get_directory_index(
+                index_name=index_name, settings=agent_test_settings
+            )
+        assert len(await index.index_files) == len(path_to_id)
+        mock_aadd.assert_not_awaited(), "Expected we didn't re-add files"
+
+        # Now we actually remove (but not add!) a file from the paper directory,
+        # and we still don't reprocess files
+        (paper_dir / "obama.txt").unlink()
+        with patch.object(
+            Docs, "aadd", autospec=True, side_effect=Docs.aadd
+        ) as mock_aadd:
+            index = await get_directory_index(
+                index_name=index_name, settings=agent_test_settings
+            )
+        assert len(await index.index_files) == len(path_to_id) - 1
+        mock_aadd.assert_not_awaited(), "Expected we didn't re-add files"


 @pytest.mark.asyncio

Review thread on the tempfile.TemporaryDirectory() line:

Collaborator:
I would have thought this fixture would handle that -- did you see deleting files affecting other tests?

Collaborator (author):
Yeah good question. I just expanded the explanatory comment to address this:

    # Since agent_test_settings is used by other tests, and thus uses the same
    # paper_directory as other tests, we use a tempdir so we can delete files
    # without affecting concurrent tests
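On the reviewer's question above: an alternative to the in-test tempdir would be to bake the isolation into a fixture. A possible shape, sketched with pytest's built-in `tmp_path` fixture; the `STUB_DATA_DIR` constant and the `Settings(paper_directory=...)` construction are hypothetical stand-ins for whatever the real fixture uses:

```python
import shutil
from pathlib import Path

import pytest

from paperqa.settings import Settings  # assumed import path

STUB_DATA_DIR = Path("tests/stub_data")  # hypothetical location of the stub papers


@pytest.fixture
def isolated_agent_test_settings(tmp_path: Path) -> Settings:
    # Copy the shared stub papers into a per-test directory so a test can
    # delete files without affecting concurrently running tests.
    shutil.copytree(STUB_DATA_DIR, tmp_path, dirs_exist_ok=True)
    return Settings(paper_directory=tmp_path)
```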
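The second `patch.object(Docs, "aadd", autospec=True, side_effect=Docs.aadd)` in the test acts as a spy: `autospec` preserves the method's signature and `side_effect` delegates to the real implementation, so behavior is unchanged while the mock records calls (here, to assert no documents are re-added). A self-contained illustration of the same pattern on a toy class:

```python
from unittest.mock import patch


class Greeter:
    def greet(self, name: str) -> str:
        return f"hello {name}"


greeter = Greeter()

# Spy on Greeter.greet: side_effect forwards to the real method, so the
# return value is unchanged, while the mock still records the call.
with patch.object(Greeter, "greet", autospec=True, side_effect=Greeter.greet) as spy:
    assert greeter.greet("Bates") == "hello Bates"

# autospec'd instance methods record `self` as the first call argument.
spy.assert_called_once_with(greeter, "Bates")
```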