diff --git a/paperqa/agents/search.py b/paperqa/agents/search.py index 5084af91..ece22a03 100644 --- a/paperqa/agents/search.py +++ b/paperqa/agents/search.py @@ -343,7 +343,7 @@ async def maybe_get_manifest( async with await anyio.open_file(filename, mode="r") as file: content = await file.read() records = [DocDetails(**row) for row in csv.DictReader(StringIO(content))] - logger.info( + logger.debug( f"Found manifest file at {filename} and read {len(records)} records" " from it." ) @@ -433,8 +433,8 @@ async def get_directory_index( Args: index_name: Override on the name of the index. If unspecified, the default behavior is to generate the name from the input settings. - sync_index_w_directory: Sync the index with the directory (i.e. delete index - files if the file isn't in the source directory). + sync_index_w_directory: Sync the index (add or delete index files) with the + source paper directory. settings: Application settings. """ _settings = get_settings(settings) @@ -460,36 +460,39 @@ async def get_directory_index( manifest_file = directory / manifest_file metadata = await maybe_get_manifest(manifest_file) - valid_files = [ + valid_paper_dir_files = [ file async for file in ( directory.rglob("*") if _settings.index_recursively else directory.iterdir() ) if file.suffix in {".txt", ".pdf", ".html"} ] - if len(valid_files) > WARN_IF_INDEXING_MORE_THAN: + if len(valid_paper_dir_files) > WARN_IF_INDEXING_MORE_THAN: logger.warning( - f"Indexing {len(valid_files)} files. This may take a few minutes." + f"Indexing {len(valid_paper_dir_files)} files. This may take a few minutes." ) - index_files = await search_index.index_files + # NOTE: if the index was not previously built, this will be empty. + # Otherwise, it will not be empty + index_unique_file_paths: set[str] = set((await search_index.index_files).keys()) - if missing := (set(index_files.keys()) - {str(f) for f in valid_files}): + if extra_index_files := ( + index_unique_file_paths - {str(f) for f in valid_paper_dir_files} + ): if sync_index_w_directory: - for missing_file in missing: + for extra_file in extra_index_files: logger.warning( - f"[bold red]Removing {missing_file} from index.[/bold red]" + f"[bold red]Removing {extra_file} from index.[/bold red]" ) - await search_index.remove_from_index(missing_file) + await search_index.remove_from_index(extra_file) logger.warning("[bold red]Files removed![/bold red]") else: logger.warning( - "[bold red]Indexed files are missing from index folder" - f" ({directory}).[/bold red]" + f"[bold red]Indexed files {extra_index_files} are missing from paper" + f" folder ({directory}).[/bold red]" ) - logger.warning(f"[bold red]files: {missing}[/bold red]") async with anyio.create_task_group() as tg: - for file_path in valid_files: + for file_path in valid_paper_dir_files: if sync_index_w_directory: tg.start_soon( process_file, @@ -500,7 +503,7 @@ async def get_directory_index( _settings, ) else: - logger.debug(f"New file {file_path.name} found in directory.") + logger.debug(f"File {file_path.name} found in paper directory.") if search_index.changed: await search_index.save_index() diff --git a/tests/test_agents.py b/tests/test_agents.py index ebc241fe..dd8a1746 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -3,10 +3,13 @@ import itertools import json import re +import shutil +import tempfile import time from pathlib import Path from typing import Any, cast from unittest.mock import patch +from uuid import uuid4 import ldp.agent import pytest @@ -38,22 +41,56 @@ @pytest.mark.asyncio async def test_get_directory_index(agent_test_settings: Settings) -> None: - index = await get_directory_index(settings=agent_test_settings) - assert index.fields == [ - "file_location", - "body", - "title", - "year", - ], "Incorrect fields in index" - # paper.pdf + empty.txt + flag_day.html + bates.txt + obama.txt, - # but empty.txt fails to be added - path_to_id = await index.index_files - assert ( - sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 4 - ), "Incorrect number of parsed index files" - results = await index.query(query="who is Frederick Bates?") - paper_dir = cast(Path, agent_test_settings.paper_directory) - assert results[0].docs.keys() == {md5sum((paper_dir / "bates.txt").absolute())} + # Since agent_test_settings is used by other tests, and thus uses the same + # paper_directory as other tests, we use a tempdir so we can delete files + # without affecting concurrent tests + with tempfile.TemporaryDirectory() as tempdir: + shutil.copytree( + agent_test_settings.paper_directory, tempdir, dirs_exist_ok=True + ) + paper_dir = agent_test_settings.paper_directory = Path(tempdir) + + index_name = f"stub{uuid4()}" # Unique across test invocations + index = await get_directory_index( + index_name=index_name, settings=agent_test_settings + ) + assert ( + index.index_name == index_name + ), "Index name should match its specification" + assert index.fields == [ + "file_location", + "body", + "title", + "year", + ], "Incorrect fields in index" + # paper.pdf + empty.txt + flag_day.html + bates.txt + obama.txt, + # but empty.txt fails to be added + path_to_id = await index.index_files + assert ( + sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 4 + ), "Incorrect number of parsed index files" + results = await index.query(query="who is Frederick Bates?") + assert results[0].docs.keys() == {md5sum((paper_dir / "bates.txt").absolute())} + + # Check getting the same index name will not reprocess files + with patch.object(Docs, "aadd") as mock_aadd: + index = await get_directory_index( + index_name=index_name, settings=agent_test_settings + ) + assert len(await index.index_files) == len(path_to_id) + mock_aadd.assert_not_awaited(), "Expected we didn't re-add files" + + # Now we actually remove (but not add!) a file from the paper directory, + # and we still don't reprocess files + (paper_dir / "obama.txt").unlink() + with patch.object( + Docs, "aadd", autospec=True, side_effect=Docs.aadd + ) as mock_aadd: + index = await get_directory_index( + index_name=index_name, settings=agent_test_settings + ) + assert len(await index.index_files) == len(path_to_id) - 1 + mock_aadd.assert_not_awaited(), "Expected we didn't re-add files" @pytest.mark.asyncio