From 96e18c9eeb1d40a3868693639962fe55c92b24fd Mon Sep 17 00:00:00 2001 From: James Braza Date: Wed, 25 Sep 2024 13:58:32 -0700 Subject: [PATCH 1/5] Testing with custom index name --- tests/test_agents.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_agents.py b/tests/test_agents.py index ebc241fe..d89de2b6 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Any, cast from unittest.mock import patch +from uuid import uuid4 import ldp.agent import pytest @@ -38,7 +39,11 @@ @pytest.mark.asyncio async def test_get_directory_index(agent_test_settings: Settings) -> None: - index = await get_directory_index(settings=agent_test_settings) + index_name = f"stub{uuid4()}" # Unique across tests + index = await get_directory_index( + index_name=index_name, settings=agent_test_settings + ) + assert index.index_name == index_name, "Index name should match its specification" assert index.fields == [ "file_location", "body", From 519aab8775e48eaad9f0f6b6ccba2fb3399bb523 Mon Sep 17 00:00:00 2001 From: James Braza Date: Wed, 25 Sep 2024 15:35:49 -0700 Subject: [PATCH 2/5] Testing resume-ability of get_directory_index --- tests/test_agents.py | 70 +++++++++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/tests/test_agents.py b/tests/test_agents.py index d89de2b6..9259329b 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -3,6 +3,8 @@ import itertools import json import re +import shutil +import tempfile import time from pathlib import Path from typing import Any, cast @@ -39,26 +41,54 @@ @pytest.mark.asyncio async def test_get_directory_index(agent_test_settings: Settings) -> None: - index_name = f"stub{uuid4()}" # Unique across tests - index = await get_directory_index( - index_name=index_name, settings=agent_test_settings - ) - assert index.index_name == index_name, "Index name should match its specification" - assert index.fields == [ - "file_location", - "body", - "title", - "year", - ], "Incorrect fields in index" - # paper.pdf + empty.txt + flag_day.html + bates.txt + obama.txt, - # but empty.txt fails to be added - path_to_id = await index.index_files - assert ( - sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 4 - ), "Incorrect number of parsed index files" - results = await index.query(query="who is Frederick Bates?") - paper_dir = cast(Path, agent_test_settings.paper_directory) - assert results[0].docs.keys() == {md5sum((paper_dir / "bates.txt").absolute())} + # Use a tempdir here so we can delete files without affecting concurrent tests + with tempfile.TemporaryDirectory() as tempdir: + shutil.copytree( + agent_test_settings.paper_directory, tempdir, dirs_exist_ok=True + ) + paper_dir = agent_test_settings.paper_directory = Path(tempdir) + + index_name = f"stub{uuid4()}" # Unique across test invocations + index = await get_directory_index( + index_name=index_name, settings=agent_test_settings + ) + assert ( + index.index_name == index_name + ), "Index name should match its specification" + assert index.fields == [ + "file_location", + "body", + "title", + "year", + ], "Incorrect fields in index" + # paper.pdf + empty.txt + flag_day.html + bates.txt + obama.txt, + # but empty.txt fails to be added + path_to_id = await index.index_files + assert ( + sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 4 + ), "Incorrect number of parsed index files" + results = await index.query(query="who is Frederick Bates?") + assert results[0].docs.keys() == {md5sum((paper_dir / "bates.txt").absolute())} + + # Check getting the same index name will not reprocess files + with patch.object(Docs, "aadd") as mock_aadd: + index = await get_directory_index( + index_name=index_name, settings=agent_test_settings + ) + assert len(await index.index_files) == len(path_to_id) + mock_aadd.assert_not_awaited(), "Expected we didn't re-add files" + + # Now we actually remove (but not add!) a file from the paper directory, + # and we still don't reprocess files + (paper_dir / "obama.txt").unlink() + with patch.object( + Docs, "aadd", autospec=True, side_effect=Docs.aadd + ) as mock_aadd: + index = await get_directory_index( + index_name=index_name, settings=agent_test_settings + ) + assert len(await index.index_files) == len(path_to_id) - 1 + mock_aadd.assert_not_awaited(), "Expected we didn't re-add files" @pytest.mark.asyncio From a5399aa1626c6a3df2767c0322c062cf2832902f Mon Sep 17 00:00:00 2001 From: James Braza Date: Wed, 25 Sep 2024 15:38:28 -0700 Subject: [PATCH 3/5] Made variable names more intuitive, and added type hints --- paperqa/agents/search.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/paperqa/agents/search.py b/paperqa/agents/search.py index 5084af91..e267584f 100644 --- a/paperqa/agents/search.py +++ b/paperqa/agents/search.py @@ -433,8 +433,8 @@ async def get_directory_index( Args: index_name: Override on the name of the index. If unspecified, the default behavior is to generate the name from the input settings. - sync_index_w_directory: Sync the index with the directory (i.e. delete index - files if the file isn't in the source directory). + sync_index_w_directory: Sync the index (add or delete index files) with the + source paper directory. settings: Application settings. """ _settings = get_settings(settings) @@ -460,36 +460,39 @@ async def get_directory_index( manifest_file = directory / manifest_file metadata = await maybe_get_manifest(manifest_file) - valid_files = [ + valid_paper_dir_files = [ file async for file in ( directory.rglob("*") if _settings.index_recursively else directory.iterdir() ) if file.suffix in {".txt", ".pdf", ".html"} ] - if len(valid_files) > WARN_IF_INDEXING_MORE_THAN: + if len(valid_paper_dir_files) > WARN_IF_INDEXING_MORE_THAN: logger.warning( - f"Indexing {len(valid_files)} files. This may take a few minutes." + f"Indexing {len(valid_paper_dir_files)} files. This may take a few minutes." ) - index_files = await search_index.index_files + # NOTE: if the index was not previously built, this will be empty. + # Otherwise, it will not be empty + index_unique_file_paths: set[str] = set((await search_index.index_files).keys()) - if missing := (set(index_files.keys()) - {str(f) for f in valid_files}): + if extra_index_files := ( + index_unique_file_paths - {str(f) for f in valid_paper_dir_files} + ): if sync_index_w_directory: - for missing_file in missing: + for extra_file in extra_index_files: logger.warning( - f"[bold red]Removing {missing_file} from index.[/bold red]" + f"[bold red]Removing {extra_file} from index.[/bold red]" ) - await search_index.remove_from_index(missing_file) + await search_index.remove_from_index(extra_file) logger.warning("[bold red]Files removed![/bold red]") else: logger.warning( - "[bold red]Indexed files are missing from index folder" - f" ({directory}).[/bold red]" + f"[bold red]Indexed files {extra_index_files} are missing from paper" + f" folder ({directory}).[/bold red]" ) - logger.warning(f"[bold red]files: {missing}[/bold red]") async with anyio.create_task_group() as tg: - for file_path in valid_files: + for file_path in valid_paper_dir_files: if sync_index_w_directory: tg.start_soon( process_file, @@ -500,7 +503,7 @@ async def get_directory_index( _settings, ) else: - logger.debug(f"New file {file_path.name} found in directory.") + logger.debug(f"File {file_path.name} found in paper directory.") if search_index.changed: await search_index.save_index() From f08d2f30282dc964f3948aec10ce1c4beff0f5ff Mon Sep 17 00:00:00 2001 From: James Braza Date: Wed, 25 Sep 2024 12:14:54 -0700 Subject: [PATCH 4/5] Converted maybe_get_manifest to debug log to avoid noisy logs --- paperqa/agents/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paperqa/agents/search.py b/paperqa/agents/search.py index e267584f..ece22a03 100644 --- a/paperqa/agents/search.py +++ b/paperqa/agents/search.py @@ -343,7 +343,7 @@ async def maybe_get_manifest( async with await anyio.open_file(filename, mode="r") as file: content = await file.read() records = [DocDetails(**row) for row in csv.DictReader(StringIO(content))] - logger.info( + logger.debug( f"Found manifest file at {filename} and read {len(records)} records" " from it." ) From 808f569949845766e5fbd9fdde5c1756bbcf396a Mon Sep 17 00:00:00 2001 From: James Braza Date: Wed, 25 Sep 2024 16:48:38 -0700 Subject: [PATCH 5/5] Expanded comment explaining why TemporaryDirectory in test_get_directory_index --- tests/test_agents.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_agents.py b/tests/test_agents.py index 9259329b..dd8a1746 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -41,7 +41,9 @@ @pytest.mark.asyncio async def test_get_directory_index(agent_test_settings: Settings) -> None: - # Use a tempdir here so we can delete files without affecting concurrent tests + # Since agent_test_settings is used by other tests, and thus uses the same + # paper_directory as other tests, we use a tempdir so we can delete files + # without affecting concurrent tests with tempfile.TemporaryDirectory() as tempdir: shutil.copytree( agent_test_settings.paper_directory, tempdir, dirs_exist_ok=True