From f7e1f1248fabc1843ac8b2d883681f83c4ac39fa Mon Sep 17 00:00:00 2001
From: Li Wan <49334982+wanliAlex@users.noreply.github.com>
Date: Tue, 8 Oct 2024 13:29:41 +1100
Subject: [PATCH 1/7] Fix API tests branch issue (#992)

Fix a bug where API tests are not running on the correct Marqo branch.
---
 .github/workflows/arm64_docker_marqo.yml | 3 ++-
 .github/workflows/cpu_docker_marqo.yml | 3 ++-
 .github/workflows/cpu_local_marqo.yml | 3 ++-
 .github/workflows/cuda_docker_marqo.yml | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/arm64_docker_marqo.yml b/.github/workflows/arm64_docker_marqo.yml
index a00b619af..c447fb187 100644
--- a/.github/workflows/arm64_docker_marqo.yml
+++ b/.github/workflows/arm64_docker_marqo.yml
@@ -116,7 +116,8 @@ jobs:
     - name: Run Integration Tests - ARM64 Docker Marqo
       run: |
-        export MQ_API_TEST_BRANCH=$(echo "${GITHUB_REF}" | cut -d'/' -f3-)
+        export MQ_API_TEST_BRANCH="${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}"
+        echo "$MQ_API_TEST_BRANCH"
         CUSTOM_TEST_IMG="${{ github.event.inputs.image_to_test }}"
         export MQ_API_TEST_IMG=${CUSTOM_TEST_IMG:-"marqo_docker_0"}
         tox -e py3-docker_marqo
diff --git a/.github/workflows/cpu_docker_marqo.yml b/.github/workflows/cpu_docker_marqo.yml
index 911ccf815..ba2221d44 100644
--- a/.github/workflows/cpu_docker_marqo.yml
+++ b/.github/workflows/cpu_docker_marqo.yml
@@ -115,7 +115,8 @@ jobs:
     - name: Run Integration Tests - CPU Docker Marqo
       run: |
-        export MQ_API_TEST_BRANCH=$(echo "${GITHUB_REF}" | cut -d'/' -f3-)
+        export MQ_API_TEST_BRANCH="${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}"
+        echo "$MQ_API_TEST_BRANCH"
         CUSTOM_TEST_IMG="${{ github.event.inputs.image_to_test }}"
         export MQ_API_TEST_IMG=${CUSTOM_TEST_IMG:-"marqo_docker_0"}
         tox -e py3-docker_marqo
diff --git a/.github/workflows/cpu_local_marqo.yml b/.github/workflows/cpu_local_marqo.yml
index 1b2ec2387..94eece8ec 100644
--- a/.github/workflows/cpu_local_marqo.yml
+++ b/.github/workflows/cpu_local_marqo.yml
@@ -120,7 +120,8 @@ jobs:
     - name: Run Integration Tests - CPU Local Marqo
       run: |
-        export MQ_API_TEST_BRANCH=$(echo "${GITHUB_REF}" | cut -d'/' -f3-)
+        export MQ_API_TEST_BRANCH="${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}"
+        echo "$MQ_API_TEST_BRANCH"
         CUSTOM_TEST_IMG="${{ github.event.inputs.image_to_test }}"
         export MQ_API_TEST_IMG=${CUSTOM_TEST_IMG:-"marqo_docker_0"}
         tox -e py3-local_marqo
diff --git a/.github/workflows/cuda_docker_marqo.yml b/.github/workflows/cuda_docker_marqo.yml
index bc2a7e9e3..5a0d3604b 100644
--- a/.github/workflows/cuda_docker_marqo.yml
+++ b/.github/workflows/cuda_docker_marqo.yml
@@ -114,7 +114,8 @@ jobs:
     - name: Run CUDA Integration Tests - CUDA Docker Marqo
       run: |
-        export MQ_API_TEST_BRANCH=$(echo "${GITHUB_REF}" | cut -d'/' -f3-)
+        export MQ_API_TEST_BRANCH="${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}"
+        echo "$MQ_API_TEST_BRANCH"
         CUSTOM_TEST_IMG="${{ github.event.inputs.image_to_test }}"
         export MQ_API_TEST_IMG=${CUSTOM_TEST_IMG:-"marqo_docker_0"}
         tox -e py3-cuda_docker_marqo

From 2d87f1979c13df47efe708908cd8c91ef8c7a57e Mon Sep 17 00:00:00 2001
From: Yihan Zhao
Date: Wed, 9 Oct 2024 16:37:28 +1100
Subject: [PATCH 2/7] Fix the API tests by emptying the index cache after
 updating the index (#991)

---
 src/marqo/core/embed/embed.py | 2 +-
 src/marqo/core/search/hybrid_search.py | 2 +-
 src/marqo/core/search/recommender.py | 2 +-
 .../semi_structured_add_document_handler.py | 5 ++
 .../s2_inference/multimodal_model_load.py | 2 +-
 src/marqo/tensor_search/index_meta_cache.py | 25 ++++---
 src/marqo/tensor_search/tensor_search.py | 10 +--
 .../document/test_partial_document_update.py | 22 +++---
 tests/core/monitoring/test_monitoring.py | 14 ++--
 tests/core/search/test_recommender.py | 4 +-
 tests/marqo_test.py | 8 +--
 tests/s2_inference/test_generic_clip_model.py | 16 ++---
 tests/s2_inference/test_generic_model.py | 2 +-
 .../test_search_regression.py | 4 +-
 .../test_add_documents_combined.py | 24 +++----
 .../test_add_documents_semi_structured.py | 62 ++++++++--------
 ...dd_documents_semi_structured_add_fields.py | 10 ++-
 .../test_add_documents_structured.py | 62 ++++++++--------
 .../test_add_documents_unstructured.py | 62 ++++++++--------
 .../integ_tests/test_custom_vector_field.py | 40 +++++------
 .../integ_tests/test_delete_documents.py | 18 ++---
 .../integ_tests/test_dict_score_modifiers.py | 20 +++---
 tests/tensor_search/integ_tests/test_embed.py | 4 +-
 .../integ_tests/test_get_document.py | 6 +-
 .../integ_tests/test_get_documents_by_ids.py | 8 +--
 .../integ_tests/test_hybrid_search.py | 35 +++++----
 .../integ_tests/test_no_model.py | 6 +-
 .../integ_tests/test_search_combined.py | 26 +++----
 .../test_search_semi_structured.py | 72 +++++++++----------
 .../integ_tests/test_search_structured.py | 50 ++++++-------
 .../integ_tests/test_search_unstructured.py | 72 +++++++++----------
 ...test_add_documents_use_existing_tensors.py | 72 +++++++++----------
 .../test_context_vectors_search.py | 4 +-
 tests/tensor_search/test_default_device.py | 30 ++++----
 .../test_image_download_headers.py | 6 +-
 .../tensor_search/test_image_preprocessing.py | 8 +--
 tests/tensor_search/test_index_meta_cache.py | 36 +++++-----
 tests/tensor_search/test_lexical_search.py | 36 +++++-----
 tests/tensor_search/test_model_auth.py | 48 ++++++------
 tests/tensor_search/test_model_auth_cuda.py | 4 +-
 .../test_multimodal_tensor_combination.py | 46 ++++++------
 tests/tensor_search/test_pagination.py | 10 +--
 tests/tensor_search/test_prefix.py | 18 ++---
 tests/tensor_search/test_search.py | 4 +-
 .../test_searchable_attributes.py | 2 +-
 45 files changed, 511 insertions(+), 508 deletions(-)

diff --git a/src/marqo/core/embed/embed.py b/src/marqo/core/embed/embed.py
index dc96f2318..29d6fcf54 100644
--- a/src/marqo/core/embed/embed.py
+++ b/src/marqo/core/embed/embed.py
@@ -72,7 +72,7 @@ def embed_content(
 
         # Generate input for the vectorise pipeline (Preprocessing)
         RequestMetricsStore.for_request().start("embed.query_preprocessing")
-        marqo_index = index_meta_cache.get_index(config=temp_config, index_name=index_name)
+        marqo_index = index_meta_cache.get_index(index_management=temp_config.index_management, index_name=index_name)
 
         # Transform content to list if it is not already
         if isinstance(content, List):
diff --git a/src/marqo/core/search/hybrid_search.py b/src/marqo/core/search/hybrid_search.py
index c6b496eb2..3bc2e4ead 100644
--- a/src/marqo/core/search/hybrid_search.py
+++ b/src/marqo/core/search/hybrid_search.py
@@ -82,7 +82,7 @@ def search(
 
         RequestMetricsStore.for_request().start("search.hybrid.processing_before_vespa")
 
-        marqo_index = index_meta_cache.get_index(config=config, index_name=index_name)
+        marqo_index = index_meta_cache.get_index(index_management=config.index_management, index_name=index_name)
 
         # Version checks (different for structured and unstructured)
         marqo_index_version = marqo_index.parsed_marqo_version()
diff --git a/src/marqo/core/search/recommender.py b/src/marqo/core/search/recommender.py
index f0ffb7875..fbf5fd8fe 100644
--- a/src/marqo/core/search/recommender.py
+++ b/src/marqo/core/search/recommender.py
@@ -80,7 +80,7 @@ def recommend(self,
         if len(documents) == 0:
             raise InvalidArgumentError('No documents with non-zero weight provided')
 
-        marqo_index = index_meta_cache.get_index(config.Config(self.vespa_client), index_name=index_name)
+        marqo_index = index_meta_cache.get_index(index_management=self.index_management, index_name=index_name)
 
         if interpolation_method is None:
             interpolation_method = self._get_default_interpolation_method(marqo_index)
diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py b/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py
index ee62cd1ef..e7728dc3c 100644
--- a/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py
+++ b/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py
@@ -66,6 +66,11 @@ def _pre_persist_to_vespa(self):
         if self.should_update_index:
             with RequestMetricsStore.for_request().time("add_documents.update_index"):
                 self.index_management.update_index(self.marqo_index)
+            # Force a refresh of this index in the index cache to make sure subsequent search requests get the latest index.
+            # TODO this is a temporary solution to fix the consistency issue for single-instance Marqo (used extensively
+            #  in api-tests and integration tests). Find a better way to solve the consistency issue for Marqo clusters.
+            from marqo.tensor_search import index_meta_cache
+            index_meta_cache.get_index(self.index_management, self.marqo_index.name, force_refresh=True)
 
     def _add_lexical_field_to_index(self, field_name):
         if field_name in self.marqo_index.field_map:
diff --git a/src/marqo/s2_inference/multimodal_model_load.py b/src/marqo/s2_inference/multimodal_model_load.py
index 5fc9d03a4..173630c22 100644
--- a/src/marqo/s2_inference/multimodal_model_load.py
+++ b/src/marqo/s2_inference/multimodal_model_load.py
@@ -126,7 +126,7 @@ def __init__(self, model):
         self.model = model
 
     def encode(self, content, modality, **kwargs):
-        return self.model.encode(content)
+        return self.model.encode(content, **kwargs)
 
 
 @contextmanager
diff --git a/src/marqo/tensor_search/index_meta_cache.py b/src/marqo/tensor_search/index_meta_cache.py
index 551f8ded8..fca559abd 100644
--- a/src/marqo/tensor_search/index_meta_cache.py
+++ b/src/marqo/tensor_search/index_meta_cache.py
@@ -25,7 +25,7 @@
 # Because it is non thread safe, there is a chance multiple threads push out
 # multiple refresh requests at the same. It isn't a critical problem if that
 # happens.
-cache_refresh_interval: int = 10  # seconds
+cache_refresh_interval: int = 1  # seconds
 cache_refresh_log_interval: int = 60
 cache_refresh_last_logged_time: float = 0
 refresh_thread = None
@@ -41,22 +41,27 @@ def get_cache() -> Dict[str, MarqoIndex]:
     return index_info_cache
 
 
-def get_index(config: Config, index_name: str, force_refresh=False) -> MarqoIndex:
+def get_index(index_management: IndexManagement, index_name: str, force_refresh=False) -> MarqoIndex:
     """
     Get an index.
 
     Args:
+        index_management: IndexManagement object to load the index if not found in cache
+        index_name (str): Name of the index to retrieve.
         force_refresh: Get index from Vespa even if already in cache. If False, Vespa is called
         only if index is not found in cache.
 
     Returns:
+        The latest index if it is not already in the cache or the force_refresh flag is True; otherwise, the cached index
 
+    Raises:
+        IndexNotFoundError: If the index is not found, even after refreshing the cache.
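+
+    Example:
+        Illustrative sketch only (not part of the original change); assumes an
+        IndexManagement instance is available, e.g. config.index_management:
+
+        >>> index = get_index(index_management=index_management, index_name='my-index', force_refresh=True)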
""" # Make sure refresh thread is running - _check_refresh_thread(config) + _check_refresh_thread(index_management) if force_refresh or index_name not in index_info_cache: - _refresh_index(config, index_name) + _refresh_index(index_management, index_name) if index_name in index_info_cache: return index_info_cache[index_name] @@ -65,11 +70,10 @@ def get_index(config: Config, index_name: str, force_refresh=False) -> MarqoInde raise exceptions.IndexNotFoundError(f"Index {index_name} not found") -def _refresh_index(config: Config, index_name: str) -> None: +def _refresh_index(index_management: IndexManagement, index_name: str) -> None: """ Refresh cache for a specific index """ - index_management = IndexManagement(config.vespa_client) try: index = index_management.get_index(index_name) except IndexNotFoundError as e: @@ -78,7 +82,7 @@ def _refresh_index(config: Config, index_name: str) -> None: index_info_cache[index_name] = index -def _check_refresh_thread(config: Config): +def _check_refresh_thread(index_management: IndexManagement): if refresh_lock.locked(): # Another thread is running this function, skip as concurrent changes to the thread can error out logger.debug('Refresh thread is locked. Skipping') @@ -98,7 +102,7 @@ def refresh(): try: global cache_refresh_last_logged_time - populate_cache(config) + populate_cache(index_management) if time.time() - cache_refresh_last_logged_time > cache_refresh_log_interval: cache_refresh_last_logged_time = time.time() @@ -126,15 +130,14 @@ def refresh(): def start_refresh_thread(config: Config): - _check_refresh_thread(config) + _check_refresh_thread(config.index_management) -def populate_cache(config: Config): +def populate_cache(index_management: IndexManagement): """ Refresh cache for all indexes """ global index_info_cache - index_management = IndexManagement(config.vespa_client) indexes = index_management.get_all_indexes() # Enable caching and reset any existing model caches diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py index c0d6cf3e7..a15646254 100644 --- a/src/marqo/tensor_search/tensor_search.py +++ b/src/marqo/tensor_search/tensor_search.py @@ -110,7 +110,7 @@ def add_documents(config: Config, add_docs_params: AddDocsParams) -> MarqoAddDoc """ try: marqo_index = index_meta_cache.get_index( - config=config, index_name=add_docs_params.index_name, force_refresh=True + index_management=config.index_management, index_name=add_docs_params.index_name, force_refresh=True ) # TODO: raise core_exceptions.IndexNotFoundError instead (fix associated tests) @@ -1405,7 +1405,7 @@ def _get_latest_index(config: Config, index_name: str) -> MarqoIndex: never change. It also makes sure we always get the latest version of semi-structured index to guarantee the strong consistency. """ - marqo_index = index_meta_cache.get_index(config=config, index_name=index_name) + marqo_index = index_meta_cache.get_index(index_management=config.index_management, index_name=index_name) if marqo_index.type == IndexType.SemiStructured: return config.index_management.get_index(index_name=index_name) return marqo_index @@ -1677,7 +1677,7 @@ def _lexical_search( f"Query arg must be of type str! text arg is of type {type(text)}. 
" f"Query arg: {text}") - marqo_index = index_meta_cache.get_index(config=config, index_name=index_name) + marqo_index = index_meta_cache.get_index(index_management=config.index_management, index_name=index_name) # SEARCH TIMER-LOGGER (pre-processing) RequestMetricsStore.for_request().start("search.lexical.processing_before_vespa") @@ -2136,7 +2136,7 @@ def _vector_text_search( RequestMetricsStore.for_request().start("search.vector.processing_before_vespa") - marqo_index = index_meta_cache.get_index(config=config, index_name=index_name) + marqo_index = index_meta_cache.get_index(index_management=config.index_management, index_name=index_name) # Determine the text query prefix text_query_prefix = marqo_index.model.get_text_query_prefix(text_query_prefix) @@ -2699,7 +2699,7 @@ def vectorise_multimodal_combination_field_structured( def delete_documents(config: Config, index_name: str, doc_ids: List[str]): """Delete documents from the Marqo index with the given doc_ids """ # Make sure the index exists - marqo_index = index_meta_cache.get_index(config=config, index_name=index_name) + marqo_index = index_meta_cache.get_index(index_management=config.index_management, index_name=index_name) return delete_docs.delete_documents( config=config, diff --git a/tests/core/document/test_partial_document_update.py b/tests/core/document/test_partial_document_update.py index dca74c86d..fda6bcd29 100644 --- a/tests/core/document/test_partial_document_update.py +++ b/tests/core/document/test_partial_document_update.py @@ -133,7 +133,7 @@ def _set_up_for_text_field_test(self): "text_field_tensor": "text field tensor", "_id": "1" } - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.structured_index_name, docs=[original_doc] )) @@ -149,7 +149,7 @@ def _set_up_for_int_field_test(self): "text_field_tensor": "text field tensor", "_id": "1" } - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.structured_index_name, docs=[original_doc] )) @@ -165,7 +165,7 @@ def _set_up_for_float_field_test(self): "text_field_tensor": "text field tensor", "_id": "1" } - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.structured_index_name, docs=[original_doc] )) @@ -369,7 +369,7 @@ def test_update_bool_field_filter(self): "text_field_tensor": "search me", "_id": "1" } - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.structured_index_name, docs=[original_doc] )) @@ -402,7 +402,7 @@ def test_update_image_pointer_field(self): "text_field_tensor": "search me", "_id": "1" } - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.structured_index_name, docs=[original_doc] )) @@ -455,7 +455,7 @@ def test_update_multimodal_image_field(self): mappings = None # Add the original document - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, @@ -470,7 +470,7 @@ def test_update_multimodal_image_field(self): "_id": "1", "image_field_1": 
updated_image_url } - r = self.add_documents_and_refresh_index( + r = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, @@ -506,7 +506,7 @@ def test_update_multimodal_dependent_field(self): "dependent_field_2": "dependent field 2", "_id": "1" } - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.structured_index_name, docs=[original_doc] )) @@ -529,7 +529,7 @@ def test_update_array_text_field_filter(self): "text_field_tensor": "search me", "_id": "1" } - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.structured_index_name, docs=[original_doc] )) @@ -657,7 +657,7 @@ def test_multi_threading_update(self): "_id": "1" } - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.structured_index_name, docs=[original_document] )) @@ -733,7 +733,7 @@ def test_multi_threading_update_for_large_score_modifier_fields(self): original_document["text_field_tensor"] = "text field tensor" original_document["_id"] = "1" - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.large_score_modifier_index_name, docs=[original_document] )) diff --git a/tests/core/monitoring/test_monitoring.py b/tests/core/monitoring/test_monitoring.py index c0b110405..760a09a36 100644 --- a/tests/core/monitoring/test_monitoring.py +++ b/tests/core/monitoring/test_monitoring.py @@ -155,7 +155,7 @@ def test_get_index_stats_docsWithTensorFields_successful(self): """ for marqo_index in self.indexes_to_test: with self.subTest(f'{marqo_index.name} - {marqo_index.type.value}'): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( docs=[{"title": "2"}, {"title": "2"}, {"title": "62"}], index_name=marqo_index.name, @@ -177,7 +177,7 @@ def test_get_index_stats_structuredMultimodalIndex_successful(self): get_index_stats returns the correct stats for a multimodal index """ marqo_index = self.structured_index_multimodal - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( docs=[ {"title": "2", @@ -205,7 +205,7 @@ def test_get_index_stats_docsWithoutTensorFields_successful(self): """ for marqo_index in self.indexes_to_test: with self.subTest(f'{marqo_index.name} - {marqo_index.type.value}'): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( docs=[{"desc": "2"}, {"desc": "2"}, {"desc": "62"}], index_name=marqo_index.name, @@ -228,7 +228,7 @@ def test_get_index_stats_mixedDocs_successful(self): """ for marqo_index in self.indexes_to_test: with self.subTest(f'{marqo_index.name} - {marqo_index.type.value}'): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( docs=[{"title": "2"}, {"title": "2"}, {"desc": "62"}], index_name=marqo_index.name, @@ -281,7 +281,7 @@ def test_get_index_stats_sequentialIndexingAndDeletion_successful(self): with self.subTest(f'{marqo_index.name} - {marqo_index.type.value}'): for operation, docs, expected_stats in operations: if operation == 'add': - 
self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( docs=docs, index_name=marqo_index.name, @@ -309,7 +309,7 @@ def test_get_index_stats_longText_successful(self): for marqo_index in self.indexes_to_test: with self.subTest(f'{marqo_index.name} - {marqo_index.type.value}'): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( docs=[{"title": "test " * number_of_words}, {"title": "2"}], # 3 + 1 vectors expected index_name=marqo_index.name, @@ -332,7 +332,7 @@ def test_get_index_stats_by_name_docsWithTensorFields_successful(self): """ for marqo_index in self.indexes_to_test: with self.subTest(f'{marqo_index.name} - {marqo_index.type.value}'): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( docs=[{"title": "2"}, {"title": "2"}, {"title": "62"}], index_name=marqo_index.name, diff --git a/tests/core/search/test_recommender.py b/tests/core/search/test_recommender.py index 9da547762..128a50b2f 100644 --- a/tests/core/search/test_recommender.py +++ b/tests/core/search/test_recommender.py @@ -159,7 +159,7 @@ def _populate_index(self, index: MarqoIndex): else: tensor_fields = None - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -354,7 +354,7 @@ def test_recommend_docsWithoutVectors_success(self): } ] index = self.unstructured_text_index - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( index_name=index.name, diff --git a/tests/marqo_test.py b/tests/marqo_test.py index 1ba0fae2f..2eaae9f35 100644 --- a/tests/marqo_test.py +++ b/tests/marqo_test.py @@ -84,11 +84,9 @@ def create_indexes(cls, index_requests: List[MarqoIndexRequest]) -> List[MarqoIn return indexes @classmethod - def add_documents_and_refresh_index(cls, *args, **kwargs): - index_name = kwargs['add_docs_params'].index_name - result = tensor_search.add_documents(*args, **kwargs) - index_meta_cache.get_index(config=cls.config, index_name=index_name, force_refresh=True) - return result + def add_documents(cls, *args, **kwargs): + # TODO change to use config.document.add_documents when tensor_search.add_documents is removed + return tensor_search.add_documents(*args, **kwargs) def setUp(self) -> None: self.clear_indexes(self.indexes) diff --git a/tests/s2_inference/test_generic_clip_model.py b/tests/s2_inference/test_generic_clip_model.py index 1c06f79a3..c1c965805 100644 --- a/tests/s2_inference/test_generic_clip_model.py +++ b/tests/s2_inference/test_generic_clip_model.py @@ -72,9 +72,9 @@ def test_create_index_and_add_documents_with_generic_open_clip_model_properties_ "desc 2": "content 2. 
blah blah blah" }] - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=docs, device="cpu") - ) + ) # test if we can get the document by _id assert tensor_search.get_document_by_id( @@ -93,7 +93,7 @@ def test_create_index_and_add_documents_with_generic_open_clip_model_properties_ "desc 2": "test again test again test again" }] - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=docs2, device="cpu")) assert tensor_search.get_document_by_id( @@ -138,7 +138,7 @@ def test_pipeline_with_generic_openai_clip_model_properties_url(self): "desc 2": "content 2. blah blah blah" }] - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_2, docs=docs, device="cpu" )) @@ -157,7 +157,7 @@ def test_pipeline_with_generic_openai_clip_model_properties_url(self): "desc 2": "test again test again test again" }] - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_2, docs=docs2, device="cpu")) assert tensor_search.get_document_by_id( @@ -205,7 +205,7 @@ def test_pipeline_with_generic_open_clip_model_properties_localpath(self): "desc 2": "content 2. blah blah blah" }] - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=docs, device="cpu")) assert tensor_search.get_document_by_id( @@ -223,7 +223,7 @@ def test_pipeline_with_generic_open_clip_model_properties_localpath(self): "desc 2": "test again test again test again" }] - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=docs2, device="cpu")) assert tensor_search.get_document_by_id( @@ -326,7 +326,7 @@ def test_add_documents_text_and_image(self): "image" : TestImageUrls.COCO.value }] - self.add_documents_and_refresh_index(config=config, add_docs_params=AddDocsParams( + self.add_documents(config=config, add_docs_params=AddDocsParams( index_name=index_name, docs=docs, device="cpu")) diff --git a/tests/s2_inference/test_generic_model.py b/tests/s2_inference/test_generic_model.py index eb393f9cb..de10797a3 100644 --- a/tests/s2_inference/test_generic_model.py +++ b/tests/s2_inference/test_generic_model.py @@ -93,7 +93,7 @@ def test_add_documents(self): "desc 2": "content 2. 
blah blah blah" }] - self.add_documents_and_refresh_index(config=config, add_docs_params=AddDocsParams( + self.add_documents(config=config, add_docs_params=AddDocsParams( index_name=index_name, docs=docs, device="cpu")) def test_validate_model_properties_missing_required_keys(self): diff --git a/tests/tensor_search/backwards_compat/test_search_regression.py b/tests/tensor_search/backwards_compat/test_search_regression.py index 832885593..fa983e3a7 100644 --- a/tests/tensor_search/backwards_compat/test_search_regression.py +++ b/tests/tensor_search/backwards_compat/test_search_regression.py @@ -115,7 +115,7 @@ def test_search_result_scores_match_2_9(self): docs_with_same_bm25_score.append(("doc7", "doc11")) # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -163,7 +163,7 @@ def test_document_vectors_match_2_9(self): for index in [self.structured_text_index_score_modifiers, self.unstructured_text_index]: with self.subTest(index=index.name): # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, diff --git a/tests/tensor_search/integ_tests/test_add_documents_combined.py b/tests/tensor_search/integ_tests/test_add_documents_combined.py index 512cb1f3d..36cdfda6f 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_combined.py +++ b/tests/tensor_search/integ_tests/test_add_documents_combined.py @@ -164,7 +164,7 @@ def test_add_documents_with_truncated_image(self): tensor_fields = ["image_field_1", "text_field_1"] if index_name != self.structured_marqo_index_name \ else None with self.subTest(f"test add documents with truncated image for {index_name}"): - r = self.add_documents_and_refresh_index( + r = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, @@ -193,7 +193,7 @@ def test_add_document_callVectoriseWithoutPassingEnableCache(self): else None with self.subTest(index_name): with patch("marqo.s2_inference.s2_inference.vectorise", return_value=dummy_return) as mock_vectorise: - r = self.add_documents_and_refresh_index( + r = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, @@ -231,7 +231,7 @@ def test_add_multimodal_single_documents(self): for index_name in [self.structured_languagebind_index_name, self.semi_structured_languagebind_index_name, self.unstructured_languagebind_index_name]: with self.subTest(index_name): - res = self.add_documents_and_refresh_index( + res = self.add_documents( self.config, add_docs_params=AddDocsParams( docs=documents, @@ -297,7 +297,7 @@ def test_add_multimodal_field_document(self): }, } } if "unstructured" in index_name else None - res = self.add_documents_and_refresh_index( + res = self.add_documents( self.config, add_docs_params=AddDocsParams( docs=multimodal_document, @@ -340,7 +340,7 @@ def test_imageRepoHandleThreadHandleError_successfully(self): with patch("marqo.s2_inference.clip_utils.requests.get", side_effect=error) \ as mock_requests_get: with self.assertRaises(Exception) as e: - r = self.add_documents_and_refresh_index( + r = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, @@ -364,7 +364,7 @@ def test_addDocumentsPassTensorToVectorise(self): else None with self.subTest(index_name): with patch("marqo.s2_inference.s2_inference.vectorise", return_value=dummy_return) as mock_vectorise: - r = 
self.add_documents_and_refresh_index( + r = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, @@ -398,7 +398,7 @@ def test_downloadImagesThreadCount(self): add_docs, 'threaded_download_and_preprocess_content', wraps=add_docs.threaded_download_and_preprocess_content ) as mock_download_images: - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, docs=docs, device="cpu", image_download_thread_count=thread_count, @@ -422,7 +422,7 @@ def test_image_url_is_embedded_as_image_not_text(self): tensor_fields = ["image_field_1"] if index_name != self.structured_marqo_index_name \ else None with self.subTest(index_name): - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, @@ -481,7 +481,7 @@ def test_multimodal_image_url_is_embedded_as_image_not_text(self): tensor_fields = None mappings = None - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, @@ -606,7 +606,7 @@ def test_resilient_add_images(self): with self.subTest(index_name): for docs, expected_results in docs_results: with self.subTest(f'{expected_results} - {index_name}'): - add_res = self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + add_res = self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=index_name, docs=docs, device="cpu", tensor_fields=tensor_fields)).dict( exclude_none=True, by_alias=True) self.assertEqual(len(expected_results), len(add_res['items'])) @@ -754,7 +754,7 @@ def test_idErrorWhenImageDownloading(self): tensor_fields = ["image_field_1", "text_field_1"] if index_name != self.structured_marqo_index_name \ else None with self.subTest(index_name): - r = self.add_documents_and_refresh_index( + r = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, @@ -897,7 +897,7 @@ def all_embeddings(get_documents_results: list) -> Dict[str, List[float]]: if isinstance(index, UnstructuredMarqoIndex) else None def add_docs(batch_vectorisation_mode: BatchVectorisationMode): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, diff --git a/tests/tensor_search/integ_tests/test_add_documents_semi_structured.py b/tests/tensor_search/integ_tests/test_add_documents_semi_structured.py index 8c59711f6..f395fefe1 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_semi_structured.py +++ b/tests/tensor_search/integ_tests/test_add_documents_semi_structured.py @@ -74,7 +74,7 @@ def test_add_plain_id_field(self): ] for index_name, desc in tests: with self.subTest(desc): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[{ @@ -103,7 +103,7 @@ def test_add_documents_dupe_ids(self): """ # Add once to get vectors - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[{ @@ -117,7 +117,7 @@ def test_add_documents_dupe_ids(self): config=self.config, index_name=self.default_text_index, document_id="1", show_vectors=True)['_tensor_facets'] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( 
index_name=self.default_text_index, docs=[ @@ -129,7 +129,7 @@ def test_add_documents_dupe_ids(self): device="cpu", tensor_fields=["title"] ) ) - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[ @@ -157,7 +157,7 @@ def test_add_documents_with_missing_index_fails(self): rand_index = 'a' + str(uuid.uuid4()).replace('-', '') with pytest.raises(IndexNotFoundError): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=rand_index, docs=[{"abc": "def"}], device="cpu" ) @@ -176,7 +176,7 @@ def test_add_documents_whitespace(self): {"title": "\r\r"}, {"title": "\r\t\n"}, ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=docs, device="cpu", tensor_fields=[] ) @@ -188,7 +188,7 @@ def test_add_documents_whitespace(self): assert count == len(docs) def test_add_docs_response_format(self): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[ @@ -252,7 +252,7 @@ def test_add_documents_validation(self): for use_existing_tensors_flag in (True, False): for bad_doc_arg in bad_doc_args: with self.subTest(msg=f'{bad_doc_arg} - use_existing_tensors={use_existing_tensors_flag}'): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=bad_doc_arg, use_existing_tensors=use_existing_tensors_flag, device="cpu", @@ -285,7 +285,7 @@ def test_add_documents_id_validation(self): for use_existing_tensors_flag in (True, False): for bad_doc_arg in bad_doc_args: with self.subTest(f'{bad_doc_arg} - use_existing_tensors={use_existing_tensors_flag}'): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=bad_doc_arg[0], use_existing_tensors=use_existing_tensors_flag, device="cpu", tensor_fields=["title"] @@ -307,7 +307,7 @@ def test_add_documents_list_success(self): [{"_id": "to_fail_123", "tags": ["wow", "this", "is"]}] ] for bad_doc_arg in good_docs: - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=bad_doc_arg, @@ -327,7 +327,7 @@ def test_add_documents_list_data_type_validation(self): bad_doc_args = self.tags_ for bad_doc_arg in bad_doc_args: with self.subTest(bad_doc_arg): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=bad_doc_arg, @@ -347,7 +347,7 @@ def test_add_documents_set_device(self): @mock.patch("marqo.s2_inference.s2_inference.vectorise", mock_vectorise) def run(): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, device="cuda:22", docs=[{"title": "doc"}, {"title": "doc"}], tensor_fields=["title"] @@ -364,7 +364,7 @@ def test_add_documents_empty(self): Adding empty documents raises BadRequestError """ try: - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( 
index_name=self.default_text_index, docs=[], device="cpu") @@ -383,8 +383,8 @@ def test_add_documents_id_image_url(self): ] with mock.patch('PIL.Image.open') as mock_image_open: - self.add_documents_and_refresh_index(config=self.config, - add_docs_params=AddDocsParams( + self.add_documents(config=self.config, + add_docs_params=AddDocsParams( index_name=self.default_image_index, docs=docs, device="cpu", tensor_fields=["title"] )) @@ -433,7 +433,7 @@ def test_add_documents_resilient_doc_validation(self): ] for docs, expected_results in docs_results: with self.subTest(f'{expected_results}'): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=docs, device="cpu", tensor_fields=[] @@ -450,7 +450,7 @@ def test_add_documents_resilient_doc_validation(self): def test_add_document_with_tensor_fields(self): """Ensure tensor_fields only works for title but not desc""" docs_ = [{"_id": "789", "title": "Story of Alice Appleseed", "desc": "Alice grew up in Houston, Texas."}] - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=docs_, device="cpu", tensor_fields=["title"] )) resp = tensor_search.get_document_by_id(config=self.config, @@ -468,7 +468,7 @@ def test_doc_too_large(self): @mock.patch.dict(os.environ, {**os.environ, **mock_environ}) def run(): - update_res = self.add_documents_and_refresh_index( + update_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[ {"_id": "123", 'desc': "edf " * (max_size // 4)}, @@ -493,7 +493,7 @@ def test_doc_too_large_single_doc(self): @mock.patch.dict(os.environ, {**os.environ, **mock_environ}) def run(): - update_res = self.add_documents_and_refresh_index( + update_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[ {"_id": "123", 'desc': "edf " * (max_size // 4)}, @@ -516,7 +516,7 @@ def test_doc_too_large_none_env_var(self): for env_dict in [dict()]: @mock.patch.dict(os.environ, {**os.environ, **env_dict}) def run(): - update_res = self.add_documents_and_refresh_index( + update_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[ {"_id": "123", 'desc': "Some content"}, @@ -536,13 +536,13 @@ def test_remove_tensor_field(self): If a document is re-indexed with a tensor field removed, the vectors are removed """ # test replace and update workflows - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": "123", "title": "mydata", "desc": "mydata2"}], index_name=self.default_text_index, device="cpu", tensor_fields=["title"] ) ) - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": "123", "desc": "mydata"}], @@ -572,7 +572,7 @@ def test_add_documents_exceeded_max_doc_count(self): if error: with self.assertRaises(BadRequestError): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[{ @@ -584,7 +584,7 @@ def test_add_documents_exceeded_max_doc_count(self): ) else: self.assertEqual(False, - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, 
add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[{ @@ -600,7 +600,7 @@ def test_no_tensor_field_on_empty_ix(self): """ If a document is indexed with no tensor fields on an empty index, no vectors are added """ - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": "123", "desc": "mydata"}], index_name=self.default_text_index, @@ -616,7 +616,7 @@ def test_index_doc_on_empty_ix(self): """ If a document is indexed with a tensor field vectors are added for the tensor field """ - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": "123", "title": "mydata", "desc": "mydata"}], index_name=self.default_text_index, tensor_fields=["title"], @@ -661,7 +661,7 @@ def _check_get_docs(doc_count, title_value): for c in doc_counts: self.clear_index_by_name(self.image_index_with_random_model) - res1 = self.add_documents_and_refresh_index( + res1 = self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": str(doc_num), @@ -690,7 +690,7 @@ def test_bad_tensor_fields(self): for tensor_fields, error_message, msg in test_cases: with self.subTest(msg): with self.assertRaises(BadRequestError) as e: - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams(index_name=self.default_text_index, docs=[{"some": "data"}], **tensor_fields)) @@ -715,7 +715,7 @@ def test_supported_large_integer_and_float_number(self): for doc, error in test_case: with self.subTest(): - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[doc], device="cpu", tensor_fields=[] @@ -741,8 +741,8 @@ def test_duplicate_ids_behaviour(self): for documents, number_of_docs, msg in test_cases: self.clear_index_by_name(self.default_text_index) with self.subTest(msg): - r = self.add_documents_and_refresh_index(config=self.config, - add_docs_params=AddDocsParams( + r = self.add_documents(config=self.config, + add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=documents, device="cpu", tensor_fields=["text_field"] )).dict(exclude_none=True, by_alias=True) diff --git a/tests/tensor_search/integ_tests/test_add_documents_semi_structured_add_fields.py b/tests/tensor_search/integ_tests/test_add_documents_semi_structured_add_fields.py index 78362abb1..ba3a1a77b 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_semi_structured_add_fields.py +++ b/tests/tensor_search/integ_tests/test_add_documents_semi_structured_add_fields.py @@ -63,7 +63,7 @@ def tearDown(self) -> None: self.device_patcher.stop() def _add_and_get_doc(self, index_name: str, doc_id: str, tensor_fields: List[str], use_existing_tensors=False): - add_doc_result = self.add_documents_and_refresh_index( + add_doc_result = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, docs=[{ @@ -139,7 +139,7 @@ def test_add_documents_should_add_string_fields_as_lexical_fields(self): self.assertEqual(1, len(res['hits'])) def test_add_documents_should_add_custom_vector_field_content_as_lexical_fields(self): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.text_index_3, docs=[{ @@ -168,7 +168,7 @@ def test_add_documents_should_add_custom_vector_field_content_as_lexical_fields( self.assertIn('marqo__lexical_custom_vector_field', 
updated_index.lexical_field_map.keys()) def test_add_documents_should_add_image_field_as_lexical_fields(self): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.image_index_with_chunking, docs=[{ @@ -193,7 +193,7 @@ def test_add_documents_should_add_image_field_as_lexical_fields(self): self.assertIn('marqo__lexical_image_field', updated_index.lexical_field_map.keys()) def test_add_documents_should_add_multimodal_subfield_as_lexical_fields(self): - add_doc_result = self.add_documents_and_refresh_index( + add_doc_result = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.text_index_4, docs=[{ @@ -292,8 +292,6 @@ def test_add_documents_should_allow_the_same_field_to_have_different_types_in_di ) ) - index_meta_cache.get_index(config=self.config, index_name=self.text_index_5, force_refresh=True) - res = tensor_search.search( text="content", search_method=SearchMethod.TENSOR, config=self.config, index_name=self.text_index_5, diff --git a/tests/tensor_search/integ_tests/test_add_documents_structured.py b/tests/tensor_search/integ_tests/test_add_documents_structured.py index 5c25fcca1..fc8e96a6b 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_structured.py +++ b/tests/tensor_search/integ_tests/test_add_documents_structured.py @@ -196,7 +196,7 @@ def test_add_plain_id_field(self): ] for index_name, desc in tests: with self.subTest(desc): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, docs=[{ @@ -248,7 +248,7 @@ def test_boolean_field(self): for index_name, desc in test_indexes: for test_case in test_cases: with self.subTest(test_case[0] + ' - ' + desc): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, docs=[ @@ -271,7 +271,7 @@ def test_add_documents_dupe_ids(self): """ # Add once to get vectors - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[{ @@ -285,7 +285,7 @@ def test_add_documents_dupe_ids(self): config=self.config, index_name=self.index_name_1, document_id="1", show_vectors=True)['_tensor_facets'] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[ @@ -297,7 +297,7 @@ def test_add_documents_dupe_ids(self): device="cpu" ) ) - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[ @@ -325,7 +325,7 @@ def test_add_documents_with_missing_index_fails(self): rand_index = 'a' + str(uuid.uuid4()).replace('-', '') with pytest.raises(IndexNotFoundError): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=rand_index, docs=[{"abc": "def"}], auto_refresh=True, device="cpu" ) @@ -344,7 +344,7 @@ def test_add_documents_whitespace(self): {"title": "\r\r"}, {"title": "\r\t\n"}, ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=docs, device="cpu" ) @@ -356,7 +356,7 @@ def test_add_documents_whitespace(self): assert count == len(docs) def test_add_docs_response_format(self): - add_res = self.add_documents_and_refresh_index( + add_res = 
self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[ @@ -421,7 +421,7 @@ def test_add_documents_validation(self): for use_existing_tensors_flag in (True, False): for bad_doc_arg in bad_doc_args: with self.subTest(f'{bad_doc_arg} - use_existing_tensors={use_existing_tensors_flag}'): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=bad_doc_arg, use_existing_tensors=use_existing_tensors_flag, device="cpu" @@ -453,7 +453,7 @@ def test_add_documents_id_validation(self): for use_existing_tensors_flag in (True, False): for bad_doc_arg in bad_doc_args: with self.subTest(f'{bad_doc_arg} - use_existing_tensors={use_existing_tensors_flag}'): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=bad_doc_arg[0], use_existing_tensors=use_existing_tensors_flag, device="cpu" @@ -474,7 +474,7 @@ def test_add_documents_list_success(self): [{"_id": "to_fail_123", "tags": ["wow", "this", "is"]}] ] for bad_doc_arg in good_docs: - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=bad_doc_arg, @@ -492,7 +492,7 @@ def test_add_documents_list_data_type_validation(self): ] for bad_doc_arg in bad_doc_args: with self.subTest(bad_doc_arg): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=bad_doc_arg, @@ -511,7 +511,7 @@ def test_add_documents_set_device(self): @mock.patch("marqo.s2_inference.s2_inference.vectorise", mock_vectorise) def run(): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, device="cuda:22", docs=[{"title": "doc"}, {"title": "doc"}], @@ -528,7 +528,7 @@ def test_add_documents_empty(self): Adding empty documents raises BadRequestError """ try: - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[], device="cpu") @@ -547,8 +547,8 @@ def test_add_documents_id_image_url(self): ] with mock.patch('PIL.Image.open') as mock_image_open: - self.add_documents_and_refresh_index(config=self.config, - add_docs_params=AddDocsParams( + self.add_documents(config=self.config, + add_docs_params=AddDocsParams( index_name=self.index_name_img_no_chunking, docs=docs, device="cpu", )) @@ -598,7 +598,7 @@ def test_add_documents_resilient_doc_validation(self): ] for docs, expected_results in docs_results: with self.subTest(f'{expected_results}'): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=docs, device="cpu" @@ -614,7 +614,7 @@ def test_add_documents_resilient_doc_validation(self): def test_add_document_with_tensor_fields(self): docs_ = [{"_id": "789", "title": "Story of Alice Appleseed", "desc": "Alice grew up in Houston, Texas."}] - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=docs_, device="cpu" )) resp = 
tensor_search.get_document_by_id(config=self.config, index_name=self.index_name_1, document_id="789", @@ -631,7 +631,7 @@ def test_doc_too_large(self): @mock.patch.dict(os.environ, {**os.environ, **mock_environ}) def run(): - update_res = self.add_documents_and_refresh_index( + update_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[ {"_id": "123", 'desc': "edf " * (max_size // 4)}, @@ -656,7 +656,7 @@ def test_doc_too_large_single_doc(self): @mock.patch.dict(os.environ, {**os.environ, **mock_environ}) def run(): - update_res = self.add_documents_and_refresh_index( + update_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[ {"_id": "123", 'desc': "edf " * (max_size // 4)}, @@ -679,7 +679,7 @@ def test_doc_too_large_none_env_var(self): for env_dict in [dict()]: @mock.patch.dict(os.environ, {**os.environ, **env_dict}) def run(): - update_res = self.add_documents_and_refresh_index( + update_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[ {"_id": "123", 'desc': "Some content"}, @@ -710,7 +710,7 @@ def test_add_documents_exceeded_max_doc_count(self): if error: with self.assertRaises(BadRequestError): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[{ @@ -721,7 +721,7 @@ def test_add_documents_exceeded_max_doc_count(self): ) else: self.assertEqual(False, - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[{ @@ -737,13 +737,13 @@ def test_remove_tensor_field(self): If a document is re-indexed with a tensor field removed, the vectors are removed """ # test replace and update workflows - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": "123", "title": "mydata", "desc": "mydata2"}], index_name=self.index_name_1, device="cpu" ) ) - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": "123", "desc": "mydata"}], @@ -761,7 +761,7 @@ def test_no_tensor_field_on_empty_ix(self): """ If a document is indexed with no tensor fields on an empty index, no vectors are added """ - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": "123", "desc": "mydata"}], index_name=self.index_name_1, @@ -779,7 +779,7 @@ def test_index_doc_on_empty_ix(self): If a document is indexed with a tensor field and a non-tensor field on an empty index, vectors are added for the tensor field """ - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": "123", "title": "mydata", "desc": "mydata"}], index_name=self.index_name_1, @@ -825,7 +825,7 @@ def _check_get_docs(doc_count, title_value): for c in doc_counts: self.clear_index_by_name(self.index_name_img_random) - res1 = self.add_documents_and_refresh_index( + res1 = self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": str(doc_num), @@ -881,7 +881,7 @@ def test_add_long_double_numeric_values(self): for doc, error, msg in test_case: with self.subTest(msg): - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[doc], 
device="cpu", ) @@ -910,7 +910,7 @@ def test_long_double_numeric_values_edge_case(self): for doc, expected_doc, msg in test_case: with self.subTest(msg): - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[doc], device="cpu", ) @@ -945,7 +945,7 @@ def test_add_documents_nonImageContentForAnImageField(self): ] - r = self.add_documents_and_refresh_index( + r = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_img_no_chunking, docs=documents ) diff --git a/tests/tensor_search/integ_tests/test_add_documents_unstructured.py b/tests/tensor_search/integ_tests/test_add_documents_unstructured.py index ed111bf8f..87ce9b282 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_unstructured.py +++ b/tests/tensor_search/integ_tests/test_add_documents_unstructured.py @@ -85,7 +85,7 @@ def test_add_plain_id_field(self): ] for index_name, desc in tests: with self.subTest(desc): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[{ @@ -114,7 +114,7 @@ def test_add_documents_dupe_ids(self): """ # Add once to get vectors - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[{ @@ -128,7 +128,7 @@ def test_add_documents_dupe_ids(self): config=self.config, index_name=self.default_text_index, document_id="1", show_vectors=True)['_tensor_facets'] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[ @@ -140,7 +140,7 @@ def test_add_documents_dupe_ids(self): device="cpu", tensor_fields=["title"] ) ) - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[ @@ -168,7 +168,7 @@ def test_add_documents_with_missing_index_fails(self): rand_index = 'a' + str(uuid.uuid4()).replace('-', '') with pytest.raises(IndexNotFoundError): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=rand_index, docs=[{"abc": "def"}], device="cpu" ) @@ -187,7 +187,7 @@ def test_add_documents_whitespace(self): {"title": "\r\r"}, {"title": "\r\t\n"}, ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=docs, device="cpu", tensor_fields=[] ) @@ -199,7 +199,7 @@ def test_add_documents_whitespace(self): assert count == len(docs) def test_add_docs_response_format(self): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[ @@ -263,7 +263,7 @@ def test_add_documents_validation(self): for use_existing_tensors_flag in (True, False): for bad_doc_arg in bad_doc_args: with self.subTest(msg=f'{bad_doc_arg} - use_existing_tensors={use_existing_tensors_flag}'): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=bad_doc_arg, use_existing_tensors=use_existing_tensors_flag, device="cpu", @@ -296,7 +296,7 @@ def test_add_documents_id_validation(self): for use_existing_tensors_flag in 
(True, False): for bad_doc_arg in bad_doc_args: with self.subTest(f'{bad_doc_arg} - use_existing_tensors={use_existing_tensors_flag}'): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=bad_doc_arg[0], use_existing_tensors=use_existing_tensors_flag, device="cpu", tensor_fields=["title"] @@ -318,7 +318,7 @@ def test_add_documents_list_success(self): [{"_id": "to_fail_123", "tags": ["wow", "this", "is"]}] ] for bad_doc_arg in good_docs: - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=bad_doc_arg, @@ -338,7 +338,7 @@ def test_add_documents_list_data_type_validation(self): bad_doc_args = self.tags_ for bad_doc_arg in bad_doc_args: with self.subTest(bad_doc_arg): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=bad_doc_arg, @@ -358,7 +358,7 @@ def test_add_documents_set_device(self): @mock.patch("marqo.s2_inference.s2_inference.vectorise", mock_vectorise) def run(): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, device="cuda:22", docs=[{"title": "doc"}, {"title": "doc"}], tensor_fields=["title"] @@ -375,7 +375,7 @@ def test_add_documents_empty(self): Adding empty documents raises BadRequestError """ try: - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[], device="cpu") @@ -394,8 +394,8 @@ def test_add_documents_id_image_url(self): ] with mock.patch('PIL.Image.open') as mock_image_open: - self.add_documents_and_refresh_index(config=self.config, - add_docs_params=AddDocsParams( + self.add_documents(config=self.config, + add_docs_params=AddDocsParams( index_name=self.default_image_index, docs=docs, device="cpu", tensor_fields=["title"] )) @@ -444,7 +444,7 @@ def test_add_documents_resilient_doc_validation(self): ] for docs, expected_results in docs_results: with self.subTest(f'{expected_results}'): - add_res = self.add_documents_and_refresh_index( + add_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=docs, device="cpu", tensor_fields=[] @@ -461,7 +461,7 @@ def test_add_documents_resilient_doc_validation(self): def test_add_document_with_tensor_fields(self): """Ensure tensor_fields only works for title but not desc""" docs_ = [{"_id": "789", "title": "Story of Alice Appleseed", "desc": "Alice grew up in Houston, Texas."}] - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=docs_, device="cpu", tensor_fields=["title"] )) resp = tensor_search.get_document_by_id(config=self.config, @@ -479,7 +479,7 @@ def test_doc_too_large(self): @mock.patch.dict(os.environ, {**os.environ, **mock_environ}) def run(): - update_res = self.add_documents_and_refresh_index( + update_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[ {"_id": "123", 'desc': "edf " * (max_size // 4)}, @@ -504,7 +504,7 @@ def test_doc_too_large_single_doc(self): 
@mock.patch.dict(os.environ, {**os.environ, **mock_environ}) def run(): - update_res = self.add_documents_and_refresh_index( + update_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[ {"_id": "123", 'desc': "edf " * (max_size // 4)}, @@ -527,7 +527,7 @@ def test_doc_too_large_none_env_var(self): for env_dict in [dict()]: @mock.patch.dict(os.environ, {**os.environ, **env_dict}) def run(): - update_res = self.add_documents_and_refresh_index( + update_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[ {"_id": "123", 'desc': "Some content"}, @@ -547,13 +547,13 @@ def test_remove_tensor_field(self): If a document is re-indexed with a tensor field removed, the vectors are removed """ # test replace and update workflows - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": "123", "title": "mydata", "desc": "mydata2"}], index_name=self.default_text_index, device="cpu", tensor_fields=["title"] ) ) - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": "123", "desc": "mydata"}], @@ -583,7 +583,7 @@ def test_add_documents_exceeded_max_doc_count(self): if error: with self.assertRaises(BadRequestError): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[{ @@ -595,7 +595,7 @@ def test_add_documents_exceeded_max_doc_count(self): ) else: self.assertEqual(False, - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[{ @@ -611,7 +611,7 @@ def test_no_tensor_field_on_empty_ix(self): """ If a document is indexed with no tensor fields on an empty index, no vectors are added """ - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": "123", "desc": "mydata"}], index_name=self.default_text_index, @@ -627,7 +627,7 @@ def test_index_doc_on_empty_ix(self): """ If a document is indexed with a tensor field, vectors are added for the tensor field """ - self.add_documents_and_refresh_index( + self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": "123", "title": "mydata", "desc": "mydata"}], index_name=self.default_text_index, tensor_fields=["title"], @@ -672,7 +672,7 @@ def _check_get_docs(doc_count, title_value): for c in doc_counts: self.clear_index_by_name(self.image_index_with_random_model) - res1 = self.add_documents_and_refresh_index( + res1 = self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[{"_id": str(doc_num), @@ -701,7 +701,7 @@ def test_bad_tensor_fields(self): for tensor_fields, error_message, msg in test_cases: with self.subTest(msg): with self.assertRaises(BadRequestError) as e: - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams(index_name=self.default_text_index, docs=[{"some": "data"}], **tensor_fields)) @@ -726,7 +726,7 @@ def test_supported_large_integer_and_float_number(self): for doc, error in test_case: with self.subTest(): - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[doc], device="cpu", tensor_fields=[] @@ -752,8 +752,8 @@ def
test_duplicate_ids_behaviour(self): for documents, number_of_docs, msg in test_cases: self.clear_index_by_name(self.default_text_index) with self.subTest(msg): - r = self.add_documents_and_refresh_index(config=self.config, - add_docs_params=AddDocsParams( + r = self.add_documents(config=self.config, + add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=documents, device="cpu", tensor_fields=["text_field"] )).dict(exclude_none=True, by_alias=True) diff --git a/tests/tensor_search/integ_tests/test_custom_vector_field.py b/tests/tensor_search/integ_tests/test_custom_vector_field.py index f929ae71e..05d1df8f5 100644 --- a/tests/tensor_search/integ_tests/test_custom_vector_field.py +++ b/tests/tensor_search/integ_tests/test_custom_vector_field.py @@ -141,7 +141,7 @@ def test_add_documents_with_custom_vector_field(self): @mock.patch("marqo.vespa.vespa_client.VespaClient.feed_batch", mock_feed_batch) def run(): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{ @@ -198,7 +198,7 @@ def test_add_documents_with_custom_vector_field_no_content(self): @mock.patch("marqo.vespa.vespa_client.VespaClient.feed_batch", mock_feed_batch) def run(): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{ @@ -267,7 +267,7 @@ def test_add_documents_with_different_field_types(self): @mock.patch("marqo.vespa.vespa_client.VespaClient.feed_batch", mock_feed_batch) def run(): - add_docs_res = self.add_documents_and_refresh_index( + add_docs_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{ @@ -328,7 +328,7 @@ def test_add_documents_use_existing_tensors_with_custom_vector_field(self): for index in self.indexes: with self.subTest(f"Index: {index.name}, type: {index.type}"): # If we change the custom vector, doc should change - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{ @@ -348,7 +348,7 @@ def test_add_documents_use_existing_tensors_with_custom_vector_field(self): config=self.config, index_name=index.name, document_id="0", show_vectors=True) - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{ @@ -374,7 +374,7 @@ def test_add_documents_use_existing_tensors_with_custom_vector_field(self): assert get_doc_2[enums.TensorField.tensor_facets][0][enums.TensorField.embedding] == self.random_vector_2 # If we do not, it should remain the same, no errors - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{ @@ -404,7 +404,7 @@ def test_get_document_with_custom_vector_field(self): """ for index in self.indexes: with self.subTest(f"Index: {index.name}, type: {index.type}"): - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{ @@ -441,7 +441,7 @@ def test_get_documents_with_custom_vector_field(self): """ for index in self.indexes: with self.subTest(f"Index: {index.name}, type: {index.type}"): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[ @@ -534,7 +534,7 @@ def test_invalid_custom_vector_field_content(self): 
for case in test_cases: with self.subTest(f"Case: {case}"): with self.assertRaises(pydantic.ValidationError): - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{ @@ -553,7 +553,7 @@ def test_search_with_custom_vector_field(self): """ for index in self.indexes: with self.subTest(f"Index: {index.name}, type: {index.type}"): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[ @@ -615,7 +615,7 @@ def test_lexical_search_with_custom_vector_field(self): """ for index in self.indexes: with self.subTest(f"Index: {index.name}, type: {index.type}"): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[ @@ -673,7 +673,7 @@ def test_search_with_custom_vector_field_score_modifiers(self): # Using another field as score modifier on a custom vector: for index in self.indexes: with self.subTest(f"Index: {index.name}, type: {index.type}"): - add_docs_res = self.add_documents_and_refresh_index( + add_docs_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[ @@ -740,7 +740,7 @@ def test_search_with_custom_vector_field_filter_string(self): for index in self.indexes: with self.subTest(f"Index: {index.name}, type: {index.type}"): - add_docs_res = self.add_documents_and_refresh_index( + add_docs_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[ @@ -852,7 +852,7 @@ def test_search_with_custom_vector_field_searchable_attributes(self): if isinstance(index, UnstructuredMarqoIndex): break - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[ @@ -919,7 +919,7 @@ def test_lexical_search_with_custom_vector_field_searchable_attributes(self): # Skip this test for unstructured indexes. 
if isinstance(index, UnstructuredMarqoIndex): break - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[ @@ -1006,7 +1006,7 @@ def test_custom_vector_subfield_of_multimodal_should_fail_unstructured(self): for index in [self.unstructured_custom_index, self.unstructured_custom_index]: with self.subTest(f"Index: {index.name}, type: {index.type}"): - add_docs_res = self.add_documents_and_refresh_index( + add_docs_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.unstructured_custom_index.name, docs=[ @@ -1055,7 +1055,7 @@ def test_search_with_custom_vector_field_boosting(self): }, } - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[ @@ -1246,7 +1246,7 @@ def test_add_documents_with_custom_vector_normalize_embeddings_true(self): @mock.patch("marqo.vespa.vespa_client.VespaClient.feed_batch", mock_feed_batch) def run(): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{ @@ -1306,7 +1306,7 @@ def test_add_documents_with_custom_vector_zero_vector_normalize_embeddings_true( ], errors=False) - add_documents_response = self.add_documents_and_refresh_index( + add_documents_response = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{ @@ -1349,7 +1349,7 @@ def test_search_with_custom_vector_field_normalize_embeddings_true(self): """ for index in [self.structured_custom_index, self.semi_structured_custom_index]: with self.subTest(f"Index: {index.name}, type: {index.type}"): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[ diff --git a/tests/tensor_search/integ_tests/test_delete_documents.py b/tests/tensor_search/integ_tests/test_delete_documents.py index 61a4fff55..585d9d061 100644 --- a/tests/tensor_search/integ_tests/test_delete_documents.py +++ b/tests/tensor_search/integ_tests/test_delete_documents.py @@ -52,7 +52,7 @@ def tearDown(self) -> None: def test_delete_documents(self): for index in self.indexes: with self.subTest(f"Index type: {index.type}. Index name: {index.name}"): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -67,7 +67,7 @@ def test_delete_documents(self): count0_res = self.monitoring.get_index_stats_by_name(index.name).number_of_documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -92,7 +92,7 @@ def test_delete_documents(self): def test_delete_docs_format(self): for index in self.indexes: with self.subTest(f"Index type: {index.type}. Index name: {index.name}"): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -118,7 +118,7 @@ def test_delete_docs_format(self): def test_only_specified_documents_are_deleted(self): for index in self.indexes: with self.subTest(f"Index type: {index.type}. 
Index name: {index.name}"): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -152,7 +152,7 @@ def test_only_specified_documents_are_deleted(self): def test_delete_multiple_documents(self): for index in self.indexes: with self.subTest(f"Index type: {index.type}. Index name: {index.name}"): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -184,7 +184,7 @@ def test_delete_multiple_documents(self): def test_document_is_actually_deleted(self): for index in self.indexes: with self.subTest(f"Index type: {index.type}. Index name: {index.name}"): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -206,7 +206,7 @@ def test_document_is_actually_deleted(self): def test_multiple_documents_are_actually_deleted(self): for index in self.indexes: with self.subTest(f"Index type: {index.type}. Index name: {index.name}"): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -274,7 +274,7 @@ def test_delete_already_deleted_document(self): for index in self.indexes: with self.subTest(f"Index type: {index.type}. Index name: {index.name}"): # Add a document - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -308,7 +308,7 @@ def test_delete_documents_mixed_valid_invalid_ids(self): for index in self.indexes: with self.subTest(f"Index type: {index.type}. Index name: {index.name}"): # Add a document - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, diff --git a/tests/tensor_search/integ_tests/test_dict_score_modifiers.py b/tests/tensor_search/integ_tests/test_dict_score_modifiers.py index 804e85a3e..6f404a728 100644 --- a/tests/tensor_search/integ_tests/test_dict_score_modifiers.py +++ b/tests/tensor_search/integ_tests/test_dict_score_modifiers.py @@ -84,7 +84,7 @@ def test_double_score_modifier(self): for index in [self.structured_default_text_index]: with self.subTest(index=index.type): # Add documents - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -118,7 +118,7 @@ def test_long_score_modifier(self): for index in [self.structured_default_text_index]: with self.subTest(index=index.type): # Add documents - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -152,7 +152,7 @@ def test_add_to_score_map_score_modifier(self): for index in [self.structured_default_text_index, self.unstructured_default_text_index]: with self.subTest(index=index.type): # Add documents - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -194,7 +194,7 @@ def test_multiply_score_by_map_score_modifier(self): for index in [self.structured_default_text_index, self.unstructured_default_text_index]: with self.subTest(index=index.type): # Add documents - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -238,7 +238,7 @@ def 
test_combined_map_score_modifier(self): for index in [self.structured_default_text_index, self.unstructured_default_text_index]: with self.subTest(index=index.type): # Add documents - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -283,7 +283,7 @@ def test_partial_document_update(self): for index in [self.structured_default_text_index]: with self.subTest(index=index.type): # Add documents - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -345,7 +345,7 @@ def test_long_dict_score_modifier(self): for index in [self.structured_default_text_index, self.unstructured_default_text_index]: with self.subTest(index=index.type): # Add documents - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -391,7 +391,7 @@ def test_unstructured_unsupported_map_error(self): for index in [self.unstructured_default_text_index]: with self.subTest(index=index.type): # Add documents - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -429,7 +429,7 @@ def test_unstructured_wrong_map_numerical_format(self): for index in [self.unstructured_default_text_index]: with self.subTest(index=index.type): # Add documents - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -496,7 +496,7 @@ def test_unstructured_map_numerical_as_custom_vector(self): with self.subTest(index=index.type): # Add documents with self.assertRaises(ValidationError): - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, diff --git a/tests/tensor_search/integ_tests/test_embed.py b/tests/tensor_search/integ_tests/test_embed.py index 7af67fb80..a971ced5d 100644 --- a/tests/tensor_search/integ_tests/test_embed.py +++ b/tests/tensor_search/integ_tests/test_embed.py @@ -311,7 +311,7 @@ def test_embed_equivalent_to_add_docs(self): """ for index in [self.unstructured_default_text_index, self.structured_default_text_index]: with self.subTest(index=index.type): - add_docs_res = self.add_documents_and_refresh_index( + add_docs_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[ @@ -356,7 +356,7 @@ def test_embed_equivalent_to_add_docs_with_prefix(self): """ for index in [self.unstructured_default_text_index, self.structured_default_text_index]: with self.subTest(index=index.type): - add_docs_res = self.add_documents_and_refresh_index( + add_docs_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[ diff --git a/tests/tensor_search/integ_tests/test_get_document.py b/tests/tensor_search/integ_tests/test_get_document.py index 81daf565c..3edc8225a 100644 --- a/tests/tensor_search/integ_tests/test_get_document.py +++ b/tests/tensor_search/integ_tests/test_get_document.py @@ -47,7 +47,7 @@ def test_get_document(self): """Also ensures that the _id is returned""" for index in self.indexes: with self.subTest(f"Index type: {index.type}. 
Index name: {index.name}"): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams(index_name=index.name, docs=[ { "_id": "123", @@ -101,11 +101,11 @@ def test_get_document_vectors_format(self): keys = ("title1", "desc2") vals = ("content 1", "content 2. blah blah blah") - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{"_id": "123", **dict(zip(keys, vals))}], auto_refresh=True, device="cpu", tensor_fields=["title1", "desc2"] if isinstance(index, UnstructuredMarqoIndex) else None) - ) + ) res = tensor_search.get_document_by_id( config=self.config, index_name=index.name, diff --git a/tests/tensor_search/integ_tests/test_get_documents_by_ids.py b/tests/tensor_search/integ_tests/test_get_documents_by_ids.py index 1f61cf11b..e8937be9c 100644 --- a/tests/tensor_search/integ_tests/test_get_documents_by_ids.py +++ b/tests/tensor_search/integ_tests/test_get_documents_by_ids.py @@ -59,7 +59,7 @@ def test_get_documents_by_ids(self): {"_id": "1", "title1": "content 1"}, {"_id": "2", "title1": "content 2"}, {"_id": "3", "title1": "content 3"} ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams(index_name=index.name, docs=docs, device="cpu", tensor_fields=["title1", "desc2"] if isinstance(index, UnstructuredMarqoIndex) else None) @@ -83,7 +83,7 @@ def test_get_documents_vectors_format(self): for index in self.indexes: with self.subTest(f"Index type: {index.type}. Index name: {index.name}"): - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[dict(zip(k, v)) for k, v in zip(keys, vals)], device="cpu", tensor_fields=["title1", "desc2"] if isinstance(index, UnstructuredMarqoIndex) else None)) @@ -126,13 +126,13 @@ def test_get_document_vectors_non_existent(self): def test_get_document_vectors_resilient(self): for index in self.indexes: with self.subTest(f"Index type: {index.type}. 
Index name: {index.name}"): - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[ {"_id": '456', "title1": "alexandra"}, {'_id': '221', 'desc2': 'hello'}], device="cpu", tensor_fields=["title1", "desc2"] if isinstance(index, UnstructuredMarqoIndex) else None) - ) + ) id_reqs = [ (['123', '456'], [False, True]), ([['456', '789'], [True, False]]), ([['456', '789', '221'], [True, False, True]]), ([['vkj', '456', '4891'], [False, True, False]]) diff --git a/tests/tensor_search/integ_tests/test_hybrid_search.py b/tests/tensor_search/integ_tests/test_hybrid_search.py index 495092e44..7b0f781b8 100644 --- a/tests/tensor_search/integ_tests/test_hybrid_search.py +++ b/tests/tensor_search/integ_tests/test_hybrid_search.py @@ -213,7 +213,7 @@ def pass_through_query(*arg, **kwargs): if isinstance(index, UnstructuredMarqoIndex): # this is required to create the tensor fields in the semi-structured index - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -446,7 +446,7 @@ def pass_through_query(*arg, **kwargs): mock_vespa_client_query = unittest.mock.MagicMock() mock_vespa_client_query.side_effect = pass_through_query - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.semi_structured_index_with_no_model.name, @@ -579,7 +579,7 @@ def test_hybrid_search_disjunction_rrf_zero_alpha_same_as_lexical(self): self.unstructured_default_text_index]: with self.subTest(index=index.name): # Add documents - add_docs_res = self.add_documents_and_refresh_index( + add_docs_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -631,7 +631,7 @@ def test_hybrid_search_disjunction_rrf_one_alpha_same_as_tensor(self): self.unstructured_default_text_index]: with self.subTest(index=index.name): # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -671,13 +671,12 @@ def test_hybrid_search_disjunction_rrf_one_alpha_same_as_tensor(self): def test_hybrid_search_searchable_attributes(self): """ Tests that searchable attributes work as expected for all methods - TODO: Add unstructured index once searchable attributes are supported """ for index in [self.structured_text_index_score_modifiers, self.semi_structured_default_text_index]: with self.subTest(index=index.name): # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -742,8 +741,8 @@ def test_hybrid_search_searchable_attributes(self): self.assertIn("hits", hybrid_res) self.assertEqual(len(hybrid_res["hits"]), 3) # Only 3 documents have text field 2. Tensor retrieval will get them all. 
self.assertEqual(hybrid_res["hits"][0]["_id"], "doc12") - self.assertEqual(hybrid_res["hits"][1]["_id"], "doc11") - self.assertEqual(hybrid_res["hits"][2]["_id"], "doc13") + # doc11 and doc13 have score 0, so their order is non-deterministic + self.assertSetEqual({'doc11', 'doc13'}, {hit["_id"] for hit in hybrid_res["hits"][1:]}) def test_hybrid_search_score_modifiers(self): """ @@ -754,7 +753,7 @@ def test_hybrid_search_score_modifiers(self): self.unstructured_default_text_index]: with self.subTest(index=index.name): # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -907,7 +906,7 @@ def test_hybrid_search_lexical_tensor_with_lexical_score_modifiers_succeeds(self self.unstructured_default_text_index]: with self.subTest(index=type(index)): # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -974,7 +973,7 @@ def test_hybrid_search_same_retrieval_and_ranking_matches_original_method(self): self.unstructured_default_text_index]: with self.subTest(index=index.name): # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -1028,7 +1027,7 @@ def test_hybrid_search_with_filter(self): self.unstructured_default_text_index]: with self.subTest(index=index.name): # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -1075,7 +1074,7 @@ def test_hybrid_search_with_images(self): self.unstructured_default_image_index]: with self.subTest(index=index.name): # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -1142,7 +1141,7 @@ def test_hybrid_search_structured_opposite_retrieval_and_ranking(self): """ # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.structured_text_index_score_modifiers.name, @@ -1254,7 +1253,7 @@ def test_hybrid_search_semi_structured_opposite_retrieval_and_ranking(self): """ # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.semi_structured_default_image_index.name, @@ -1367,7 +1366,7 @@ def test_hybrid_search_highlights_for_lexical_tensor(self): with self.subTest(msg=f'{index.type}', index=index): # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -1753,7 +1752,7 @@ def test_hybrid_search_none_query_with_context_vectors_passes(self): if isinstance(index, UnstructuredMarqoIndex) else None, mappings={"custom_field_1": {"type": "custom_vector"}} \ if isinstance(index, UnstructuredMarqoIndex) else None) - _ = self.add_documents_and_refresh_index(config=self.config, - add_docs_params=add_docs_params) + _ = self.add_documents(config=self.config, + add_docs_params=add_docs_params) r = tensor_search.search(config=self.config, index_name=index.name, text=None, search_method="hybrid", diff --git a/tests/tensor_search/integ_tests/test_no_model.py b/tests/tensor_search/integ_tests/test_no_model.py index 0d699bd9b..2ac1818c7 100644 --- a/tests/tensor_search/integ_tests/test_no_model.py +++
b/tests/tensor_search/integ_tests/test_no_model.py @@ -119,7 +119,7 @@ def test_no_model_in_add_documents_error(self): index_name == self.unstructured_index_with_no_model else None mappings = {"custom_field_1": {"type": "custom_vector"}} if \ index_name == self.unstructured_index_with_no_model else None - r = self.add_documents_and_refresh_index( + r = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, @@ -183,8 +183,8 @@ def test_no_model_work_with_context_vectors_in_search(self): docs=docs, tensor_fields=tensor_fields, mappings=mappings) - _ = self.add_documents_and_refresh_index(config=self.config, - add_docs_params=add_docs_params) + _ = self.add_documents(config=self.config, + add_docs_params=add_docs_params) r = tensor_search.search(config=self.config, index_name=index_name, text=None, search_method="tensor", diff --git a/tests/tensor_search/integ_tests/test_search_combined.py b/tests/tensor_search/integ_tests/test_search_combined.py index 25eff2566..e5e26674f 100644 --- a/tests/tensor_search/integ_tests/test_search_combined.py +++ b/tests/tensor_search/integ_tests/test_search_combined.py @@ -211,7 +211,7 @@ def test_search_video(self): ] for index in [self.unstructured_languagebind_index, self.structured_languagebind_index]: with self.subTest(index=index.type): - response = self.add_documents_and_refresh_index( + response = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -246,7 +246,7 @@ def test_search_audio(self): ] for index in [self.unstructured_languagebind_index, self.structured_languagebind_index]: with self.subTest(index=index.type): - response = self.add_documents_and_refresh_index( + response = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -272,7 +272,7 @@ def test_search_audio(self): def test_filtering_list_case_tensor(self): for index in [self.unstructured_default_text_index, self.structured_default_text_index]: with self.subTest(type=index.type): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -320,7 +320,7 @@ def test_filtering_list_case_tensor(self): def test_filtering_list_case_lexical(self): for index in [self.unstructured_default_text_index, self.structured_default_text_index]: with self.subTest(index=index.type): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -368,7 +368,7 @@ def test_filtering_list_case_image(self): for index in [self.unstructured_default_image_index, self.structured_default_image_index]: with self.subTest(index=index): hippo_img = TestImageUrls.HIPPO_REALISTIC.value - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -421,7 +421,7 @@ def test_filtering(self): for index in [self.unstructured_default_text_index, self.structured_default_text_index]: with self.subTest(index=index.type): # Add documents first - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -554,7 +554,7 @@ def test_filter_id(self): """ for index in [self.unstructured_default_text_index, self.structured_default_text_index]: with self.subTest(index=index.type): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( 
index_name=index.name, @@ -606,7 +606,7 @@ def test_filter_spaced_fields(self): for index in [self.unstructured_default_text_index, self.structured_default_text_index]: with self.subTest(index=index.type): # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -652,7 +652,7 @@ def test_filtering_bad_syntax(self): for index in [self.unstructured_default_text_index, self.structured_default_text_index]: with self.subTest(index=index): # Adding documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -699,7 +699,7 @@ def test_filtering_in_with_wrong_type(self): for index in [self.structured_default_text_index]: with self.subTest(index=index.type): # Adding documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -754,7 +754,7 @@ def test_empty_lexical_query(self): """ for index in [self.structured_default_text_index, self.unstructured_default_text_index]: with self.subTest(index=index.type): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -784,7 +784,7 @@ def test_wildcard_lexical_query(self): """ for index in [self.structured_default_text_index, self.unstructured_default_text_index]: with self.subTest(index=index.type): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -839,7 +839,7 @@ def test_LexicalSearchResultsScore(self): with self.subTest(msg=index.type): tensor_fields = ["text_field_1", "text_field_2"] \ if isinstance(index, UnstructuredMarqoIndex) else None - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, diff --git a/tests/tensor_search/integ_tests/test_search_semi_structured.py b/tests/tensor_search/integ_tests/test_search_semi_structured.py index 030bbb3b8..8f06f02e2 100644 --- a/tests/tensor_search/integ_tests/test_search_semi_structured.py +++ b/tests/tensor_search/integ_tests/test_search_semi_structured.py @@ -89,8 +89,8 @@ def test_each_doc_returned_once(self): ] for index_name, desc in tests: with self.subTest(desc): - self.add_documents_and_refresh_index(config=self.config, - add_docs_params=AddDocsParams( + self.add_documents(config=self.config, + add_docs_params=AddDocsParams( index_name=index_name, docs=[ {"abc": "Exact match hehehe efgh ", "other_field": "baaadd efgh ", @@ -100,7 +100,7 @@ def test_each_doc_returned_once(self): ], tensor_fields=["abc", "other_field", "finally"], ) - ) + ) search_res = tensor_search._vector_text_search( config=self.config, index_name=index_name, @@ -178,7 +178,7 @@ def test_vector_search_long_query_string(self): The editor-in-chief Katharine Viner succeeded Alan Rusbridger in 2015.[10][11] Since 2018, the paper's main newsprint sections have been published in tabloid format. As of July 2021, its print edition had a daily circulation of 105,134.[4] The newspaper has an online edition, TheGuardian.com, as well as two international websites, Guardian Australia (founded in 2013) and Guardian US (founded in 2011). 
The paper's readership is generally on the mainstream left of British political opinion,[12][13][14][15] and the term "Guardian reader" is used to imply a stereotype of liberal, left-wing or "politically correct" views.[3] Frequent typographical errors during the age of manual typesetting led Private Eye magazine to dub the paper the "Grauniad" in the 1960s, a nickname still used occasionally by the editors for self-mockery.[16] """ - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -199,7 +199,7 @@ def test_vector_search_long_query_string(self): def test_search_edge_case(self): """We ran into bugs with this doc""" - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[ @@ -229,7 +229,7 @@ def test_search_format(self): """Is the result formatted correctly?""" q = "Exact match hehehe" - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -282,7 +282,7 @@ def test_search_format_empty(self): assert search_res["limit"] > 0 def test_result_count_validation(self): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -326,7 +326,7 @@ def test_result_count_validation(self): assert len(search_res['hits']) >= 1 def test_highlights_tensor(self): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -351,7 +351,7 @@ def test_highlights_tensor(self): assert "_highlights" not in hit def test_highlights_lexical(self): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, docs=[ @@ -377,7 +377,7 @@ def test_highlights_lexical(self): def test_search_int_field(self): """doesn't error out if there is a random int field""" - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -396,7 +396,7 @@ def test_search_int_field(self): assert len(s_res["hits"]) > 0 def test_filtering_list_case_tensor(self): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -438,7 +438,7 @@ def test_filtering_list_case_tensor(self): assert len(res_should_only_match_keyword_good["hits"]) == 1 def test_filtering_list_case_lexical(self): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -471,7 +471,7 @@ def test_filtering_list_case_lexical(self): def test_filtering_list_case_image(self): hippo_img = TestImageUrls.HIPPO_REALISTIC.value - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_image_index, @@ -504,7 +504,7 @@ def test_filtering_list_case_image(self): def test_filtering(self): # TODO-Li Add support for filter on Bool # Add documents first (assuming add_docs_caller is a method to add documents) - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -550,7 +550,7 @@ 
def test_filtering_string_boolean_and_real_boolean_fields(self): "bool_field_1": False, "bool_field_2": True, "text_field_3": "search me"}, ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -589,7 +589,7 @@ def test_filtering_string_boolean_and_real_boolean_fields(self): def test_filter_spaced_fields(self): # Add documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -623,7 +623,7 @@ def test_filter_spaced_fields(self): def test_filtering_bad_syntax(self): # Adding documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -673,7 +673,7 @@ def run(): assert kwargs["device"] == "cuda:123" def test_search_other_types_subsearch(self): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -703,7 +703,7 @@ def test_search_other_types_top_search(self): "some_str": "blah" }] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -725,7 +725,7 @@ def test_lexical_filtering(self): # Adding documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -791,7 +791,7 @@ def test_lexical_filtering(self): def test_filter_on_id_and_more(self): """Test various filtering scenarios including _id and other conditions""" # Adding documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -850,7 +850,7 @@ def test_attributes_to_retrieve(self): (None, {"field_1", "field_2", "random_field", "random_lala", "marqomarqo", "_id", "_score", "_highlights"}), ) - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -879,7 +879,7 @@ def test_limit_results(self): batch_size_list = [50, 50, 28] # We add 128 documents to the index with batch_size 50, 50, 28 to avoid timeout for batch_size in batch_size_list: - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -962,7 +962,7 @@ def test_image_search_highlights(self): {"_id": "789", "image_field": url_2}, ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_image_index, @@ -987,7 +987,7 @@ def test_multi_search(self): {"field_a": "Construction and scaffolding equipment", "_id": 'irrelevant_doc'} ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -1025,7 +1025,7 @@ def test_multi_search_images(self): "_id": 'artefact_hippo' } ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_image_index, @@ -1071,7 +1071,7 @@ def test_multi_search_images_invalid_queries(self): } ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config,
add_docs_params=AddDocsParams( index_name=self.default_image_index, @@ -1081,7 +1081,7 @@ def test_multi_search_images_invalid_queries(self): ) invalid_queries = [{}, None, {123: 123}, {'123': None}, - {"https://marqo_not_real.com/image_1.png": 3}, set()] + {"https://marqo-not-real.com/image_1.png": 3}, set()] for q in invalid_queries: with self.subTest(f"query={q}"): with self.assertRaises((ValidationError, errors.InvalidArgError)) as e: @@ -1102,7 +1102,7 @@ def test_multi_search_images_edge_cases(self): "_id": 'artefact_hippo' } ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_image_index, @@ -1128,7 +1128,7 @@ def test_multi_search_images_lexical(self): {"field_a": "Some text about a weird forest", "_id": 'artefact_hippo'} ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -1163,7 +1163,7 @@ def test_image_search(self): docs = list(doc_dict.values()) - res = self.add_documents_and_refresh_index( + res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_image_index, @@ -1192,7 +1192,7 @@ def test_lexical_search_no_highlights_format(self): {"_id": "2", "text_field_1": "some code", "text_field_2": "match", "int_field_1": 2}, ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -1218,7 +1218,7 @@ def test_tensor_search_highlights_format(self): {"_id": "2", "text_field_1": "some code", "text_field_2": "match", "int_field_1": 2}, ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -1256,7 +1256,7 @@ def test_filter_on_large_integer_and_float(self): # large negative float {'double_field_1': -9999999999.87675, '_id': '7', "search_field": "some text"} ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -1288,7 +1288,7 @@ def test_search_with_content_double_colon(self): docs = [ {"_id": "1", "text_field": "::my_text"} # This should work properly ] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -1336,7 +1336,7 @@ def test_search_returned_documents(self): for document, msg in [full_fields_document, partial_fields_document, no_field_documents]: with self.subTest(msg): self.clear_index_by_name(self.default_text_index) - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, diff --git a/tests/tensor_search/integ_tests/test_search_structured.py b/tests/tensor_search/integ_tests/test_search_structured.py index 9a2e0f6b5..f99fa7141 100644 --- a/tests/tensor_search/integ_tests/test_search_structured.py +++ b/tests/tensor_search/integ_tests/test_search_structured.py @@ -157,7 +157,7 @@ def test_each_doc_returned_once(self): ] for index_name, desc in tests: with self.subTest(desc): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, @@ -234,7 +234,7 @@ def test_vector_search_long_query_string(self): The editor-in-chief Katharine Viner succeeded Alan Rusbridger in 2015.[10][11] Since 2018, the 
paper's main newsprint sections have been published in tabloid format. As of July 2021, its print edition had a daily circulation of 105,134.[4] The newspaper has an online edition, TheGuardian.com, as well as two international websites, Guardian Australia (founded in 2013) and Guardian US (founded in 2011). The paper's readership is generally on the mainstream left of British political opinion,[12][13][14][15] and the term "Guardian reader" is used to imply a stereotype of liberal, left-wing or "politically correct" views.[3] Frequent typographical errors during the age of manual typesetting led Private Eye magazine to dub the paper the "Grauniad" in the 1960s, a nickname still used occasionally by the editors for self-mockery.[16] """ - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -256,7 +256,7 @@ def test_vector_search_long_query_string(self): def test_search_edge_case(self): """We ran into bugs with this doc""" - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -286,7 +286,7 @@ def test_search_format(self): """Is the result formatted correctly?""" q = "Exact match hehehe" - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -340,7 +340,7 @@ def test_search_format_empty(self): assert search_res["limit"] > 0 def test_result_count_validation(self): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -412,7 +412,7 @@ def test_result_count_validation(self): assert len(search_res['hits']) >= 1 def test_highlights_tensor(self): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -437,7 +437,7 @@ def test_highlights_tensor(self): assert "_highlights" not in hit def test_highlights_lexical(self): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -463,7 +463,7 @@ def test_highlights_lexical(self): def test_search_int_field(self): """doesn't error out if there is a random int field""" - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -515,7 +515,7 @@ def run(): assert kwargs["device"] == "cuda:123" def test_search_other_types_subsearch(self): - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -544,7 +544,7 @@ def test_search_other_types_top_search(self): "text_field_1": "blah" }] - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -566,7 +566,7 @@ def test_search_other_types_top_search(self): def test_lexical_filtering(self): # Adding documents - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.default_text_index, @@ -684,7 +684,7 @@ def test_attributes_to_retrieve(self): # "_highlights"}), ) - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( 
                 index_name=self.default_text_index,
@@ -708,7 +708,7 @@ def test_limit_results(self):
         vocab_source = "https://www.mit.edu/~ecprice/wordlist.10000"
 
         vocab = requests.get(vocab_source).text.splitlines()
-        res = self.add_documents_and_refresh_index(
+        res = self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.image_index_with_random_model,
@@ -795,7 +795,7 @@ def test_image_search_highlights(self):
             {"_id": "123", "image_field_1": url_1, "text_field_1": "irrelevant text"},
             {"_id": "789", "image_field_1": url_2},
         ]
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_image_index,
@@ -819,7 +819,7 @@ def test_multi_search(self):
             {"text_field_1": "Construction and scaffolding equipment", "_id": 'irrelevant_doc'}
         ]
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -856,7 +856,7 @@ def test_multi_search_images(self):
                 "image_field_2": TestImageUrls.HIPPO_STATUE.value
             }
         ]
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_image_index,
@@ -900,7 +900,7 @@ def test_multi_search_images_invalid_queries(self):
             }
         ]
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_image_index,
@@ -909,7 +909,7 @@ def test_multi_search_images_invalid_queries(self):
         )
 
         invalid_queries = [{}, None, {123: 123}, {'123': None},
-                           {"https://marqo_not_real.com/image_1.png": 3}, set()]
+                           {"https://marqo-not-real.com/image_1.png": 3}, set()]
         for q in invalid_queries:
             with self.subTest(f"query={q}"):
                 with self.assertRaises((ValidationError, errors.InvalidArgError)) as e:
@@ -931,7 +931,7 @@ def test_multi_search_images_edge_cases(self):
                 "text_field_1": "Some text about a weird forest"
             }
         ]
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_image_index,
@@ -955,7 +955,7 @@ def test_multi_search_images_lexical(self):
             {"_id": 'realistic_hippo', "image_field_1": "124"},
             {"_id": 'artefact_hippo', "text_field_1": "Some text about a weird forest"}
         ]
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -983,7 +983,7 @@ def test_image_search(self):
 
         docs = list(doc_dict.values())
 
-        res = self.add_documents_and_refresh_index(
+        res = self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_image_index,
@@ -1012,7 +1012,7 @@ def test_lexical_search_no_highlights_format(self):
         ]
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -1039,7 +1039,7 @@ def test_tensor_search_highlights_format(self):
             {"_id": "2", "text_field_1": "some code", "text_field_2": "match", "int_field_1": 2},
         ]
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -1076,7 +1076,7 @@ def test_filter_on_large_integer_and_float(self):
             # large negative float
             {'double_field_1': -9999999999.87675, '_id': '7', "text_field_1": "some text"}
         ]
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -1135,7 +1135,7 @@ def test_tensor_search_with_custom_vector_query(self):
         docs = [
             {"text_field_1": "some text", "text_field_2": "Close match hehehe", "int_field_1": 1},
         ] * 10
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
diff --git a/tests/tensor_search/integ_tests/test_search_unstructured.py b/tests/tensor_search/integ_tests/test_search_unstructured.py
index 37cbb0728..b7b9a46c0 100644
--- a/tests/tensor_search/integ_tests/test_search_unstructured.py
+++ b/tests/tensor_search/integ_tests/test_search_unstructured.py
@@ -104,8 +104,8 @@ def test_each_doc_returned_once(self):
         ]
         for index_name, desc in tests:
             with self.subTest(desc):
-                self.add_documents_and_refresh_index(config=self.config,
-                                                     add_docs_params=AddDocsParams(
+                self.add_documents(config=self.config,
+                                   add_docs_params=AddDocsParams(
                                        index_name=index_name,
                                        docs=[
                                            {"abc": "Exact match hehehe efgh ", "other_field": "baaadd efgh ",
@@ -115,7 +115,7 @@ def test_each_doc_returned_once(self):
                                        ],
                                        tensor_fields=["abc", "other_field", "finally"],
                                    )
-                                                     )
+                                   )
 
                 search_res = tensor_search._vector_text_search(
                     config=self.config, index_name=index_name,
@@ -193,7 +193,7 @@ def test_vector_search_long_query_string(self):
         The editor-in-chief Katharine Viner succeeded Alan Rusbridger in 2015.[10][11] Since 2018, the paper's main newsprint sections have been published in tabloid format. As of July 2021, its print edition had a daily circulation of 105,134.[4]
         The newspaper has an online edition, TheGuardian.com, as well as two international websites, Guardian Australia (founded in 2013) and Guardian US (founded in 2011). The paper's readership is generally on the mainstream left of British political opinion,[12][13][14][15] and the term "Guardian reader" is used to imply a stereotype of liberal, left-wing or "politically correct" views.[3] Frequent typographical errors during the age of manual typesetting led Private Eye magazine to dub the paper the "Grauniad" in the 1960s, a nickname still used occasionally by the editors for self-mockery.[16]
         """
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -214,7 +214,7 @@ def test_vector_search_long_query_string(self):
 
     def test_search_edge_case(self):
         """We ran into bugs with this doc"""
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.default_text_index, docs=[
@@ -244,7 +244,7 @@ def test_search_format(self):
         """Is the result formatted correctly?"""
         q = "Exact match hehehe"
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -297,7 +297,7 @@ def test_search_format_empty(self):
         assert search_res["limit"] > 0
 
     def test_result_count_validation(self):
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -341,7 +341,7 @@ def test_result_count_validation(self):
         assert len(search_res['hits']) >= 1
 
     def test_highlights_tensor(self):
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -366,7 +366,7 @@ def test_highlights_tensor(self):
             assert "_highlights" not in hit
 
     def test_highlights_lexical(self):
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.default_text_index, docs=[
@@ -392,7 +392,7 @@ def test_highlights_lexical(self):
 
     def test_search_int_field(self):
         """doesn't error out if there is a random int field"""
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -411,7 +411,7 @@ def test_search_int_field(self):
             assert len(s_res["hits"]) > 0
 
     def test_filtering_list_case_tensor(self):
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -453,7 +453,7 @@ def test_filtering_list_case_tensor(self):
         assert len(res_should_only_match_keyword_good["hits"]) == 1
 
     def test_filtering_list_case_lexical(self):
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -486,7 +486,7 @@ def test_filtering_list_case_lexical(self):
 
     def test_filtering_list_case_image(self):
         hippo_img = TestImageUrls.HIPPO_REALISTIC.value
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_image_index,
@@ -519,7 +519,7 @@ def test_filtering_list_case_image(self):
     def test_filtering(self):
         # TODO-Li Add support for filter on Bool
         # Add documents first (assuming add_docs_caller is a method to add documents)
-        res = self.add_documents_and_refresh_index(
+        res = self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -565,7 +565,7 @@ def test_filtering_string_boolean_and_real_boolean_fields(self):
              "bool_field_1": False, "bool_field_2": True, "text_field_3": "search me"},
         ]
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -604,7 +604,7 @@ def test_filtering_string_boolean_and_real_boolean_fields(self):
     def test_filter_spaced_fields(self):
         # Add documents
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -638,7 +638,7 @@ def test_filter_spaced_fields(self):
     def test_filtering_bad_syntax(self):
         # Adding documents
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -688,7 +688,7 @@ def run():
         assert kwargs["device"] == "cuda:123"
 
     def test_search_other_types_subsearch(self):
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -718,7 +718,7 @@ def test_search_other_types_top_search(self):
             "some_str": "blah"
         }]
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -740,7 +740,7 @@ def test_search_other_types_top_search(self):
     def test_lexical_filtering(self):
         # Adding documents
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -806,7 +806,7 @@ def test_lexical_filtering(self):
     def test_filter_on_id_and_more(self):
         """Test various filtering scenarios including _id and other conditions"""
         # Adding documents
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -865,7 +865,7 @@ def test_attributes_to_retrieve(self):
             (None, {"field_1", "field_2", "random_field", "random_lala", "marqomarqo",
                     "_id", "_score", "_highlights"}),
         )
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -894,7 +894,7 @@ def test_limit_results(self):
         batch_size_list = [50, 50, 28]  # We add 128 documents to the index wth batch_size 50, 50, 28 to avoid timeout
         for batch_size in batch_size_list:
-            res = self.add_documents_and_refresh_index(
+            res = self.add_documents(
                 config=self.config,
                 add_docs_params=AddDocsParams(
                     index_name=self.default_text_index,
@@ -977,7 +977,7 @@ def test_image_search_highlights(self):
             {"_id": "789", "image_field": url_2},
         ]
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_image_index,
@@ -1002,7 +1002,7 @@ def test_multi_search(self):
             {"field_a": "Construction and scaffolding equipment", "_id": 'irrelevant_doc'}
         ]
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -1040,7 +1040,7 @@ def test_multi_search_images(self):
                 "_id": 'artefact_hippo'
             }
         ]
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_image_index,
@@ -1086,7 +1086,7 @@ def test_multi_search_images_invalid_queries(self):
             }
         ]
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_image_index,
@@ -1096,7 +1096,7 @@ def test_multi_search_images_invalid_queries(self):
         )
 
         invalid_queries = [{}, None, {123: 123}, {'123': None},
-                           {"https://marqo_not_real.com/image_1.png": 3}, set()]
+                           {"https://marqo-not-real.com/image_1.png": 3}, set()]
         for q in invalid_queries:
             with self.subTest(f"query={q}"):
                 with self.assertRaises((ValidationError, errors.InvalidArgError)) as e:
@@ -1117,7 +1117,7 @@ def test_multi_search_images_edge_cases(self):
                 "_id": 'artefact_hippo'
             }
         ]
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_image_index,
@@ -1143,7 +1143,7 @@ def test_multi_search_images_lexical(self):
             {"field_a": "Some text about a weird forest", "_id": 'artefact_hippo'}
         ]
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -1178,7 +1178,7 @@ def test_image_search(self):
 
         docs = list(doc_dict.values())
 
-        res = self.add_documents_and_refresh_index(
+        res = self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_image_index,
@@ -1207,7 +1207,7 @@ def test_lexical_search_no_highlights_format(self):
             {"_id": "2", "text_field_1": "some code", "text_field_2": "match", "int_field_1": 2},
         ]
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -1233,7 +1233,7 @@ def test_tensor_search_highlights_format(self):
             {"_id": "2", "text_field_1": "some code", "text_field_2": "match", "int_field_1": 2},
         ]
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -1271,7 +1271,7 @@ def test_filter_on_large_integer_and_float(self):
             # large negative float
             {'double_field_1': -9999999999.87675, '_id': '7', "search_field": "some text"}
         ]
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -1303,7 +1303,7 @@ def test_search_with_content_double_colon(self):
         docs = [
             {"_id": "1", "text_field": "::my_text"}  # This should work properly
         ]
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
@@ -1351,7 +1351,7 @@ def test_search_returned_documents(self):
         for document, msg in [full_fields_document, partial_fields_document, no_field_documents]:
             with self.subTest(msg):
                 self.clear_index_by_name(self.default_text_index)
-                self.add_documents_and_refresh_index(
+                self.add_documents(
                     config=self.config,
                     add_docs_params=AddDocsParams(
                         index_name=self.default_text_index,
diff --git a/tests/tensor_search/test_add_documents_use_existing_tensors.py b/tests/tensor_search/test_add_documents_use_existing_tensors.py
index 6da1a28b2..cd9ea8e88 100644
--- a/tests/tensor_search/test_add_documents_use_existing_tensors.py
+++ b/tests/tensor_search/test_add_documents_use_existing_tensors.py
@@ -71,7 +71,7 @@ def test_use_existing_tensor_no_change(self):
 
         with mock.patch.object(s2_inference, 'vectorise', side_effect=original_vectorise) as mock_vectorise:
-            self.add_documents_and_refresh_index(
+            self.add_documents(
                 config=self.config,
                 add_docs_params=AddDocsParams(
                     index_name=index_name,
@@ -85,7 +85,7 @@ def test_use_existing_tensor_no_change(self):
 
         with mock.patch.object(s2_inference, 'vectorise', side_effect=original_vectorise) as mock_vectorise:
-            self.add_documents_and_refresh_index(
+            self.add_documents(
                 config=self.config,
                 add_docs_params=AddDocsParams(
                     index_name=index_name,
@@ -125,7 +125,7 @@ def test_use_existing_tensor_new_fields(self):
 
         with mock.patch.object(s2_inference, 'vectorise', side_effect=original_vectorise) as mock_vectorise:
-            self.add_documents_and_refresh_index(
+            self.add_documents(
                 config=self.config,
                 add_docs_params=AddDocsParams(
                     index_name=index_name,
@@ -139,7 +139,7 @@ def test_use_existing_tensor_new_fields(self):
 
         with mock.patch.object(s2_inference, 'vectorise', side_effect=original_vectorise) as mock_vectorise:
-            self.add_documents_and_refresh_index(
+            self.add_documents(
                 config=self.config,
                 add_docs_params=AddDocsParams(
                     index_name=index_name,
@@ -183,7 +183,7 @@ def test_use_existing_tensor_multimodal_no_change(self):
 
         with mock.patch.object(s2_inference, 'vectorise', side_effect=original_vectorise) as mock_vectorise:
-            self.add_documents_and_refresh_index(
+            self.add_documents(
                 config=self.config,
                 add_docs_params=AddDocsParams(
                     index_name=index_name,
@@ -198,7 +198,7 @@ def test_use_existing_tensor_multimodal_no_change(self):
 
         with mock.patch.object(s2_inference, 'vectorise', side_effect=original_vectorise) as mock_vectorise:
-            self.add_documents_and_refresh_index(
+            self.add_documents(
                 config=self.config,
                 add_docs_params=AddDocsParams(
                     index_name=index_name,
@@ -249,7 +249,7 @@ def test_use_existing_tensor_multimodal_added(self):
 
         with mock.patch.object(s2_inference, 'vectorise', side_effect=original_vectorise) as mock_vectorise:
-            self.add_documents_and_refresh_index(
+            self.add_documents(
                 config=self.config,
                 add_docs_params=AddDocsParams(
                     index_name=index_name,
@@ -262,7 +262,7 @@ def test_use_existing_tensor_multimodal_added(self):
 
         with mock.patch.object(s2_inference, 'vectorise', side_effect=original_vectorise) as mock_vectorise:
-            self.add_documents_and_refresh_index(
+            self.add_documents(
                 config=self.config,
                 add_docs_params=AddDocsParams(
                     index_name=index_name,
@@ -314,7 +314,7 @@ def test_use_existing_tensor_multimodal_changed(self):
 
         with mock.patch.object(s2_inference, 'vectorise', side_effect=original_vectorise) as mock_vectorise:
-            self.add_documents_and_refresh_index(
+            self.add_documents(
                 config=self.config,
                 add_docs_params=AddDocsParams(
                     index_name=index_name,
@@ -329,7 +329,7 @@ def test_use_existing_tensor_multimodal_changed(self):
 
         with mock.patch.object(s2_inference, 'vectorise', side_effect=original_vectorise) as mock_vectorise:
-            self.add_documents_and_refresh_index(
+            self.add_documents(
                 config=self.config,
                 add_docs_params=AddDocsParams(
                     index_name=index_name,
@@ -361,7 +361,7 @@ def test_use_existing_tensors_resilience(self):
             "desc 2": "content 2. blah blah blah"
         }
         # 1 valid ID doc:
-        res = self.add_documents_and_refresh_index(
+        res = self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1,
                                                               docs=[d1, {'_id': 1224}, {"_id": "fork", "abc": "123"}],
                                                               auto_refresh=True, use_existing_tensors=True,
@@ -369,7 +369,7 @@ def test_use_existing_tensors_resilience(self):
         assert [item['status'] for item in res['items']] == [201, 400, 201]
 
         # no valid IDs
-        res_no_valid_id = self.add_documents_and_refresh_index(
+        res_no_valid_id = self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1,
                                                               docs=[d1, {'_id': 1224}, d1],
                                                               auto_refresh=True, use_existing_tensors=True, device="cpu"))
@@ -384,11 +384,11 @@ def test_use_existing_tensors_no_id(self):
             "title 1": "content 1",
             "desc 2": "content 2. blah blah blah"
         }
-        r1 = self.add_documents_and_refresh_index(
+        r1 = self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, docs=[d1],
                                                               auto_refresh=True, use_existing_tensors=True,
                                                               device="cpu"))
-        r2 = self.add_documents_and_refresh_index(
+        r2 = self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, docs=[d1, d1],
                                                               auto_refresh=True, use_existing_tensors=True,
                                                               device="cpu"))
@@ -403,7 +403,7 @@ def test_use_existing_tensors_non_existing(self):
         """check parity between a doc created with and without use_existing_tensors, then overwritten,
         for a newly created doc.
         """
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "123",
@@ -418,7 +418,7 @@ def test_use_existing_tensors_non_existing(self):
 
         tensor_search.delete_index(config=self.config, index_name=self.index_name_1)
         tensor_search.create_vector_index(config=self.config, index_name=self.index_name_1)
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "123",
@@ -430,7 +430,7 @@ def test_use_existing_tensors_non_existing(self):
                                                        document_id="123", show_vectors=True)
         self.assertEqual(use_existing_tensors_doc, regular_doc)
 
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "123",
@@ -449,7 +449,7 @@ def test_use_existing_tensors_dupe_ids(self):
         Should only use the latest inserted ID. Make sure it doesn't get the first/middle one
         """
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "3",
@@ -464,7 +464,7 @@ def test_use_existing_tensors_dupe_ids(self):
 
         tensor_search.delete_index(config=self.config, index_name=self.index_name_1)
         tensor_search.create_vector_index(config=self.config, index_name=self.index_name_1)
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "1",
@@ -490,7 +490,7 @@ def test_use_existing_tensors_dupe_ids(self):
 
         self.assertEqual(doc_3_solo, doc_3_duped)
 
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "1",
@@ -526,7 +526,7 @@ def test_use_existing_tensors_retensorize_fields(self):
         They should still have no tensors.
         """
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "123",
@@ -541,7 +541,7 @@ def test_use_existing_tensors_retensorize_fields(self):
                                                   document_id="123", show_vectors=True)
         assert len(d1["_tensor_facets"]) == 0
 
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "123",
@@ -563,7 +563,7 @@ def test_use_existing_tensors_getting_non_tensorised(self):
         When we insert the doc again, with use_existing_tensors, because the content
         hasn't changed, we use the existing (non-existent) vectors
         """
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "123",
@@ -576,7 +576,7 @@ def test_use_existing_tensors_getting_non_tensorised(self):
         assert len(d1["_tensor_facets"]) == 1
         assert "title 1" in d1["_tensor_facets"][0]
 
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "123",
@@ -589,7 +589,7 @@ def test_use_existing_tensors_getting_non_tensorised(self):
         self.assertEqual(d1["_tensor_facets"], d2["_tensor_facets"])
 
         # The only field is a non-tensor field. This makes a chunkless doc.
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "999",
@@ -600,7 +600,7 @@ def test_use_existing_tensors_getting_non_tensorised(self):
                                                   document_id="999", show_vectors=True)
         assert len(d1["_tensor_facets"]) == 0
 
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "999",
@@ -615,7 +615,7 @@ def test_use_existing_tensors_getting_non_tensorised(self):
 
     def test_use_existing_tensors_check_updates(self):
         """ Check to see if the document has been appropriately updated """
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "123",
@@ -635,7 +635,7 @@ def pass_through_vectorise(*arg, **kwargs):
 
         @unittest.mock.patch("marqo.s2_inference.s2_inference.vectorise", mock_vectorise)
         def run():
-            self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+            self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, docs=[
                     {
                         "_id": "123",
@@ -661,7 +661,7 @@ def test_use_existing_tensors_check_meta_data(self):
 
         Checks chunk meta data and vectors are as expected
         """
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "123",
@@ -685,7 +685,7 @@ def test_use_existing_tensors_check_meta_data(self):
             "fl": 101.3,
             "new_bool": False
         }
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, docs=[{"_id": "123", **use_existing_tensor_doc}],
                 auto_refresh=True, non_tensor_fields=["2nd-non-tensor-field", "field_to_be_list", 'new_field_list'],
@@ -714,7 +714,7 @@ def test_use_existing_tensors_check_meta_data(self):
 
     @unittest.skip
     def test_use_existing_tensors_check_meta_data_mappings(self):
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {
                     "_id": "123",
@@ -738,7 +738,7 @@ def test_use_existing_tensors_check_meta_data_mappings(self):
             "fl": 101.3,
             "new_bool": False
         }
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(index_name=self.index_name_1,
                                           docs=[{"_id": "123", **use_existing_tensor_doc}],
                                           auto_refresh=True,
@@ -781,7 +781,7 @@ def test_use_existing_tensors_long_strings_and_images(self):
             index_name=self.index_name_2, index_settings=index_settings, config=self.config)
         hippo_img = TestImageUrls.HIPPO_REALISTIC.value
         artefact_hippo_img = TestImageUrls.HIPPO_STATUE.value
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_2, docs=[
                 {
                     "_id": "123",
@@ -820,7 +820,7 @@ def run():
                 "fl": 3.5,
                 "non-tensor-field": ["it", "is", "9", "o clock"]
             }
-            self.add_documents_and_refresh_index(
+            self.add_documents(
                 config=self.config,
                 add_docs_params=AddDocsParams(index_name=self.index_name_2,
                                               docs=[{"_id": "123", **use_existing_tensor_doc}],
                                               auto_refresh=True,
@@ -900,7 +900,7 @@ def test_use_existing_tensors_all_data_types(self):
 
         for doc_arg in doc_args:
             # Add doc normally without use_existing_tensors
-            add_res = self.add_documents_and_refresh_index(
+            add_res = self.add_documents(
                 config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, docs=doc_arg,
                                                                   auto_refresh=True, device="cpu"))
@@ -909,7 +909,7 @@ def test_use_existing_tensors_all_data_types(self):
                                                          document_ids=[doc["_id"] for doc in doc_arg],
                                                          show_vectors=True)
             # Then replace doc with use_existing_tensors
-            add_res = self.add_documents_and_refresh_index(
+            add_res = self.add_documents(
                 config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, docs=doc_arg,
                                                                   auto_refresh=True, use_existing_tensors=True,
                                                                   device="cpu"))
diff --git a/tests/tensor_search/test_context_vectors_search.py b/tests/tensor_search/test_context_vectors_search.py
index 8a28438f8..e3163ba69 100644
--- a/tests/tensor_search/test_context_vectors_search.py
+++ b/tests/tensor_search/test_context_vectors_search.py
@@ -90,12 +90,12 @@ def test_search_score(self):
         """Test to ensure that the score is the same for the same query with different context vectors combinations."""
         for index_name in [self.structured_index_with_random_model, self.unstructured_index_with_random_model]:
             tensor_fields = ["text_field_1"] if index_name == self.unstructured_index_with_random_model else None
-            self.add_documents_and_refresh_index(config=self.config, add_docs_params=
+            self.add_documents(config=self.config, add_docs_params=
                 AddDocsParams(index_name=index_name,
                               docs=[{"text_field_1": "A rider", "_id": "1"}],
                               tensor_fields=tensor_fields
                               )
-            )
+                               )
             with self.subTest(msg=index_name):
                 query = {
                     "A rider is riding a horse jumping over the barrier": 1,
diff --git a/tests/tensor_search/test_default_device.py b/tests/tensor_search/test_default_device.py
index 92af2885f..15e8e94a2 100644
--- a/tests/tensor_search/test_default_device.py
+++ b/tests/tensor_search/test_default_device.py
@@ -98,7 +98,7 @@ def test_add_documents_defaults_to_best_available_device(self):
             with patch.dict("marqo.tensor_search.utils.os.environ", {EnvVars.MARQO_BEST_AVAILABLE_DEVICE: best_available_device}),\
                 patch("marqo.tensor_search.utils.check_device_is_available", return_value=True) as mock_check_device_is_available,\
                 patch("marqo.s2_inference.s2_inference.vectorise", return_value=dummy_vector) as mock_vectorise:
-                self.add_documents_and_refresh_index(
+                self.add_documents(
                     config=self.config,
                     add_docs_params=AddDocsParams(**AddDocsParams_kwargs)
                 )
@@ -119,7 +119,7 @@ def test_add_documents_fails_with_no_default(self):
 
         for AddDocsParams_kwargs in AddDocsParams_kwargs_list:
             try:
-                self.add_documents_and_refresh_index(
+                self.add_documents(
                     config=self.config,
                     add_docs_params=AddDocsParams(**AddDocsParams_kwargs)
                 )
@@ -139,7 +139,7 @@ def test_add_document_uses_set_device(self):
         for explicitly_set_device in devices_list:
             with patch("marqo.s2_inference.s2_inference.vectorise", return_value=dummy_vector) as mock_vectorise,\
                 patch("marqo.tensor_search.models.add_docs_objects.get_best_available_device") as mock_get_best_available_device:
-                self.add_documents_and_refresh_index(
+                self.add_documents(
                     config=self.config,
                     add_docs_params=AddDocsParams(index_name=self.index_name_1,
                                                   docs=[{"Title": "blah"} for _ in range(5)],
@@ -171,9 +171,9 @@ def run():
             mocks = [patcher.start() for patcher in patchers]
 
             # Add docs
-            self.add_documents_and_refresh_index(config=self.config, add_docs_params = AddDocsParams(
+            self.add_documents(config=self.config, add_docs_params = AddDocsParams(
                 auto_refresh=True, device="cpu", index_name=self.index_name_1, docs=[{"test": "blah"}])
-            )
+                               )
 
             # Call search
             tensor_search.search(
@@ -215,9 +215,9 @@ def run():
             mocks = [patcher.start() for patcher in patchers]
 
             # Add docs
-            self.add_documents_and_refresh_index(config=self.config, add_docs_params = AddDocsParams(
+            self.add_documents(config=self.config, add_docs_params = AddDocsParams(
                 auto_refresh=True, device="cpu", index_name=self.index_name_1, docs=[{"test": "blah"}])
-            )
+                               )
 
             # Call search
             tensor_search.search(
@@ -246,9 +246,9 @@ def test_search_fails_with_no_default(self):
         """
         self.assertNotIn("MARQO_BEST_AVAILABLE_DEVICE", os.environ)
         # Add docs
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params = AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params = AddDocsParams(
             auto_refresh=True, device="cpu", index_name=self.index_name_1, docs=[{"test": "blah"}])
-        )
+                           )
 
         try:
             # Call search
@@ -287,12 +287,12 @@ def run():
                 mock_obj.return_value = self.mock_bulk_vector_text_search_results
 
                 # Add docs
-                self.add_documents_and_refresh_index(config=self.config, add_docs_params = AddDocsParams(
+                self.add_documents(config=self.config, add_docs_params = AddDocsParams(
                     auto_refresh=True, device="cpu", index_name=self.index_name_1, docs=[
                         {"abc": "Exact match hehehe", "other field": "baaadd", "_id": "id1-first"},
                         {"abc": "random text", "other field": "Close match hehehe", "_id": "id1-second"}
                     ])
-                )
+                                   )
 
                 # Call bulk search
                 tensor_search.bulk_search(
@@ -351,12 +351,12 @@ def run():
                 mock_obj.return_value = self.mock_bulk_vector_text_search_results
 
                 # Add docs
-                self.add_documents_and_refresh_index(config=self.config, add_docs_params = AddDocsParams(
+                self.add_documents(config=self.config, add_docs_params = AddDocsParams(
                     auto_refresh=True, device="cpu", index_name=self.index_name_1, docs=[
                         {"abc": "Exact match hehehe", "other field": "baaadd", "_id": "id1-first"},
                         {"abc": "random text", "other field": "Close match hehehe", "_id": "id1-second"}
                     ])
-                )
+                                   )
 
                 # Call bulk search
                 tensor_search.bulk_search(
@@ -398,9 +398,9 @@ def test_bulk_search_fails_with_no_default(self):
         """
         self.assertNotIn("MARQO_BEST_AVAILABLE_DEVICE", os.environ)
         # Add docs
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params = AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params = AddDocsParams(
             auto_refresh=True, device="cpu", index_name=self.index_name_1, docs=[{"test": "blah"}])
-        )
+                           )
 
         try:
             # Call bulk search
diff --git a/tests/tensor_search/test_image_download_headers.py b/tests/tensor_search/test_image_download_headers.py
index 0e058e912..ea692be9e 100644
--- a/tests/tensor_search/test_image_download_headers.py
+++ b/tests/tensor_search/test_image_download_headers.py
@@ -63,7 +63,7 @@ def test_img_download_search(self):
             config=self.config, index_name=self.index_name_1, index_settings=self.image_index_settings()
         )
         image_download_headers = {"Authorization": "some secret key blah"}
-        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[
                 {"_id": "1", "image": self.real_img_url}],
             auto_refresh=True, image_download_headers=image_download_headers, device="cpu"))
@@ -105,7 +105,7 @@ def run():
             image_download_headers = {"Authorization": "some secret key blah"}
 
             # Add a document with an image URL
-            self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+            self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, docs=[
                     {"_id": "1", "image": self.real_img_url}
                 ], auto_refresh=True, image_download_headers=image_download_headers, device="cpu"
@@ -138,7 +138,7 @@ def pass_through_requests_get(url, *args, **kwargs):
         mock_load_image_from_path.side_effect = pass_through_load_image_from_path
 
         with unittest.mock.patch("marqo.s2_inference.clip_utils.load_image_from_path", mock_load_image_from_path):
-            self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+            self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, docs=[
                     {
                         "_id": "1",
diff --git a/tests/tensor_search/test_image_preprocessing.py b/tests/tensor_search/test_image_preprocessing.py
index 91a1bd298..3ecef81da 100644
--- a/tests/tensor_search/test_image_preprocessing.py
+++ b/tests/tensor_search/test_image_preprocessing.py
@@ -52,8 +52,8 @@ def test_image_preprocess_search_highlights_format(self):
             tensor_fields = None if index_name == self.structured_image_index else ["image_field_1"]
 
             with self.subTest(f"index_name = {index_name}"):
-                self.add_documents_and_refresh_index(config=self.config,
-                                                     add_docs_params=AddDocsParams(index_name=index_name,
+                self.add_documents(config=self.config,
+                                   add_docs_params=AddDocsParams(index_name=index_name,
                                                                  docs=documents,
                                                                  tensor_fields=tensor_fields))
 
                 search_result = tensor_search.search(config=self.config,
@@ -74,8 +74,8 @@ def test_image_preprocess_get_documents_format(self):
             tensor_fields = None if index_name == self.structured_image_index else ["image_field_1"]
 
             with self.subTest(f"index_name = {index_name}"):
-                self.add_documents_and_refresh_index(config=self.config,
-                                                     add_docs_params=AddDocsParams(index_name=index_name,
+                self.add_documents(config=self.config,
+                                   add_docs_params=AddDocsParams(index_name=index_name,
                                                                  docs=documents,
                                                                  tensor_fields=tensor_fields))
 
                 get_doc_result = tensor_search.get_document_by_id(config=self.config,
diff --git a/tests/tensor_search/test_index_meta_cache.py b/tests/tensor_search/test_index_meta_cache.py
index b30321281..636aeb2cf 100644
--- a/tests/tensor_search/test_index_meta_cache.py
+++ b/tests/tensor_search/test_index_meta_cache.py
@@ -87,11 +87,11 @@ def test_search_works_on_cache_clear(self):
         assert self.index_name_3 in index_meta_cache.get_cache()
 
     def test_add_new_fields_preserves_index_cache(self):
-        add_doc_res_1 = self.add_documents_and_refresh_index(
+        add_doc_res_1 = self.add_documents(
            config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, docs=[{"abc": "def"}],
                                                              auto_refresh=True, device="cpu")
        )
-        add_doc_res_2 = self.add_documents_and_refresh_index(
+        add_doc_res_2 = self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, docs=[{"cool field": "yep yep", "haha": "heheh"}],
                 auto_refresh=True, device="cpu"
@@ -100,7 +100,7 @@ def test_add_new_fields_preserves_index_cache(self):
         index_info_t0 = index_meta_cache.get_cache()[self.index_name_1]
         # reset cache:
         index_meta_cache.empty_cache()
-        add_doc_res_3 = self.add_documents_and_refresh_index(
+        add_doc_res_3 = self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1,
                 docs=[{"newer field": "ndewr content", "goblin": "paradise"}],
@@ -119,12 +119,12 @@ def test_add_new_fields_preserves_index_cache(self):
 
     def test_delete_removes_index_from_cache(self):
         """note the implicit index creation"""
-        add_doc_res_1 = self.add_documents_and_refresh_index(
+        add_doc_res_1 = self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, docs=[{"abc": "def"}], auto_refresh=True, device="cpu"
             )
         )
-        add_doc_res_2 = self.add_documents_and_refresh_index(
+        add_doc_res_2 = self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_2, docs=[{"abc": "def"}], auto_refresh=True, device="cpu"
             )
@@ -147,7 +147,7 @@ def test_lexical_search_caching(self):
         }
         d1 = {"some doc 1": "some 2 marqo", "field abc": "robodog is not a cat", "_id": "Jupyter_12"}
         d2 = {"exclude me": "marqo"}
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, auto_refresh=True, docs=[d0, d1, d2], device="cpu")
         )
@@ -167,7 +167,7 @@ def test_get_documents_caching(self):
         }
         d1 = {"some doc 1": "some 2 marqo", "field abc": "robodog is not a cat", "_id": "Jupyter_12"}
         d2 = {"exclude me": "marqo"}
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, auto_refresh=True, docs=[d0, d1, d2 ], device="cpu")
         )
@@ -201,7 +201,7 @@ def _simulate_externally_added_docs(self, index_name, docs, check_only_in_extern
         # save the state of the cache:
         cache_t0 = copy.deepcopy(index_meta_cache.get_cache())
         # mock external party indexing something:
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config,
             add_docs_params=AddDocsParams(index_name=index_name, docs=docs, auto_refresh=True, device="cpu"))
 
@@ -227,7 +227,7 @@ def test_search_lexical_externally_created_field(self):
         """ search (search_method=SearchMethod.lexical)
         after the first cache hit is empty, it should be updated.
         """
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1,
                                                               docs=[{"some field": "Plane 1"}],
                                                               auto_refresh=True, device="cpu"))
         self._simulate_externally_added_docs(
@@ -246,7 +246,7 @@ def test_search_lexical_externally_created_field(self):
 
     def test_search_vectors_externally_created_field(self):
         """ search (search_method=SearchMethod.chunk_embeddings)
         """
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, docs=[{"some field": "Plane 1"}], auto_refresh=True, device="cpu"))
         self._simulate_externally_added_docs(
@@ -263,7 +263,7 @@ def test_search_vectors_externally_created_field(self):
         assert result_2["hits"][0]["_id"] == "1234"
 
     def test_search_vectors_externally_created_field_attributes(self):
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1,
                                                               docs=[{"some field": "Plane 1"}],
                                                               auto_refresh=True, device="cpu"))
         self._simulate_externally_added_docs(
@@ -281,7 +281,7 @@ def test_search_lexical_externally_created_field_attributes(self):
         index_meta_cache.empty_cache()
         tensor_search.create_vector_index(
             config=self.config, index_name=self.index_name_3)
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1,
                                                               docs=[{"some field": "Plane 1"}],
                                                               auto_refresh=True, device="cpu"))
         self._simulate_externally_added_docs(
@@ -299,7 +299,7 @@ def test_search_lexical_externally_created_field_attributes(self):
         assert result_2["hits"][0]["_id"] == "1234"
 
     def test_vector_search_non_existent_field(self):
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1,
                                                               docs=[{"some field": "Plane 1"}],
                                                               auto_refresh=True, device="cpu"))
         assert "brand new field" not in index_meta_cache.get_cache()[self.index_name_1].properties
@@ -311,7 +311,7 @@ def test_vector_search_non_existent_field(self):
 
     def test_lexical_search_non_existent_field(self):
         """"""
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1,
                                                               docs=[{"some field": "Plane 1"}],
                                                               auto_refresh=True, device="cpu"))
         assert "brand new field" not in index_meta_cache.get_cache()[self.index_name_1].properties
@@ -326,7 +326,7 @@ def test_cache_update_on_search(self):
         The cache should update after search
         With single KNN field, doc should be found even if field is not in cache
         """
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1,
                                                               docs=[{"some field": "Plane 1"}],
                                                               auto_refresh=True, device="cpu"))
         time.sleep(2.5)
@@ -560,7 +560,7 @@ def test_search_index_refresh_on_interval_multi_threaded(self):
         # we need to search it once, to get something in the cache, otherwise
         # the threads will see an empty cache and try to fill it
         try:
-            self.add_documents_and_refresh_index(
+            self.add_documents(
                 config=self.config, add_docs_params=AddDocsParams(
                     index_name=self.index_name_1, docs=[{"hi": "hello"}], auto_refresh=False, device="cpu"))
@@ -632,7 +632,7 @@ def run():
                 index_settings={"index_defaults": {"model": "random"}})
             clear_cache_thread = threading.Thread(target=clear_cache)
             clear_cache_thread.start()
-            self.add_documents_and_refresh_index(
+            self.add_documents(
                 config=self.config,
                 add_docs_params=AddDocsParams(
                     **{
@@ -672,7 +672,7 @@ def run():
             clear_cache_thread = threading.Thread(target=delete_index)
             clear_cache_thread.start()
             try:
-                self.add_documents_and_refresh_index(
+                self.add_documents(
                     **{"config": self.config},
                     add_docs_params=AddDocsParams(
                         **{
diff --git a/tests/tensor_search/test_lexical_search.py b/tests/tensor_search/test_lexical_search.py
index 3285f4d62..1d6e603e4 100644
--- a/tests/tensor_search/test_lexical_search.py
+++ b/tests/tensor_search/test_lexical_search.py
@@ -44,7 +44,7 @@ def strip_marqo_fields(doc, strip_id=False):
 
     def test_lexical_search_empty_text(self):
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1,
                 docs=[{"some doc 1": "some field 2", "some doc 2": "some other thing"}],
                 auto_refresh=True, device="cpu")
         )
@@ -53,7 +53,7 @@ def test_lexical_search_empty_text(self):
         assert res["hits"] == []
 
     def test_lexical_search_bad_text_type(self):
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1,
                 docs=[{"some doc 1": "some field 2", "some doc 2": "some other thing"}],
                 auto_refresh=True, device="cpu"))
         bad_args = [None, 1234, 1.0]
@@ -76,7 +76,7 @@ def test_lexical_search_multiple(self):
             "the big field": "very unlikely theory. marqo is pretty awesom, in the field"
         }
         d1 = {"title": "Marqo", "some doc 2": "some other thing", "_id": "abcdef"}
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, auto_refresh=True,
                 docs=[d1,
@@ -101,10 +101,10 @@ def test_lexical_search_single_searchable_attribs(self):
               "_id": "122"}
         d4 = {"Lucy": "Travis", "field lambda": "there is a whole bunch of text here. "
                                                 "Just a slight mention of a field", "_id": "123"}
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, auto_refresh=True,
                                                               docs=[d0, d4, d1 ], device="cpu"))
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, auto_refresh=True,
                                                               docs=[d3, d2], device="cpu"))
         res = tensor_search._lexical_search(
@@ -127,11 +127,11 @@ def test_lexical_search_multiple_searchable_attribs(self):
             "field lambda": "some prop called marqo. This actually has a lot more content than you thought."
         }
         d4 = {"Lucy": "Travis", "field lambda": "there is a whole bunch of text here. "
                                                 "Just a slight mention of a field", "_id": "123"}
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, auto_refresh=True, docs=[d0, d4, d1], device="cpu"))
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, auto_refresh=True, docs=[d3, d2], device="cpu")
         )
@@ -157,7 +157,7 @@ def test_lexical_search_result_count(self):
                                                 "Another bunch of words that may mean something. "
                                                 "Just a slight mention of a field"}
         d5 = {"some completely irrelevant": "document hehehe"}
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, auto_refresh=True,
                                                               docs=[d0, d4, d1, d3, d2], device="cpu"))
         r1 = tensor_search._lexical_search(
@@ -191,7 +191,7 @@ def test_search_lexical_param(self):
         d4 = {"Lucy": "Travis", "field lambda": "there is a whole bunch of text here. "
                                                 "Just a slight mention of a field"}
         d5 = {"some completely irrelevant": "document hehehe"}
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, auto_refresh=True,
                                                               docs=[d0, d4, d1, d3, d2], device="cpu"))
         res_lexical_search = tensor_search._lexical_search(
@@ -225,7 +225,7 @@ def test_lexical_search_overwriting_doc(self):
             "the big field": "just your average doc...",
             "Cool field": "Marqo is the best!"
         }
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, auto_refresh=True,
                                                               docs=[d0], device="cpu"))
         assert [] == tensor_search._lexical_search(
@@ -235,7 +235,7 @@ def test_lexical_search_overwriting_doc(self):
         assert len (grey_query["hits"]) == 1
         assert grey_query["hits"][0]["_id"] == a_consistent_id
         # update doc so it does indeed get returned
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, auto_refresh=True,
                                                               docs=[d1], device="cpu"))
         cool_query = tensor_search._lexical_search(
@@ -257,10 +257,10 @@ def test_lexical_search_filter(self):
         d4 = {"Lucy": "Travis", "field lambda": "there is a whole bunch of text here. "
                                                 "Just a slight mention of a field",
               "day": 190, "_id": "123"}
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, auto_refresh=True,
                                                               docs=[d0, d4, d1 ], device="cpu"))
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, auto_refresh=True,
                                                               docs=[d3, d2], device="cpu"))
         res = tensor_search._lexical_search(
@@ -280,7 +280,7 @@ def test_lexical_search_empty_searchable_attribs(self):
         d1 = {"title": "Marqo", "some doc 2": "some other thing", "_id": "abcdef"}
         d2 = {"some doc 1": "some 2 jnkerkbj", "field abc": "extravagant robodog is not a cat", "_id": "Jupyter_12"}
 
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, auto_refresh=True, docs=[d0, d1, d2], device="cpu")
         )
@@ -344,7 +344,7 @@ def test_lexical_search_double_quotes(self):
         ]
 
         fields = ["Field 1", "Field 2", "Field 3"]
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, docs=docs, auto_refresh=False, device="cpu")
         )
@@ -405,7 +405,7 @@ def test_lexical_search_double_quotes(self):
             assert len(id_only_hits) == 0
 
     def test_lexical_search_list(self):
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, docs=[
                     {"abc": "some text", "other field": "baaadd", "_id": "5678", "my_string": "b"},
@@ -440,7 +440,7 @@ def test_lexical_search_list(self):
         assert res_filtered_other_list['hits'][0]['_id'] == '1001'
 
     def test_lexical_search_list_searchable_attr(self):
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, docs=[
                     {"abc": "some text", "other field": "baaadd", "_id": "5678", "my_string": "b"},
@@ -463,7 +463,7 @@ def test_lexical_search_list_searchable_attr(self):
         assert len(res_wrong_attr['hits']) == 0
 
     def test_lexical_search_filter_with_dot(self):
-        self.add_documents_and_refresh_index(
+        self.add_documents(
             config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, docs=[
                 {"content": "a man on a horse", "filename" : "Important_File_1.pdf", "_id":"123"},
                 {"content": "the horse is eating grass", "filename": "Important_File_2.pdf", "_id": "456"},
diff --git a/tests/tensor_search/test_model_auth.py b/tests/tensor_search/test_model_auth.py
index 52f952551..2d3a9302f 100644
--- a/tests/tensor_search/test_model_auth.py
+++ b/tests/tensor_search/test_model_auth.py
@@ -123,7 +123,7 @@ def setUpClass(cls) -> None:
 
         with unittest.mock.patch('boto3.client', return_value=mock_s3_client) as mock_boto3_client:
             # Call the function that uses the generate_presigned_url method
-            res = cls.add_documents_and_refresh_index(config=cls.config, add_docs_params=AddDocsParams(
+            res = cls.add_documents(config=cls.config, add_docs_params=AddDocsParams(
                 index_name=cls.index_name_1, docs=[{'a': 'b'}],
                 model_auth=ModelAuth(
                     s3=S3Auth(aws_access_key_id=cls.fake_access_key_id, aws_secret_access_key=cls.fake_secret_key)),
@@ -161,7 +161,7 @@ def tearDown(self):
 
     def test_after_downloading_auth_doesnt_matter(self):
         """on this instance, at least"""
-        res = self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        res = self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, docs=[{'c': 'd'}], device="cpu"
         ))
         assert not res['errors']
@@ -173,7 +173,7 @@ def test_after_downloading_doesnt_redownload(self):
         assert not any([m['model_name'] == 'my_model' for m in mods])
         mock_req = mock.MagicMock()
         with mock.patch('urllib.request.urlopen', mock_req):
-            res = self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+            res = self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, auto_refresh=True, docs=[{'c': 'd'}], device="cpu"
             ))
             assert not res['errors']
@@ -255,7 +255,7 @@ def test_model_auth_hf(self):
             with unittest.mock.patch('open_clip.create_model_and_transforms', mock_open_clip_creat_model):
                 with unittest.mock.patch('marqo.s2_inference.model_downloading.from_hf.hf_hub_download', mock_hf_hub_download):
                     try:
-                        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+                        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                             index_name=self.index_name_1, auto_refresh=True, docs=[{'a': 'b'}],
                             model_auth=ModelAuth(hf=HfAuth(token=hf_token)), device="cpu"))
                     except BadRequestError as e:
@@ -478,7 +478,7 @@ def test_model_loads_from_all_add_docs_derivatives(self):
                 with unittest.mock.patch(
                     'marqo.s2_inference.processing.custom_clip_utils.download_pretrained_from_url'
                 ) as mock_download_pretrained_from_url:
-                    self.add_documents_and_refresh_index(
+                    self.add_documents(
                         config=self.config,
                         add_docs_params=AddDocsParams(
                             index_name=self.index_name_1,
@@ -614,7 +614,7 @@ def test_model_loads_from_multimodal_combination(self):
                             aws_access_key_id=fake_access_key_id,
                             aws_secret_access_key=fake_secret_key)
                     )
-                    res = self.add_documents_and_refresh_index(
+                    res = self.add_documents(
                         config=self.config,
                         add_docs_params=AddDocsParams(
                             index_name=self.index_name_1,
@@ -691,7 +691,7 @@ def test_no_creds_error(self):
 
         with unittest.mock.patch('boto3.client', return_value=mock_s3_client):
             with self.assertRaises(BadRequestError) as cm2:
-                res = self.add_documents_and_refresh_index(
+                res = self.add_documents(
                     config=self.config, add_docs_params=AddDocsParams(
                         index_name=self.index_name_1, auto_refresh=True, docs=[{'title': 'blah blah'}], device="cpu"
                     )
@@ -738,7 +738,7 @@ def test_bad_creds_error_s3(self):
             self.assertIn("403 error when trying to retrieve model from s3", str(cm.exception))
 
         with self.assertRaises(BadRequestError) as cm2:
-            res = self.add_documents_and_refresh_index(
+            res = self.add_documents(
                 config=self.config, add_docs_params=AddDocsParams(
                     index_name=self.index_name_1, auto_refresh=True, docs=[{'title': 'blah blah'}],
                     model_auth=model_auth, device="cpu"
@@ -780,7 +780,7 @@ def test_non_existent_hf_location(self):
             self.assertIn("Could not find the specified Hugging Face model repository.", str(cm.exception))
 
        with self.assertRaises(BadRequestError) as cm2:
-            res = self.add_documents_and_refresh_index(
+            res = self.add_documents(
                 config=self.config, add_docs_params=AddDocsParams(
                     index_name=self.index_name_1, auto_refresh=True, docs=[{'title': 'blah blah'}],
                     model_auth=model_auth, device="cpu"
@@ -823,7 +823,7 @@ def test_bad_creds_error_hf(self):
             self.assertIn("Could not find the specified Hugging Face model repository.", str(cm.exception))
 
         with self.assertRaises(BadRequestError) as cm2:
-            res = self.add_documents_and_refresh_index(
+            res = self.add_documents(
                 config=self.config, add_docs_params=AddDocsParams(
                     index_name=self.index_name_1, auto_refresh=True, docs=[{'title': 'blah blah'}],
                     model_auth=model_auth, device="cpu"
@@ -1122,7 +1122,7 @@ def tearDown(self):
 
     def test_after_downloading_auth_doesnt_matter(self):
         """on this instance, at least"""
-        res = self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        res = self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, auto_refresh=True, docs=[{'c': 'd'}], device="cpu"
         ))
         assert not res['errors']
@@ -1134,7 +1134,7 @@ def test_after_downloading_doesnt_redownload(self):
         assert not any([m['model_name'] == 'my_model' for m in mods])
         mock_req = mock.MagicMock()
         with mock.patch('urllib.request.urlopen', mock_req):
-            res = self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+            res = self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, auto_refresh=True, docs=[{'c': 'd'}], device="cpu"
             ))
             assert not res['errors']
@@ -1570,7 +1570,7 @@ def test_1_load_model_from_hf_zip_file_with_auth_add_documents(self):
             with unittest.mock.patch('marqo.s2_inference.model_downloading.from_hf.hf_hub_download', mock_hf_hub_download):
                 with unittest.mock.patch("marqo.s2_inference.hf_utils.extract_huggingface_archive", mock_extract_huggingface_archive):
                     try:
-                        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+                        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                             index_name=self.index_name_1, auto_refresh=True, docs=[{'a': 'b'}],
                             model_auth=ModelAuth(hf=HfAuth(token=hf_token)), device="cpu"))
                     except KeyError as e:
@@ -1630,7 +1630,7 @@ def test_2_load_model_from_hf_zip_file_without_auth_add_documents(self):
             with unittest.mock.patch('marqo.s2_inference.model_downloading.from_hf.hf_hub_download', mock_hf_hub_download):
                 with unittest.mock.patch("marqo.s2_inference.hf_utils.extract_huggingface_archive", mock_extract_huggingface_archive):
                     try:
-                        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+                        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                             index_name=self.index_name_1, auto_refresh=True, docs=[{'a': 'b'}], device="cpu"))
                     except KeyError as e:
                         # KeyError as this is not a real model. It does not have an attention_mask
@@ -1702,7 +1702,7 @@ def test_3_load_model_from_s3_zip_file_with_auth_add_documents(self):
 
                 with unittest.mock.patch("marqo.s2_inference.hf_utils.extract_huggingface_archive", mock_extract_huggingface_archive):
                     try:
-                        self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+                        self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                             index_name=self.index_name_1, auto_refresh=True, docs=[{'a': 'b'}],
                             model_auth=ModelAuth(s3=S3Auth(aws_access_key_id=fake_access_key_id,
                                                            aws_secret_access_key=fake_secret_key)),
@@ -1757,7 +1757,7 @@ def test_4_load_model_from_public_url_zip_file_add_documents(self):
         with mock.patch('transformers.AutoModel.from_pretrained', new=mock_automodel_from_pretrained):
             with mock.patch('marqo.s2_inference.processing.custom_clip_utils.download_pretrained_from_url', new=mock_download):
                 with mock.patch("marqo.s2_inference.hf_utils.extract_huggingface_archive", new=mock_extract_huggingface_archive):
-                    self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+                    self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                         index_name=self.index_name_1, auto_refresh=True, docs=[{'a': 'b'}], device="cpu"))
 
         assert len(mock_extract_huggingface_archive.call_args_list) == 1
@@ -1797,7 +1797,7 @@ def test_5_load_model_from_private_hf_repo_with_auth_add_documents(self):
 
         with unittest.mock.patch("transformers.AutoModel.from_pretrained", mock_automodel_from_pretrained):
             with unittest.mock.patch("transformers.AutoTokenizer.from_pretrained", mock_autotokenizer_from_pretrained):
-                self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+                self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                     index_name=self.index_name_1, auto_refresh=True, docs=[{'a': 'b'}],
                     model_auth=ModelAuth(hf=HfAuth(token=hf_token)), device="cpu"))
 
@@ -1868,7 +1868,7 @@ def test_62_load_model_from_public_hf_repo_without_auth_using_name_add_documents
         mock_automodel_from_pretrained = mock.MagicMock(side_effect=AutoModel.from_pretrained)
 
         with mock.patch('transformers.AutoModel.from_pretrained', new=mock_automodel_from_pretrained):
-            self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+            self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, auto_refresh=True, docs=[{'a': 'b'}], device="cpu"))
 
         mock_automodel_from_pretrained.assert_called_once_with(
@@ -2073,7 +2073,7 @@ def test_model_loads_from_all_add_docs_derivatives(self):
                 with unittest.mock.patch(
                     'marqo.s2_inference.processing.custom_clip_utils.download_pretrained_from_url'
                 ) as mock_download_pretrained_from_url:
-                    self.add_documents_and_refresh_index(
+                    self.add_documents(
                         config=self.config,
                         add_docs_params=AddDocsParams(
                             index_name=self.index_name_1,
@@ -2195,7 +2195,7 @@ def test_model_loads_from_multimodal_combination(self):
                             aws_access_key_id=fake_access_key_id,
                             aws_secret_access_key=fake_secret_key)
                     )
-                    res = self.add_documents_and_refresh_index(
+                    res = self.add_documents(
                         config=self.config,
                         add_docs_params=AddDocsParams(
                             index_name=self.index_name_1,
@@ -2268,7 +2268,7 @@ def test_no_creds_error(self):
 
         with unittest.mock.patch('boto3.client', return_value=mock_s3_client):
             with self.assertRaises(BadRequestError) as cm2:
-                res = self.add_documents_and_refresh_index(
+                res = self.add_documents(
                     config=self.config, add_docs_params=AddDocsParams(
                         index_name=self.index_name_1, auto_refresh=True, docs=[{'title': 'blah blah'}], device="cpu"
                     )
@@ -2313,7 +2313,7 @@ def test_bad_creds_error_s3(self):
             self.assertIn("403 error when trying to retrieve model from s3", str(cm.exception))
 
         with self.assertRaises(BadRequestError) as cm2:
-            res = self.add_documents_and_refresh_index(
+            res = self.add_documents(
                 config=self.config, add_docs_params=AddDocsParams(
                     index_name=self.index_name_1, auto_refresh=True, docs=[{'title': 'blah blah'}],
                     model_auth=model_auth, device="cpu"
@@ -2354,7 +2354,7 @@ def test_non_existent_hf_location(self):
             self.assertIn("Could not find the specified Hugging Face model repository.", str(cm.exception))
 
         with self.assertRaises(BadRequestError) as cm2:
-            res = self.add_documents_and_refresh_index(
+            res = self.add_documents(
                 config=self.config, add_docs_params=AddDocsParams(
                     index_name=self.index_name_1, auto_refresh=True, docs=[{'title': 'blah blah'}],
                     model_auth=model_auth, device="cpu"
@@ -2396,7 +2396,7 @@ def test_bad_creds_error_hf(self):
             self.assertIn("Could not find the specified Hugging Face model repository.", str(cm.exception))
 
         with self.assertRaises(BadRequestError) as cm2:
-            res = self.add_documents_and_refresh_index(
+            res = self.add_documents(
                 config=self.config, add_docs_params=AddDocsParams(
                     index_name=self.index_name_1, auto_refresh=True, docs=[{'title': 'blah blah'}],
                     model_auth=model_auth, device="cpu"
diff --git a/tests/tensor_search/test_model_auth_cuda.py b/tests/tensor_search/test_model_auth_cuda.py
index 68face585..019d66ff2 100644
--- a/tests/tensor_search/test_model_auth_cuda.py
+++ b/tests/tensor_search/test_model_auth_cuda.py
@@ -113,7 +113,7 @@ def tearDownClass(cls) -> None:
 
     def test_after_downloading_auth_doesnt_matter(self):
         """on this instance, at least"""
-        res = self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+        res = self.add_documents(config=self.config, add_docs_params=AddDocsParams(
             index_name=self.index_name_1, auto_refresh=True, docs=[{'c': 'd'}], device=self.device
         ))
         assert not res['errors']
@@ -123,7 +123,7 @@ def test_after_downloading_doesnt_redownload(self):
         tensor_search.eject_model(model_name=self.custom_model_name, device=self.device)
         mock_req = mock.MagicMock()
         with mock.patch('urllib.request.urlopen', mock_req):
-            res = self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+            res = self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.index_name_1, auto_refresh=True, docs=[{'c': 'd'}], device=self.device
             ))
diff --git a/tests/tensor_search/test_multimodal_tensor_combination.py b/tests/tensor_search/test_multimodal_tensor_combination.py
index eef72bbe1..8445946ae 100644
--- a/tests/tensor_search/test_multimodal_tensor_combination.py
+++ b/tests/tensor_search/test_multimodal_tensor_combination.py
@@ -226,12 +226,12 @@ def test_add_documents_with_one_multimodal_fields(self):
                 }
             }
 
-            self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+            self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                 index_name=index.name,
                 docs=[doc, ],
                 mappings=mappings if isinstance(index, UnstructuredMarqoIndex) else None,
                 device="cpu",
                 tensor_fields=["combo_text_image"] if isinstance(index, UnstructuredMarqoIndex) else None),
-            )
+                               )
             added_doc = tensor_search.get_document_by_id(config=self.config, index_name=index.name,
                                                          document_id="1", show_vectors=True)
             for key, value in doc.items():
@@ -279,7 +279,7 @@ def test_add_documents_with_multiple_multimodal_fields(self):
                 }
             }
 
-            self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+            self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                 index_name=index.name,
                 docs=[doc, ],
                 mappings=mappings if isinstance(index, UnstructuredMarqoIndex) else None,
                 device="cpu",
@@ -319,7 +319,7 @@ def test_get_document_by_id_return_multimodal_params_logic(self):
                 }
             }
 
-            self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+            self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                 index_name=index.name,
                 docs=[doc, ],
                 mappings=mappings if isinstance(index, UnstructuredMarqoIndex) else None,
                 device="cpu",
@@ -389,12 +389,12 @@ def test_multimodal_fields_correct_number_of_vectors(self):
 
         for mappings, tensor_fields, number_of_documents, number_of_vectors in test_cases:
             with self.subTest(f"{mappings}, {tensor_fields}"):
-                self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+                self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                     index_name=self.unstructured_random_multimodal_index.name,
                    docs=doc,
                    mappings=mappings,
                    device="cpu",
                    tensor_fields=tensor_fields),
-                )
+                                   )
 
                 res = self.monitoring.get_index_stats_by_name(index_name=self.unstructured_random_multimodal_index.name)
                 self.assertEqual(number_of_documents, res.number_of_documents)
@@ -431,7 +431,7 @@ def test_multimodal_field_bad_field_content(self):
                 error_msg = error_msg_unstructured if isinstance(index, UnstructuredMarqoIndex) else error_msg_structured
                 with self.subTest(error_msg):
                     with mock.patch("marqo.s2_inference.s2_inference.vectorise") as mock_vectorise:
-                        res = self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+                        res = self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                             index_name=index.name,
                             docs=[document, ],
                             mappings=mappings if isinstance(index, UnstructuredMarqoIndex) else None,
                             device="cpu",
@@ -450,7 +450,7 @@ def test_multimodal_tensor_combination_score(self):
             with self.subTest(f"Index type: {index.type}. Index name: {index.name}"):
                 def get_score(document):
                     self.clear_indexes(self.indexes)
-                    res = self.add_documents_and_refresh_index(
+                    res = self.add_documents(
                         config=self.config, add_docs_params=AddDocsParams(
                             index_name=index.name, docs=[document],
                             mappings={"combo_text_image": {"type": "multimodal_combination",
@@ -486,7 +486,7 @@ def get_score(document):
 
     def test_multimodal_tensor_combination_tensor_value(self):
         for index in [self.unstructured_unnormalized_multimodal_index, self.structured_unnormalized_multimodal_index]:
             with self.subTest(f"Index type: {index.type}. Index name: {index.name}"):
-                res = self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+                res = self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                     index_name=index.name, docs=[
                         {
                             "text_field_1": "A rider is riding a horse jumping over the barrier.",
@@ -527,7 +527,7 @@ def test_multimodal_tensor_combination_tensor_value(self):
                     } if isinstance(index, UnstructuredMarqoIndex) else None
                 ))
 
-                self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+                self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                     index_name=index.name, docs=[
                         {
                             "text_field_3": "A rider is riding a horse jumping over the barrier.",
@@ -573,7 +573,7 @@ def test_multimodal_tensor_combination_zero_weight(self):
             with self.subTest(f"Index type: {index.type}. Index name: {index.name}"):
                 def get_score(document):
                     self.clear_indexes(self.indexes)
-                    self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+                    self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                         index_name=index.name, docs=[document], device="cpu",
                         mappings={
                             "zero_weight_combo_text_image": {
                                 "type": "multimodal_combination",
@@ -612,7 +612,7 @@ def pass_through_multimodal(*args, **kwargs):
         @mock.patch("marqo.tensor_search.tensor_search.vectorise_multimodal_combination_field_unstructured",
                     mock_multimodal_combination)
         def run():
-            self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+            self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.unstructured_random_multimodal_index.name, docs=[
                     {
                         "text_field": "A rider is riding a horse jumping over the barrier.",
@@ -634,7 +634,7 @@ def run():
                           "weights": {"image_field": 0.5, "text_field": 0.5}}},
                 device="cpu", tensor_fields=["combo_text_image"]
             )
-            )
+                               )
 
             # first multimodal-doc
             real_field_0, field_content_0 = [call_args for call_args, call_kwargs
@@ -677,7 +677,7 @@ def pass_through_multimodal(*args, **kwargs):
         @mock.patch("marqo.tensor_search.tensor_search.vectorise_multimodal_combination_field_structured",
                     mock_multimodal_combination)
         def run():
-            self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams(
+            self.add_documents(config=self.config, add_docs_params=AddDocsParams(
                 index_name=self.structured_random_multimodal_index.name, docs=[
                     {
                         "text_field": "A rider is riding a horse jumping over the barrier.",
@@ -694,7 +694,7 @@ def run():
                         "_id": "534",
                     }],
             )
-            )
+                               )
 
             # TODO: Create function to extract args corresponding to "combo_text_image" only, as there are other fields in the args list.
# first multimodal-doc @@ -734,7 +734,7 @@ def pass_through_vectorise(*arg, **kwargs): @mock.patch("marqo.s2_inference.s2_inference.vectorise", mock_vectorise) def run(): - res = self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + res = self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[ { "text_field_1": "A rider is riding a horse jumping over the barrier_1.", @@ -793,7 +793,7 @@ def pass_through_vectorise(*arg, **kwargs): @mock.patch("marqo.s2_inference.s2_inference.vectorise", mock_vectorise) def run(): - res = self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + res = self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[ { "text_field_1": "A rider is riding a horse jumping over the barrier_1.", @@ -847,7 +847,7 @@ def pass_through_load_image_from_path(*arg, **kwargs): @mock.patch("marqo.s2_inference.clip_utils.load_image_from_path", mock_load_image_from_path) def run(): - res = self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + res = self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.unstructured_random_multimodal_index.name, docs=[ { "text_0": "A rider is riding a horse jumping over the barrier_0.", @@ -877,7 +877,7 @@ def run(): def test_lexical_search_on_multimodal_combination(self): # TODO: Make structured index - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.unstructured_multimodal_index.name, docs=[ { "Title": "Extravehicular Mobility Unit (EMU)", @@ -901,7 +901,7 @@ def test_lexical_search_on_multimodal_combination(self): }}, device="cpu", tensor_fields=["my_combination_field"] )) - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.unstructured_multimodal_index.name, docs=[ { "Title": "text", @@ -923,7 +923,7 @@ def test_lexical_search_on_multimodal_combination(self): "additional_field_1": 0.2, } }}, device="cpu", tensor_fields=["my_combination_field"]) - ) + ) res = tensor_search.search(config=self.config, index_name=self.unstructured_multimodal_index.name, text="search me please", search_method="LEXICAL") assert res["hits"][0]["_id"] == "article_591" @@ -934,7 +934,7 @@ def test_lexical_search_on_multimodal_combination(self): def test_search_with_filtering_and_infer_image_false(self): # TODO: Make structured index - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.unstructured_random_multimodal_index.name, docs=[ { @@ -994,7 +994,7 @@ def test_multimodal_combination_chunks(self): "_id": "123", } - res = self.add_documents_and_refresh_index( + res = self.add_documents( self.config, add_docs_params=AddDocsParams( docs=[test_doc], index_name=index.name, device="cpu", diff --git a/tests/tensor_search/test_pagination.py b/tests/tensor_search/test_pagination.py index dd7e92c88..d20255525 100644 --- a/tests/tensor_search/test_pagination.py +++ b/tests/tensor_search/test_pagination.py @@ -113,7 +113,7 @@ def test_pagination_single_field(self): 'desc': 'my description'} docs.append(doc) - r = self.add_documents_and_refresh_index( + r = self.add_documents( config=self.config, 
add_docs_params=AddDocsParams(index_name=index.name, # Add docs with increasing title word count, so each will have unique tensor and lexical scores @@ -171,7 +171,7 @@ def test_pagination_hybrid(self): "title": title, 'desc': 'my description'} docs.append(doc) - r = self.add_documents_and_refresh_index( + r = self.add_documents( config=self.config, add_docs_params=AddDocsParams(index_name=index.name, docs=docs, @@ -235,7 +235,7 @@ def test_pagination_hybrid_lexical_tensor_with_modifiers(self): for index in [self.index_structured, self.index_unstructured]: with self.subTest(index=type(index)): # Add documents - add_docs_res = self.add_documents_and_refresh_index( + add_docs_res = self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -340,7 +340,7 @@ def read_var(var): with mock.patch.object(utils, 'read_env_vars_and_defaults', new=read_var): for index in [self.index_structured, self.index_unstructured]: for _ in range(0, num_docs, batch_size): - r = self.add_documents_and_refresh_index( + r = self.add_documents( config=self.config, add_docs_params=AddDocsParams(index_name=index.name, docs=[{"title": 'my title', 'desc': 'my title'} for i in @@ -561,7 +561,7 @@ def test_lexical_search_pagination_empty_searchable_attribs(self): d1 = {"title": "Marqo", "some doc 2": "some other thing", "_id": "abcdef"} d2 = {"some doc 1": "some 2 jnkerkbj", "field abc": "extravagant robodog is not a cat", "_id": "Jupyter_12"} - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, auto_refresh=True, docs=[d0, d1, d2], device="cpu") diff --git a/tests/tensor_search/test_prefix.py b/tests/tensor_search/test_prefix.py index 703c05924..4f5fbab30 100644 --- a/tests/tensor_search/test_prefix.py +++ b/tests/tensor_search/test_prefix.py @@ -129,21 +129,21 @@ def test_prefix_text_chunks(self): for index in [self.unstructured_index_1, self.structured_text_index]: with self.subTest(index=index.type): # A) Add normal text document (1 chunk) - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{"_id": "doc_a", "text": "hello"}], auto_refresh=True, device=self.config.default_device, tensor_fields=["text"] if isinstance(index, UnstructuredMarqoIndex) else None )) # B) Add same text document but WITH PREFIX (1 chunk) - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{"_id": "doc_b", "text": "hello"}], auto_refresh=True, device=self.config.default_device, text_chunk_prefix="PREFIX: ", tensor_fields=["text"] if isinstance(index, UnstructuredMarqoIndex) else None )) # C) Add document with prefix built into text itself (1 chunk) - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{"_id": "doc_c", "text": "PREFIX: hello"}], auto_refresh=True, device=self.config.default_device, tensor_fields=["text"] if isinstance(index, UnstructuredMarqoIndex) else None @@ -189,21 +189,21 @@ def test_prefix_text_chunks_e5(self): for index in [self.unstructured_index_e5]: with self.subTest(index=index.type): # A) prefix should default to "passage: " with the e5-small model - 
self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{"_id": "doc_a", "text": "hello"}], auto_refresh=True, device=self.config.default_device, tensor_fields=["text"] if isinstance(index, UnstructuredMarqoIndex) else None )) # B) manually set prefix at the request level - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{"_id": "doc_b", "text": "hello"}], auto_refresh=True, device=self.config.default_device, text_chunk_prefix="passage: ", tensor_fields=["text"] if isinstance(index, UnstructuredMarqoIndex) else None )) # C) Set no prefix - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=index.name, docs=[{"_id": "doc_c", "text": "hello"}], auto_refresh=True, device=self.config.default_device, text_chunk_prefix="custom_prefix: ", tensor_fields=["text"] if isinstance(index, UnstructuredMarqoIndex) else None @@ -252,7 +252,7 @@ def test_prefix_multimodal(self): for index in [self.unstructured_index_multimodal]: with self.subTest(index=index.type): # Add a multimodal doc with a text and image field - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -274,7 +274,7 @@ def test_prefix_multimodal(self): ) ) - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index.name, @@ -364,7 +364,7 @@ def test_determine_text_chunk_prefix(self): self.assertEqual(result, "test passage: ") # doc_a should default to the override prefix - self.add_documents_and_refresh_index(config=self.config, add_docs_params=AddDocsParams( + self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.unstructured_index_with_override.name, docs=[{"_id": "doc_a", "text": "hello"}], auto_refresh=True, device=self.config.default_device, tensor_fields=["text"] if isinstance(self.unstructured_index_with_override, UnstructuredMarqoIndex) else None diff --git a/tests/tensor_search/test_search.py b/tests/tensor_search/test_search.py index 0c8551f41..c44848c12 100644 --- a/tests/tensor_search/test_search.py +++ b/tests/tensor_search/test_search.py @@ -901,7 +901,7 @@ def test_limit_results_none(self): vocab = requests.get(vocab_source).text.splitlines() - self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams(index_name=self.index_name_1, docs=[{"Title": "a " + ( " ".join(random.choices(population=vocab, k=25)))} @@ -1170,7 +1170,7 @@ def test_multi_search_images_edge_cases(self): docs=docs, auto_refresh=True ) invalid_queries = [{}, None, {123: 123}, {'123': None}, - {"https://marqo_not_real.com/image_1.png": 3}, set()] + {"https://marqo-not-real.com/image_1.png": 3}, set()] for q in invalid_queries: try: tensor_search.search( diff --git a/tests/tensor_search/test_searchable_attributes.py b/tests/tensor_search/test_searchable_attributes.py index c90dd7794..faa641991 100644 --- a/tests/tensor_search/test_searchable_attributes.py +++ b/tests/tensor_search/test_searchable_attributes.py @@ -51,7 +51,7 @@ def tearDown(self) -> None: self.device_patcher.stop() def _add_documents(self, index_name): - 
self.add_documents_and_refresh_index( + self.add_documents( config=self.config, add_docs_params=AddDocsParams( index_name=index_name, From 0a83a6afcfbc32bb56495ceaf84daff17a742338 Mon Sep 17 00:00:00 2001 From: Li Wan <49334982+wanliAlex@users.noreply.github.com> Date: Fri, 11 Oct 2024 10:57:10 +1100 Subject: [PATCH 3/7] Upgrade NLTK and Pillow packages and change base image to 36 --- .github/workflows/cuda_docker_marqo.yml | 20 +-- Dockerfile | 2 +- requirements.dev.txt | 4 +- src/marqo/s2_inference/processing/text.py | 6 + tests/s2_inference/test_encoding.py | 72 ++++---- .../test_add_documents_combined.py | 162 +++++++++++++++++- 6 files changed, 223 insertions(+), 43 deletions(-) diff --git a/.github/workflows/cuda_docker_marqo.yml b/.github/workflows/cuda_docker_marqo.yml index 5a0d3604b..7e41147c2 100644 --- a/.github/workflows/cuda_docker_marqo.yml +++ b/.github/workflows/cuda_docker_marqo.yml @@ -66,11 +66,11 @@ jobs: name: Run CUDA Docker Marqo API Tests needs: Start-Runner # required to start the main job when the runner is ready runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner - - environment: marqo-test-suite - + + environment: marqo-test-suite + steps: - + - name: Checkout marqo repo uses: actions/checkout@v3 with: @@ -81,13 +81,13 @@ jobs: with: python-version: "3.8" cache: "pip" - + - name: Install Dependencies run: | #pip install -r requirements.txt pip install tox==3.26 pip install flake8 - + - name: Set MQ_PY_MARQO_BRANCH variable run: | if [[ "${{ inputs.py_marqo_branch }}" == "marqo" ]]; then @@ -97,21 +97,21 @@ jobs: else echo "MQ_PY_MARQO_BRANCH=git+https://github.com/marqo-ai/py-marqo.git@${{ inputs.py_marqo_branch }}" >> $GITHUB_ENV fi - + - name: Checkout marqo-api-tests repo uses: actions/checkout@v3 with: repository: marqo-ai/marqo-api-tests ref: ${{ github.event.inputs.api_tests_branch }} - + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 - + - name: Set up Environment run: | # Set up conf file echo 'export MARQO_API_TESTS_ROOT="${{ github.workspace }}"' >> conf - + - name: Run CUDA Integration Tests - CUDA Docker Marqo run: | export MQ_API_TEST_BRANCH="${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" diff --git a/Dockerfile b/Dockerfile index 9ec82fa4f..cfcb97225 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,7 @@ COPY vespa . 
 RUN mvn clean package
 
 # Stage 2: Base image for Python setup
-FROM marqoai/marqo-base:30 as base_image
+FROM marqoai/marqo-base:36 as base_image
 
 # Allow mounting volume containing data and configs for vespa
 VOLUME /opt/vespa/var
diff --git a/requirements.dev.txt b/requirements.dev.txt
index 039473af2..c0a2b19e8 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -23,10 +23,10 @@ huggingface-hub==0.25.0
 more_itertools
 boto3==1.25.4
 botocore==1.28.4
-nltk==3.7
+nltk==3.9.1
 torch==1.12.1
 torchvision==0.13.1
-Pillow==9.3.0
+Pillow==10.4.0
 numpy==1.23.4
 validators==0.20.0
 sentence-transformers==2.2.2
diff --git a/src/marqo/s2_inference/processing/text.py b/src/marqo/s2_inference/processing/text.py
index 9c9cc3ef3..5029d67d1 100644
--- a/src/marqo/s2_inference/processing/text.py
+++ b/src/marqo/s2_inference/processing/text.py
@@ -30,6 +30,12 @@ def _splitting_functions(split_by: str, language: str='english') -> FunctionType
     except LookupError:
         nltk.download("punkt")
 
+    # punkt_tab needs to be downloaded separately for NLTK 3.8 and later
+    try:
+        nltk.data.find("tokenizers/punkt_tab")
+    except LookupError:
+        nltk.download("punkt_tab")
+
     MAPPING = {
         'character':list,
         'word': partial(word_tokenize, language=language),
diff --git a/tests/s2_inference/test_encoding.py b/tests/s2_inference/test_encoding.py
index c5e8f2ac4..d2fdf987b 100644
--- a/tests/s2_inference/test_encoding.py
+++ b/tests/s2_inference/test_encoding.py
@@ -26,7 +26,8 @@ def tearDown(self) -> None:
         clear_loaded_models()
 
     def test_vectorize(self):
-        names = ["fp16/ViT-B/32", "onnx16/open_clip/ViT-B-32/laion400m_e32", 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32',
+        names = ["fp16/ViT-B/32", "onnx16/open_clip/ViT-B-32/laion400m_e32",
+                 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32',
                  "all-MiniLM-L6-v1", "all_datasets_v4_MiniLM-L6", "hf/all-MiniLM-L6-v1", "hf/all_datasets_v4_MiniLM-L6",
                  "hf/bge-small-en-v1.5", "onnx/all-MiniLM-L6-v1", "onnx/all_datasets_v4_MiniLM-L6"]
@@ -38,7 +39,7 @@ def test_vectorize(self):
         names_snowflake = ["hf/snowflake-arctic-embed-m", "hf/snowflake-arctic-embed-m-v1.5"]
 
         names = names + names_e5 + names_bge + names_snowflake
-
+
         sentences = ['hello', 'this is a test sentence. so is this.', ['hello', 'this is a test sentence. so is this.']]
         device = 'cpu'
         eps = 1e-9
@@ -58,6 +59,40 @@ def test_vectorize(self):
 
         clear_loaded_models()
 
+    def test_vectorize_normalise(self):
+        open_clip_names = ["open_clip/ViT-B-32/laion2b_s34b_b79k"]
+
+        names_bge = ["hf/bge-small-en-v1.5", "hf/bge-base-en-v1.5"]
+
+        names_snowflake = ["hf/snowflake-arctic-embed-m", "hf/snowflake-arctic-embed-m-v1.5"]
+
+        names = open_clip_names + names_bge + names_snowflake
+
+        sentences = ['hello', 'this is a test sentence. so is this.', ['hello', 'this is a test sentence. so is this.']]
+        device = 'cpu'
+        eps = 1e-9
+
+        for name in names:
+            model_properties = get_model_properties_from_registry(name)
+            model = _load_model(model_properties['name'], model_properties=model_properties, device=device)
+
+            for sentence in sentences:
+                output_v = vectorise(name, sentence, model_properties, device, normalize_embeddings=True)
+                assert _check_output_type(output_v)
+                output_m = model.encode(sentence, normalize=True)
+                assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
+                for vector in output_v:
+                    assert abs(np.linalg.norm(np.array(vector)) - 1) < 1e-5
+
+                output_v_unnormalised = vectorise(name, sentence, model_properties, device, normalize_embeddings=False)
+                assert _check_output_type(output_v_unnormalised)
+                output_m_unnormalised = model.encode(sentence, normalize=False)
+                assert abs(torch.FloatTensor(output_v_unnormalised) - torch.FloatTensor(output_m_unnormalised)).sum() < eps
+                for vector in output_v_unnormalised:
+                    assert abs(np.linalg.norm(np.array(vector)) - 1) > 1e-5
+
+        clear_loaded_models()
+
     def test_cpu_encode_type(self):
         names = ["fp16/ViT-B/32", "onnx16/open_clip/ViT-B-32/laion400m_e32", 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32',
                  "all-MiniLM-L6-v1", "all_datasets_v4_MiniLM-L6", "hf/all-MiniLM-L6-v1", "hf/all_datasets_v4_MiniLM-L6",
@@ -252,29 +287,6 @@ def test_model_un_normalization(self):
 
         clear_loaded_models()
 
-    def test_onnx_clip_vectorise(self):
-        names = ["onnx16/open_clip/ViT-B-32/laion400m_e32", 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32']
-
-        sentences = ['hello', 'this is a test sentence. so is this.',
-                     ['hello', 'this is a test sentence. so is this.']]
-        device = 'cpu'
-        eps = 1e-9
-
-        for name in names:
-            model_properties = get_model_properties_from_registry(name)
-            model = _load_model(model_properties['name'], model_properties=model_properties, device=device)
-
-            for sentence in sentences:
-                output_v = vectorise(name, sentence, model_properties, device, normalize_embeddings=True)
-
-                assert _check_output_type(output_v)
-
-                output_m = model.encode(sentence, normalize=True)
-
-                assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
-
-        clear_loaded_models()
-
 
 class TestOpenClipModelEncoding(unittest.TestCase):
     '''
@@ -307,13 +319,15 @@ def test_open_clip_vectorize(self):
             model = _load_model(model_properties['name'], model_properties=model_properties, device=device)
 
             for sentence in sentences:
-                output_v = vectorise(name, sentence, model_properties, device, normalize_embeddings=True)
+                for normalize_embeddings in [True, False]:
+                    output_v = vectorise(name, sentence, model_properties, device,
+                                         normalize_embeddings=normalize_embeddings)
 
-                assert _check_output_type(output_v)
+                    assert _check_output_type(output_v)
 
-                output_m = model.encode(sentence, normalize=True)
+                    output_m = model.encode(sentence, normalize=normalize_embeddings)
 
-                assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
+                    assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
 
         clear_loaded_models()
 
diff --git a/tests/tensor_search/integ_tests/test_add_documents_combined.py b/tests/tensor_search/integ_tests/test_add_documents_combined.py
index 36cdfda6f..21c3b90b4 100644
--- a/tests/tensor_search/integ_tests/test_add_documents_combined.py
+++ b/tests/tensor_search/integ_tests/test_add_documents_combined.py
@@ -1,8 +1,12 @@
 import os
+import unittest.mock
 import uuid
 from unittest import mock
 from unittest.mock import patch
 
+import PIL
+import numpy as np
+
 import numpy as np
 import pytest
@@ -12,13 +16,16 @@
 import torch
 from more_itertools import flatten
 from torch import Tensor
+
 import unittest.mock
 
 from marqo.core.models.marqo_get_documents_by_id_response import MarqoGetDocumentsByIdsResponse
 from marqo.core.models.marqo_index import *
 from marqo.core.models.marqo_index_request import FieldRequest
 from marqo.s2_inference import types
+from marqo.s2_inference.multimodal_model_load import infer_modality
 from marqo.tensor_search import add_docs
+from marqo.tensor_search import streaming_media_processor
 from marqo.tensor_search import tensor_search
 from marqo.core.models.add_docs_params import AddDocsParams, BatchVectorisationMode
 from tests.marqo_test import MarqoTestCase, TestImageUrls
@@ -52,6 +59,33 @@ def setUpClass(cls) -> None:
             tensor_fields=["image_field_1", "text_field_1", "text_field_2", "multimodal_field"]
         )
 
+        structured_image_index_request_unnormalized = cls.structured_marqo_index_request(
+            name="structured_image_index_unnormalised" + str(uuid.uuid4()).replace('-', ''),
+            fields=[
+                FieldRequest(name="image_field_1", type=FieldType.ImagePointer),
+                FieldRequest(name="text_field_1", type=FieldType.Text,
+                             features=[FieldFeature.Filter, FieldFeature.LexicalSearch]),
+            ],
+            model=Model(name="open_clip/ViT-B-32/laion2b_s34b_b79k"),
+            tensor_fields=["image_field_1", "text_field_1"],
+            normalize_embeddings=False,
+            distance_metric=DistanceMetric.DotProduct
+        )
+
+        structured_text_index_request_unnormalized = cls.structured_marqo_index_request(
+            name="structured_text_index_unnormalised" + str(uuid.uuid4()).replace('-', ''),
+            fields=[
+                FieldRequest(
+                    name="text_field_1", type=FieldType.Text,
+                    features=[FieldFeature.Filter, FieldFeature.LexicalSearch]
+                ),
+            ],
+            model=Model(name="hf/e5-base-v2"),
+            tensor_fields=["text_field_1"],
+            normalize_embeddings=False,
+            distance_metric=DistanceMetric.DotProduct
+        )
+
         structured_languagebind_index_request = cls.structured_marqo_index_request(
             name="my-multimodal-index" + str(uuid.uuid4()).replace('-', ''),
             fields=[
@@ -112,21 +146,46 @@ def setUpClass(cls) -> None:
             marqo_version='2.12.0'
         )
 
+        unstructured_image_index_request_unnormalized = cls.unstructured_marqo_index_request(
+            name="unstructured_image_index_unnormalised" + str(uuid.uuid4()).replace('-', ''),
+            model=Model(name="open_clip/ViT-B-32/laion2b_s34b_b79k"),
+            normalize_embeddings=False,
+            distance_metric=DistanceMetric.DotProduct
+        )
+
+        unstructured_text_index_request_unnormalized = cls.unstructured_marqo_index_request(
+            name="unstructured_text_index_unnormalised" + str(uuid.uuid4()).replace('-', ''),
+            model=Model(name="hf/e5-base-v2"),
+            normalize_embeddings=False,
+            distance_metric=DistanceMetric.DotProduct
+        )
+
         cls.indexes = cls.create_indexes([
             structured_image_index_request,
             semi_structured_image_index_request,
             unstructured_image_index_request,
+
             structured_languagebind_index_request,
             semi_structured_languagebind_index_request,
-            unstructured_languagebind_index_request
+            unstructured_languagebind_index_request,
+
+            unstructured_image_index_request_unnormalized,
+            unstructured_text_index_request_unnormalized,
+            structured_image_index_request_unnormalized,
+            structured_text_index_request_unnormalized
         ])
 
         cls.structured_marqo_index_name = structured_image_index_request.name
         cls.structured_languagebind_index_name = structured_languagebind_index_request.name
         cls.semi_structured_marqo_index_name = semi_structured_image_index_request.name
         cls.semi_structured_languagebind_index_name = semi_structured_languagebind_index_request.name
+        cls.structured_image_index_unnormalized_name = structured_image_index_request_unnormalized.name
+        cls.structured_text_index_unnormalized_name = structured_text_index_request_unnormalized.name
+
         cls.unstructured_marqo_index_name = unstructured_image_index_request.name
         cls.unstructured_languagebind_index_name = unstructured_languagebind_index_request.name
+        cls.unstructured_image_index_unnormalized_name = unstructured_image_index_request_unnormalized.name
+        cls.unstructured_text_index_unnormalized_name = unstructured_text_index_request_unnormalized.name
 
         cls.image_indexes = cls.indexes[:3]
         cls.languagebind_indexes = cls.indexes[3:6]
@@ -933,3 +992,104 @@ def get_docs():
         assert_get_documents_response_equals(
             docs_added_using_per_field_strategy, docs_added_using_per_batch_strategy,
             msg=f'per_field strategy differs from per_batch strategy for index type: {index.type}')
+
+
+    def test_imageIndexEmbeddingsUnnormalised(self):
+        """Test to ensure that the image embeddings are unnormalised when the index is unnormalised"""
+        documents = [
+            {
+                "image_field_1": TestImageUrls.HIPPO_REALISTIC.value,
+                "_id": "1"
+            }
+        ]
+        for index_name in [self.unstructured_image_index_unnormalized_name, self.structured_image_index_unnormalized_name]:
+            tensor_fields = ["image_field_1"] if index_name == self.unstructured_image_index_unnormalized_name \
+                else None
+            with self.subTest(index_name):
+                res = tensor_search.add_documents(
+                    self.config,
+                    add_docs_params=AddDocsParams(
+                        docs=documents,
+                        index_name=index_name,
+                        tensor_fields=tensor_fields
+                    )
+                )
+                for item in res.dict(exclude_none=True, by_alias=True)['items']:
+                    self.assertEqual(200, item['status'])
+
+                get_res = tensor_search.get_documents_by_ids(
+                    config=self.config, index_name=index_name,
+                    document_ids=["1"],
+                    show_vectors=True
+                ).dict(exclude_none=True, by_alias=True)
+
+                embeddings = get_res['results'][0]['_tensor_facets'][0]['_embedding']
+                norm = np.linalg.norm(np.array(embeddings))
+                self.assertTrue(abs(norm - 1.0) > 1e-5, f"Embedding norm is {norm}")
+
+    def test_imageIndexEmbeddingsNormalised(self):
+        """Test to ensure that the image embeddings are normalised when the index is normalised"""
+
+        documents = [
+            {
+                "image_field_1": TestImageUrls.HIPPO_REALISTIC.value,
+                "_id": "1"
+            }
+        ]
+        for index_name in [self.unstructured_marqo_index_name, self.structured_marqo_index_name]:
+            tensor_fields = ["image_field_1"] if index_name == self.unstructured_marqo_index_name \
+                else None
+            with self.subTest(index_name):
+                res = tensor_search.add_documents(
+                    self.config,
+                    add_docs_params=AddDocsParams(
+                        docs=documents,
+                        index_name=index_name,
+                        tensor_fields=tensor_fields
+                    )
+                )
+                for item in res.dict(exclude_none=True, by_alias=True)['items']:
+                    self.assertEqual(200, item['status'])
+
+                get_res = tensor_search.get_documents_by_ids(
+                    config=self.config, index_name=index_name,
+                    document_ids=["1"],
+                    show_vectors=True
+                ).dict(exclude_none=True, by_alias=True)
+
+                embeddings = get_res['results'][0]['_tensor_facets'][0]['_embedding']
+                norm = np.linalg.norm(np.array(embeddings))
+                self.assertTrue(abs(norm - 1.0) < 1e-5, f"Embedding norm is {norm}")
+
+    def test_textIndexEmbeddingsUnnormalized(self):
+        """A test to ensure that the text embeddings are unnormalised when the index is unnormalised"""
+        documents = [
+            {
+                "text_field_1": "This is a test text",
+                "_id": "1"
+            }
+        ]
+        for index_name in [self.unstructured_text_index_unnormalized_name, self.structured_text_index_unnormalized_name]:
+            tensor_fields = ["text_field_1"] if index_name == self.unstructured_text_index_unnormalized_name \
+                else None
+            with self.subTest(index_name):
+                res = tensor_search.add_documents(
+                    self.config,
+                    add_docs_params=AddDocsParams(
+                        docs=documents,
+                        index_name=index_name,
+                        tensor_fields=tensor_fields
+                    )
+                )
+                for item in res.dict(exclude_none=True, by_alias=True)['items']:
+                    self.assertEqual(200, item['status'])
+
+                get_res = tensor_search.get_documents_by_ids(
+                    config=self.config, index_name=index_name,
+                    document_ids=["1"],
+                    show_vectors=True
+                ).dict(exclude_none=True, by_alias=True)
+
+                embeddings = get_res['results'][0]['_tensor_facets'][0]['_embedding']
+                norm = np.linalg.norm(np.array(embeddings))
+                self.assertTrue(abs(norm - 1.0) > 1e-5, f"Embedding norm is {norm}")
\ No newline at end of file

From 055237ae6c4a8121b4026650582f3a23bd416564 Mon Sep 17 00:00:00 2001
From: Raynor Chavez
Date: Mon, 14 Oct 2024 13:41:20 +1100
Subject: [PATCH 4/7] 2.12.2 release notes (#1000)

---
 RELEASE.md | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index be468ee18..d30e57a22 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,8 +1,14 @@
+# Release 2.12.2
+
+## Bug fixes and minor changes
+- Upgrade the `Pillow` and `nltk` packages ([#989](https://github.com/marqo-ai/marqo/pull/989)).
+- Fix a bug where `normalizeEmbeddings=False` is not honored for some indexes ([#994](https://github.com/marqo-ai/marqo/pull/994)).
+
 # Release 2.12.1
 
 ## Bug fixes and minor changes
-- Fix a bug where when `treatUrlsAndPointersAsImages` is unset and `treatUrlsAndPointersAsMedia` is set, Marqo returns an error where `treatUrlsAndPointersAsImages` cannot be `False` when `treatUrlsAndPointersAsMedia` is `True`
-- Add new video-audio model `LanguageBind/Video_V1.5_FT_Audio_FT` to the model registry.
+- Fix a bug where Marqo returned the error "`treatUrlsAndPointersAsImages` cannot be `False` when `treatUrlsAndPointersAsMedia` is `True`" when `treatUrlsAndPointersAsImages` was unset and `treatUrlsAndPointersAsMedia` was set ([#971](https://github.com/marqo-ai/marqo/commit/a0084a86d5cf797616a1f8e185eba87417edbc15)).
+- Add new video-audio model `LanguageBind/Video_V1.5_FT_Audio_FT` to the model registry ([#971](https://github.com/marqo-ai/marqo/commit/a0084a86d5cf797616a1f8e185eba87417edbc15)).
 
 # Release 2.12.0
 
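For reference, the `normalizeEmbeddings=False` fix released in 2.12.2 above is exactly what the new `test_*EmbeddingsUnnormalised`/`Normalised` tests in this series assert: they read a document's `_tensor_facets` back and check the L2 norm of the stored embedding. A minimal sketch of that check (illustrative only; the `1e-5` tolerance mirrors the value these tests use, not a tolerance mandated by Marqo):

```python
import numpy as np

def is_normalised(embedding, eps=1e-5):
    """Return True when the embedding's L2 norm is within eps of 1.

    Normalised indexes are expected to store unit-length vectors;
    with normalize_embeddings=False the norm should differ from 1.
    """
    return abs(np.linalg.norm(np.asarray(embedding, dtype=np.float64)) - 1.0) < eps

# A unit vector passes the check; an unscaled vector does not.
assert is_normalised([0.6, 0.8])
assert not is_normalised([1.0, 2.0])
```
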

From a2dcdab5a6332ab8ff09dd7fecf0434d6f571584 Mon Sep 17 00:00:00 2001
From: Li Wan <49334982+wanliAlex@users.noreply.github.com>
Date: Fri, 18 Oct 2024 14:27:18 +1100
Subject: [PATCH 5/7] Move test workflows to OpenSource AWS Account and Add tags (#997)

---
 .github/workflows/arm64_docker_marqo.yml      | 31 ++++++++------
 .github/workflows/cpu_docker_marqo.yml        | 36 ++++++++++++-----
 .github/workflows/cpu_local_marqo.yml         | 29 +++++++++-----
 .github/workflows/cuda_docker_marqo.yml       | 26 ++++++++----
 .github/workflows/largemodel_unit_test_CI.yml | 25 ++++++++----
 .github/workflows/unit_test_200gb_CI.yml      | 40 ++++++++++++++-----
 6 files changed, 129 insertions(+), 58 deletions(-)

diff --git a/.github/workflows/arm64_docker_marqo.yml b/.github/workflows/arm64_docker_marqo.yml
index c447fb187..92060d75d 100644
--- a/.github/workflows/arm64_docker_marqo.yml
+++ b/.github/workflows/arm64_docker_marqo.yml
@@ -46,21 +46,28 @@ jobs:
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v1
         with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
       - name: Start EC2 runner
         id: start-ec2-runner
         uses: machulav/ec2-github-runner@v2
         with:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-
-          # ARM Runner Image
-          ec2-image-id: ${{ secrets.ARM_EC2_IMAGE_ID }}
-          ec2-instance-type: t4g.xlarge
-          subnet-id: ${{ secrets.ARM_SUBNET_ID }}
-          security-group-id: ${{ secrets.ARM_SECURITY_GROUP_ID }}
+          ec2-image-id: ${{ vars.MARQO_CPU_ARM64_TESTS_INSTANCE_AMI }}
+          ec2-instance-type: m6i.xlarge
+          subnet-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SUBNET_ID }}
+          security-group-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SECURITY_GROUP_ID }}
+          aws-resource-tags: > # optional, requires additional permissions
+            [
+              {"Key": "Name", "Value": "marqo-github-runner-${{ github.run_id }}"},
+              {"Key": "GitHubRepo", "Value": "${{ github.repository }}"},
+              {"Key": "WorkflowName", "Value": "${{ github.workflow }}"},
+              {"Key": "WorkflowRunId", "Value": "${{ github.run_id }}"},
+              {"Key": "WorkflowURL", "Value": "${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }}"},
+              {"Key": "PoloRole", "Value": "testing"}
+            ]
 
   Test-Marqo:
     name: Run ARM64 Docker Marqo API Tests
@@ -133,9 +140,9 @@
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v1
         with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@v2
         with:
diff --git a/.github/workflows/cpu_docker_marqo.yml b/.github/workflows/cpu_docker_marqo.yml
index ba2221d44..ae3a5a669 100644
--- a/.github/workflows/cpu_docker_marqo.yml
+++ b/.github/workflows/cpu_docker_marqo.yml
@@ -32,10 +32,17 @@ on:
       - mainline
     paths-ignore:
       - '**.md'
+  pull_request:
+    branches:
+      - mainline
 
 permissions:
   contents: read
 
+concurrency:
+  group: cpu-docker-api-tests-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
   Start-Runner:
     name: Start self-hosted EC2 runner
@@ -47,19 +54,28 @@ jobs:
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v1
         with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
       - name: Start EC2 runner
         id: start-ec2-runner
         uses: machulav/ec2-github-runner@v2
         with:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ secrets.AMD_EC2_IMAGE_ID }}
-          ec2-instance-type: t3.xlarge
-          subnet-id: ${{ secrets.AMD_SUBNET_ID }}
-          security-group-id: ${{ secrets.AMD_SECURITY_GROUP_ID }}
+          ec2-image-id: ${{ vars.MARQO_CPU_AMD64_TESTS_INSTANCE_AMI }}
+          ec2-instance-type: m6i.xlarge
+          subnet-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SUBNET_ID }}
+          security-group-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SECURITY_GROUP_ID }}
+          aws-resource-tags: > # optional, requires additional permissions
+            [
+              {"Key": "Name", "Value": "marqo-github-runner-${{ github.run_id }}"},
+              {"Key": "GitHubRepo", "Value": "${{ github.repository }}"},
+              {"Key": "WorkflowName", "Value": "${{ github.workflow }}"},
+              {"Key": "WorkflowRunId", "Value": "${{ github.run_id }}"},
+              {"Key": "WorkflowURL", "Value": "${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }}"},
+              {"Key": "PoloRole", "Value": "testing"}
+            ]
 
   Test-Marqo:
     name: Run CPU Docker Marqo API Tests
@@ -132,9 +148,9 @@
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v1
         with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@v2
         with:
diff --git a/.github/workflows/cpu_local_marqo.yml b/.github/workflows/cpu_local_marqo.yml
index 94eece8ec..65b4fe5b0 100644
--- a/.github/workflows/cpu_local_marqo.yml
+++ b/.github/workflows/cpu_local_marqo.yml
@@ -54,19 +54,28 @@ jobs:
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v1
         with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
       - name: Start EC2 runner
         id: start-ec2-runner
         uses: machulav/ec2-github-runner@v2
         with:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ secrets.AMD_EC2_IMAGE_ID }}
-          ec2-instance-type: t3.xlarge
-          subnet-id: ${{ secrets.AMD_SUBNET_ID }}
-          security-group-id: ${{ secrets.AMD_SECURITY_GROUP_ID }}
+          ec2-image-id: ${{ vars.MARQO_CPU_AMD64_TESTS_INSTANCE_AMI }}
+          ec2-instance-type: m6i.xlarge
+          subnet-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SUBNET_ID }}
+          security-group-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SECURITY_GROUP_ID }}
+          aws-resource-tags: > # optional, requires additional permissions
+            [
+              {"Key": "Name", "Value": "marqo-github-runner-${{ github.run_id }}"},
+              {"Key": "GitHubRepo", "Value": "${{ github.repository }}"},
+              {"Key": "WorkflowName", "Value": "${{ github.workflow }}"},
+              {"Key": "WorkflowRunId", "Value": "${{ github.run_id }}"},
+              {"Key": "WorkflowURL", "Value": "${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }}"},
+              {"Key": "PoloRole", "Value": "testing"}
+            ]
 
   Test-Marqo:
     name: Run CPU Local Marqo API Tests
@@ -137,9 +146,9 @@
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v1
         with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@v2
         with:
diff --git a/.github/workflows/cuda_docker_marqo.yml b/.github/workflows/cuda_docker_marqo.yml
index 7e41147c2..fd05e4319 100644
--- a/.github/workflows/cuda_docker_marqo.yml
+++ b/.github/workflows/cuda_docker_marqo.yml
@@ -47,9 +47,9 @@ jobs:
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v1
         with:
-          aws-access-key-id: ${{ secrets.LARGEMODELTEST_ACCESSKEY }}
-          aws-secret-access-key: ${{ secrets.LARGEMODELTEST_SECRETACCESSKEY }}
-          aws-region: ${{ secrets.LARGEMODELTEST_AWS_REGION }}
+          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
       - name: Start EC2 runner
         id: start-ec2-runner
         uses: machulav/ec2-github-runner@v2
@@ -60,8 +60,18 @@ jobs:
           # CUDA AMD64 instance
           ec2-image-id: ${{ vars.MARQO_CUDA_TESTS_INSTANCE_AMI }}
           ec2-instance-type: g4dn.2xlarge
-          subnet-id: ${{ secrets.LARGEMODELTEST_SUBNET_ID }}
-          security-group-id: ${{ secrets.LARGEMODELTEST_SECURITY_GROUP }}
+          subnet-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SUBNET_ID }}
+          security-group-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SECURITY_GROUP_ID }}
+          aws-resource-tags: > # optional, requires additional permissions
+            [
+              {"Key": "Name", "Value": "marqo-github-runner-${{ github.run_id }}"},
+              {"Key": "GitHubRepo", "Value": "${{ github.repository }}"},
+              {"Key": "WorkflowName", "Value": "${{ github.workflow }}"},
+              {"Key": "WorkflowRunId", "Value": "${{ github.run_id }}"},
+              {"Key": "WorkflowURL", "Value": "${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }}"},
+              {"Key": "PoloRole", "Value": "testing"}
+            ]
+
   Test-Marqo:
     name: Run CUDA Docker Marqo API Tests
     needs: Start-Runner # required to start the main job when the runner is ready
@@ -131,9 +141,9 @@
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v1
         with:
-          aws-access-key-id: ${{ secrets.LARGEMODELTEST_ACCESSKEY }}
-          aws-secret-access-key: ${{ secrets.LARGEMODELTEST_SECRETACCESSKEY }}
-          aws-region: ${{ secrets.LARGEMODELTEST_AWS_REGION }}
+          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@v2
         with:
diff --git a/.github/workflows/largemodel_unit_test_CI.yml b/.github/workflows/largemodel_unit_test_CI.yml
index 3f3d7da03..ff8e63c22 100644
--- a/.github/workflows/largemodel_unit_test_CI.yml
+++ b/.github/workflows/largemodel_unit_test_CI.yml
@@ -31,9 +31,9 @@ jobs:
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v1
         with:
-          aws-access-key-id: ${{ secrets.LARGEMODELTEST_ACCESSKEY }}
-          aws-secret-access-key: ${{ secrets.LARGEMODELTEST_SECRETACCESSKEY }}
-          aws-region: ${{ secrets.LARGEMODELTEST_AWS_REGION }}
+          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
       - name: Start EC2 runner
         id: start-ec2-runner
         uses: machulav/ec2-github-runner@v2
@@ -42,8 +42,17 @@ jobs:
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
           ec2-image-id: ${{ vars.MARQO_CUDA_TESTS_INSTANCE_AMI }}
           ec2-instance-type: g4dn.2xlarge
-          subnet-id: ${{ secrets.LARGEMODELTEST_SUBNET_ID }}
-          security-group-id: ${{ secrets.LARGEMODELTEST_SECURITY_GROUP }}
+          subnet-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SUBNET_ID }}
+          security-group-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SECURITY_GROUP_ID }}
+          aws-resource-tags: > # optional, requires additional permissions
+            [
+              {"Key": "Name", "Value": "marqo-github-runner-${{ github.run_id }}"},
+              {"Key": "GitHubRepo", "Value": "${{ github.repository }}"},
+              {"Key": "WorkflowName", "Value": "${{ github.workflow }}"},
+              {"Key": "WorkflowRunId", "Value": "${{ github.run_id }}"},
+              {"Key": "WorkflowURL", "Value": "${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }}"},
+              {"Key": "PoloRole", "Value": "testing"}
+            ]
 
   Test-Marqo:
     name: Run Large Model Unit Tests
@@ -155,9 +164,9 @@
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v1
         with:
-          aws-access-key-id: ${{ secrets.LARGEMODELTEST_ACCESSKEY }}
-          aws-secret-access-key: ${{ secrets.LARGEMODELTEST_SECRETACCESSKEY }}
-          aws-region: ${{ secrets.LARGEMODELTEST_AWS_REGION }}
+          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@v2
         with:
diff --git a/.github/workflows/unit_test_200gb_CI.yml b/.github/workflows/unit_test_200gb_CI.yml
index 806ee11ed..785a2af39 100644
--- a/.github/workflows/unit_test_200gb_CI.yml
+++ b/.github/workflows/unit_test_200gb_CI.yml
@@ -31,19 +31,28 @@ jobs:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
       - name: Start EC2 runner
         id: start-ec2-runner
         uses: machulav/ec2-github-runner@v2
         with:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ secrets.AMD_200GB_EC2_IMAGE_ID }}
-          ec2-instance-type: t3.2xlarge
-          subnet-id: ${{ secrets.AMD_SUBNET_ID }}
-          security-group-id: ${{ secrets.AMD_SECURITY_GROUP_ID }}
+          ec2-image-id: ${{ vars.MARQO_CPU_AMD64_TESTS_INSTANCE_AMI }}
+          ec2-instance-type: m6i.xlarge
+          subnet-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SUBNET_ID }}
+          security-group-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SECURITY_GROUP_ID }}
+          aws-resource-tags: > # optional, requires additional permissions
+            [
+              {"Key": "Name", "Value": "marqo-github-runner-${{ github.run_id }}"},
+              {"Key": "GitHubRepo", "Value": "${{ github.repository }}"},
+              {"Key": "WorkflowName", "Value": "${{ github.workflow }}"},
+              {"Key": "WorkflowRunId", "Value": "${{ github.run_id }}"},
+              {"Key": "WorkflowURL", "Value": "${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }}"},
+              {"Key": "PoloRole", "Value": "testing"}
+            ]
 
   Test-Marqo:
     name: Run Unit Tests
@@ -80,6 +89,17 @@
           systemctl stop unattended-upgrades
           apt-get remove -y unattended-upgrades
 
+          # Function to wait for the dpkg lock to be released
+          function wait_for_dpkg_lock() {
+            while sudo fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do
+              echo "Waiting for the dpkg lock to be released..."
+              sleep 5
+            done
+          }
+
+          # Wait for the dpkg lock before updating and installing
+          wait_for_dpkg_lock
+
           echo "Updating package list"
           apt-get update -y
@@ -156,9 +176,9 @@
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v1
         with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-access-key-id: ${{ secrets.MARQO_WORKFLOW_TESTS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.MARQO_WORKFLOW_TESTS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@v2
         with:

From 5bd504ec8cb0293c90e1b9802a4e983dca1d26e0 Mon Sep 17 00:00:00 2001
From: Li Wan <49334982+wanliAlex@users.noreply.github.com>
Date: Fri, 18 Oct 2024 14:43:03 +1100
Subject: [PATCH 6/7] Change the arm64 instance (#1014)

---
 .github/workflows/arm64_docker_marqo.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/arm64_docker_marqo.yml b/.github/workflows/arm64_docker_marqo.yml
index 92060d75d..25118af1b 100644
--- a/.github/workflows/arm64_docker_marqo.yml
+++ b/.github/workflows/arm64_docker_marqo.yml
@@ -56,7 +56,7 @@ jobs:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
           ec2-image-id: ${{ vars.MARQO_CPU_ARM64_TESTS_INSTANCE_AMI }}
-          ec2-instance-type: m6i.xlarge
+          ec2-instance-type: r7g.large
           subnet-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SUBNET_ID }}
           security-group-id: ${{ secrets.MARQO_WORKFLOW_TESTS_SECURITY_GROUP_ID }}
           aws-resource-tags: > # optional, requires additional permissions

From a2c4ff4442f9e275376132721469151c309d73bd Mon Sep 17 00:00:00 2001
From: aditya bharadwaj
Date: Fri, 18 Oct 2024 09:52:38 +0530
Subject: [PATCH 7/7] Updating Developer Guide README.md to include Hybrid search specific documentation (#955)

---
 src/marqo/README.md | 43 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 9 deletions(-)

diff --git a/src/marqo/README.md b/src/marqo/README.md
index 1d0b1dbfa..f8dcaca86 100644
--- a/src/marqo/README.md
+++ b/src/marqo/README.md
@@ -7,7 +7,7 @@ See [here](https://github.com/marqo-ai/marqo/blob/mainline/CONTRIBUTING.md#unit-
 
 ## Running Marqo locally (outside of docker) for development
 
 There are two ways to run Marqo locally (outside of docker) for development: Option A. through your IDE (e.g., PyCharm),
-Option B. through your IDE (e.g. PyCharm).
+Option B. through `uvicorn`.
 We highly recommend using Option A, as it allows you to set breakpoints and debug Marqo.
 
 Before running Marqo locally, you will need to do some preparations to set up Vespa locally.
@@ -35,14 +35,39 @@ docker run --detach --name vespa --hostname vespa-tutorial \
 ```bash
 (cd scripts/vespa_local && zip -r - * | curl --header "Content-Type:application/zip" --data-binary @- http://localhost:19071/application/v2/tenant/default/prepareandactivate)
 ```
-
 You can verify that Vespa has been set up correctly by visiting `http://localhost:8080` in your browser.
 
+5. We need to install the Java Development Kit and Maven to build the JAR file for the custom searchers.
+
+Install Java Development Kit:
+
+You can install the Java Development Kit (JDK) by following the instructions [here](https://docs.oracle.com/en/java/javase/22/install/overview-jdk-installation.html).
+After installation, remember to set the `JAVA_HOME` environment variable and to add Java to the `PATH` environment variable.
+You can verify the installation by running the following command:
+```bash
+java -version
+```
+
+Install Maven:
+
+You can install Maven by following the instructions [here](https://maven.apache.org/install.html).
+As with Java, you need to add Maven to the `PATH` environment variable.
+Verify the Maven installation by running the following command:
+```bash
+mvn -version
+```
+
+6. Next, build the JAR file. To do that, go into the `vespa` directory in your local Marqo repository (`cd vespa`) and run
+```bash
+mvn clean package
+```
+After running this command, a `target` folder is created in the `vespa` directory containing a JAR file called `marqo-custom-searchers-deploy.jar`. This JAR file is used to deploy the custom searchers to Vespa.
+
 ### Option A. Run the Marqo application locally (outside of docker) through IDE
 
-Now you can run Marqo locally through your IDE (e.g. PyCharm) by following the steps below.
+Now you can run Marqo locally through your IDE by following the steps below.
 
-5. Open the Marqo project in your IDE (e.g. PyCharm) and go to the file `src/marqo/tensor_search/api.py`
-6. Set up your [debug configuration](https://www.jetbrains.com/help/pycharm/creating-run-debug-configuration-for-tests.html)
+7. Open the Marqo project in your IDE and go to the file `src/marqo/tensor_search/api.py`
+8. Set up your [debug configuration](https://www.jetbrains.com/help/pycharm/creating-run-debug-configuration-for-tests.html)
 to run `api.py` with the following environment variables:
 ```
 MARQO_ENABLE_BATCH_APIS=true
@@ -52,14 +77,14 @@ VESPA_CONFIG_URL=http://localhost:19071
 VESPA_DOCUMENT_URL=http://localhost:8080
 VESPA_QUERY_URL=http://localhost:8080
 ```
-7. Now you can Debug this file directly from your IDE (e.g. PyCharm) to start Marqo locally.
-8. Set breakpoints in the project for better debugging experience.
+9. Now you can debug this file directly from your IDE (e.g., PyCharm) to start Marqo locally.
+10. Set breakpoints in the project for a better debugging experience.
 
 ### Option B. Run the Marqo application locally (outside of docker) through `uvicorn`
 
 Finish the preparations above, then run the following commands:
 
-5. Set up the environment variables and run Marqo through `uvicorn`
+7. Set up the environment variables and run Marqo through `uvicorn`
 ```bash
 export MARQO_ENABLE_BATCH_APIS=true
 export MARQO_LOG_LEVEL=debug
@@ -126,7 +151,7 @@ Marqo outside Docker will rely on the system setup to use the GPU. If you can us
 
 #### Using Marqo within Docker
 
 Currently, only CUDA-based (Nvidia) GPUs are supported. If you have a GPU on the host machine and want to use it with Marqo, there are two things to do:
 1. Add the `--gpus all` flag to the `docker run` command. This flag is excluded from the commands above but will allow the GPUs to be used within Marqo. For example, in steps B., C., and D. above, `--gpus all` should be added after the
-`docker run --name marqo` part of the command, e.g. B. from above would become,
+`docker run --name marqo` part of the command, e.g., B. from above would become,
 ```bash
docker rm -f marqo && DOCKER_BUILDKIT=1 docker build . -t marqo_docker_0 &&