diff --git a/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py b/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py index cf3afefeb95..34a3e0a8864 100644 --- a/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py +++ b/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py @@ -31,6 +31,7 @@ def __init__(self, index: DocumentIndex): def delete_single( self, doc_id: str, + *, tenant_id: str | None, chunk_count: int | None, ) -> int: @@ -48,11 +49,11 @@ def delete_single( def update_single( self, doc_id: str, + *, tenant_id: str | None, chunk_count: int | None, fields: VespaDocumentFields, ) -> int: - print("Update single") return self.index.update_single( doc_id, tenant_id=tenant_id, diff --git a/backend/onyx/document_index/interfaces.py b/backend/onyx/document_index/interfaces.py index 85bfb7ae3d6..08dbfdc9efb 100644 --- a/backend/onyx/document_index/interfaces.py +++ b/backend/onyx/document_index/interfaces.py @@ -136,7 +136,7 @@ def __init__( index_name: str, secondary_index_name: str | None, *args: Any, - **kwargs: Any + **kwargs: Any, ) -> None: super().__init__(*args, **kwargs) self.index_name = index_name @@ -221,6 +221,7 @@ class Deletable(abc.ABC): def delete_single( self, doc_id: str, + *, tenant_id: str | None, chunk_count: int | None, ) -> int: @@ -247,8 +248,9 @@ class Updatable(abc.ABC): def update_single( self, doc_id: str, - chunk_count: int | None, + *, tenant_id: str | None, + chunk_count: int | None, fields: VespaDocumentFields, ) -> int: """ @@ -269,7 +271,7 @@ def update_single( @abc.abstractmethod def update( - self, update_requests: list[UpdateRequest], tenant_id: str | None + self, update_requests: list[UpdateRequest], *, tenant_id: str | None ) -> None: """ Updates some set of chunks. The document and fields to update are specified in the update diff --git a/backend/onyx/document_index/vespa/index.py b/backend/onyx/document_index/vespa/index.py index 4dc42a9671c..78cfea85656 100644 --- a/backend/onyx/document_index/vespa/index.py +++ b/backend/onyx/document_index/vespa/index.py @@ -433,7 +433,7 @@ def _update_chunk( raise requests.HTTPError(failure_msg) from e def update( - self, update_requests: list[UpdateRequest], tenant_id: str | None + self, update_requests: list[UpdateRequest], *, tenant_id: str | None ) -> None: logger.debug(f"Updating {len(update_requests)} documents in Vespa") @@ -561,6 +561,7 @@ def update_single_chunk( def update_single( self, doc_id: str, + *, chunk_count: int | None, tenant_id: str | None, fields: VespaDocumentFields, @@ -610,6 +611,7 @@ def update_single( def delete_single( self, doc_id: str, + *, tenant_id: str | None, chunk_count: int | None, ) -> int: @@ -795,7 +797,12 @@ def enrich_basic_chunk_info( return enriched_doc_info @classmethod - def delete_entries_by_tenant_id(cls, tenant_id: str, index_name: str) -> None: + def delete_entries_by_tenant_id( + cls, + *, + tenant_id: str, + index_name: str, + ) -> None: """ Deletes all entries in the specified index with the given tenant_id. diff --git a/backend/scripts/force_delete_connector_by_id.py b/backend/scripts/force_delete_connector_by_id.py index 3f0bf01ad61..39b98b9bcb8 100755 --- a/backend/scripts/force_delete_connector_by_id.py +++ b/backend/scripts/force_delete_connector_by_id.py @@ -5,6 +5,7 @@ from sqlalchemy import delete from sqlalchemy.orm import Session +from onyx.db.document import delete_documents_complete__no_commit from onyx.db.enums import ConnectorCredentialPairStatus # Modify sys.path @@ -70,14 +71,17 @@ def _unsafe_deletion( if not documents: break - # document_ids = [document.id for document in documents] - # for doc_id in document_ids: - # document_index.delete_single(doc_id) + for document in documents: + document_index.delete_single( + doc_id=document.id, + tenant_id=None, + chunk_count=document.chunk_count, + ) - # delete_documents_complete__no_commit( - # db_session=db_session, - # document_ids=document_ids, - # ) + delete_documents_complete__no_commit( + db_session=db_session, + document_ids=[document.id for document in documents], + ) num_docs_deleted += len(documents) @@ -215,6 +219,7 @@ def _delete_connector(cc_pair_id: int, db_session: Session) -> None: parser.add_argument( "connector_id", type=int, help="The ID of the connector to delete" ) + args = parser.parse_args() with get_session_context_manager() as db_session: _delete_connector(args.connector_id, db_session) diff --git a/backend/scripts/orphan_doc_cleanup_script.py b/backend/scripts/orphan_doc_cleanup_script.py index 6d404f7f52e..baf52d93b05 100644 --- a/backend/scripts/orphan_doc_cleanup_script.py +++ b/backend/scripts/orphan_doc_cleanup_script.py @@ -15,6 +15,7 @@ from onyx.db.document import delete_documents_complete__no_commit # noqa: E402 from onyx.db.search_settings import get_current_search_settings # noqa: E402 from onyx.document_index.vespa.index import VespaIndex # noqa: E402 +from onyx.db.document import get_document # noqa: E402 BATCH_SIZE = 100 @@ -63,6 +64,10 @@ def main() -> None: with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor: def process_doc(doc_id: str) -> str | None: + document = get_document(doc_id, db_session) + if not document: + print(f"Document {doc_id} not found in Postgres") + return None # Check if document exists in Vespa first try: chunks = vespa_index.id_based_retrieval( @@ -83,11 +88,13 @@ def process_doc(doc_id: str) -> str | None: try: print(f"Deleting document {doc_id} in Vespa") - # chunks_deleted = vespa_index.delete_single(doc_id) - # if chunks_deleted > 0: - # print( - # f"Deleted {chunks_deleted} chunks for document {doc_id}" - # ) + chunks_deleted = vespa_index.delete_single( + doc_id, tenant_id=None, chunk_count=document.chunk_count + ) + if chunks_deleted > 0: + print( + f"Deleted {chunks_deleted} chunks for document {doc_id}" + ) return doc_id except Exception as e: print(