Skip to content

Commit

Permalink
clean up args
Browse files Browse the repository at this point in the history
  • Loading branch information
pablonyx committed Jan 7, 2025
1 parent 6d5b69e commit 9875d3a
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__(self, index: DocumentIndex):
def delete_single(
self,
doc_id: str,
*,
tenant_id: str | None,
chunk_count: int | None,
) -> int:
Expand All @@ -48,11 +49,11 @@ def delete_single(
def update_single(
self,
doc_id: str,
*,
tenant_id: str | None,
chunk_count: int | None,
fields: VespaDocumentFields,
) -> int:
print("Update single")
return self.index.update_single(
doc_id,
tenant_id=tenant_id,
Expand Down
8 changes: 5 additions & 3 deletions backend/onyx/document_index/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def __init__(
index_name: str,
secondary_index_name: str | None,
*args: Any,
**kwargs: Any
**kwargs: Any,
) -> None:
super().__init__(*args, **kwargs)
self.index_name = index_name
Expand Down Expand Up @@ -221,6 +221,7 @@ class Deletable(abc.ABC):
def delete_single(
self,
doc_id: str,
*,
tenant_id: str | None,
chunk_count: int | None,
) -> int:
Expand All @@ -247,8 +248,9 @@ class Updatable(abc.ABC):
def update_single(
self,
doc_id: str,
chunk_count: int | None,
*,
tenant_id: str | None,
chunk_count: int | None,
fields: VespaDocumentFields,
) -> int:
"""
Expand All @@ -269,7 +271,7 @@ def update_single(

@abc.abstractmethod
def update(
self, update_requests: list[UpdateRequest], tenant_id: str | None
self, update_requests: list[UpdateRequest], *, tenant_id: str | None
) -> None:
"""
Updates some set of chunks. The document and fields to update are specified in the update
Expand Down
11 changes: 9 additions & 2 deletions backend/onyx/document_index/vespa/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,7 @@ def _update_chunk(
raise requests.HTTPError(failure_msg) from e

def update(
self, update_requests: list[UpdateRequest], tenant_id: str | None
self, update_requests: list[UpdateRequest], *, tenant_id: str | None
) -> None:
logger.debug(f"Updating {len(update_requests)} documents in Vespa")

Expand Down Expand Up @@ -561,6 +561,7 @@ def update_single_chunk(
def update_single(
self,
doc_id: str,
*,
chunk_count: int | None,
tenant_id: str | None,
fields: VespaDocumentFields,
Expand Down Expand Up @@ -610,6 +611,7 @@ def update_single(
def delete_single(
self,
doc_id: str,
*,
tenant_id: str | None,
chunk_count: int | None,
) -> int:
Expand Down Expand Up @@ -795,7 +797,12 @@ def enrich_basic_chunk_info(
return enriched_doc_info

@classmethod
def delete_entries_by_tenant_id(cls, tenant_id: str, index_name: str) -> None:
def delete_entries_by_tenant_id(
cls,
*,
tenant_id: str,
index_name: str,
) -> None:
"""
Deletes all entries in the specified index with the given tenant_id.
Expand Down
19 changes: 12 additions & 7 deletions backend/scripts/force_delete_connector_by_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from sqlalchemy import delete
from sqlalchemy.orm import Session

from onyx.db.document import delete_documents_complete__no_commit
from onyx.db.enums import ConnectorCredentialPairStatus

# Modify sys.path
Expand Down Expand Up @@ -70,14 +71,17 @@ def _unsafe_deletion(
if not documents:
break

# document_ids = [document.id for document in documents]
# for doc_id in document_ids:
# document_index.delete_single(doc_id)
for document in documents:
document_index.delete_single(
doc_id=document.id,
tenant_id=None,
chunk_count=document.chunk_count,
)

# delete_documents_complete__no_commit(
# db_session=db_session,
# document_ids=document_ids,
# )
delete_documents_complete__no_commit(
db_session=db_session,
document_ids=[document.id for document in documents],
)

num_docs_deleted += len(documents)

Expand Down Expand Up @@ -215,6 +219,7 @@ def _delete_connector(cc_pair_id: int, db_session: Session) -> None:
parser.add_argument(
"connector_id", type=int, help="The ID of the connector to delete"
)

args = parser.parse_args()
with get_session_context_manager() as db_session:
_delete_connector(args.connector_id, db_session)
17 changes: 12 additions & 5 deletions backend/scripts/orphan_doc_cleanup_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from onyx.db.document import delete_documents_complete__no_commit # noqa: E402
from onyx.db.search_settings import get_current_search_settings # noqa: E402
from onyx.document_index.vespa.index import VespaIndex # noqa: E402
from onyx.db.document import get_document # noqa: E402

BATCH_SIZE = 100

Expand Down Expand Up @@ -63,6 +64,10 @@ def main() -> None:
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:

def process_doc(doc_id: str) -> str | None:
document = get_document(doc_id, db_session)
if not document:
print(f"Document {doc_id} not found in Postgres")
return None
# Check if document exists in Vespa first
try:
chunks = vespa_index.id_based_retrieval(
Expand All @@ -83,11 +88,13 @@ def process_doc(doc_id: str) -> str | None:

try:
print(f"Deleting document {doc_id} in Vespa")
# chunks_deleted = vespa_index.delete_single(doc_id)
# if chunks_deleted > 0:
# print(
# f"Deleted {chunks_deleted} chunks for document {doc_id}"
# )
chunks_deleted = vespa_index.delete_single(
doc_id, tenant_id=None, chunk_count=document.chunk_count
)
if chunks_deleted > 0:
print(
f"Deleted {chunks_deleted} chunks for document {doc_id}"
)
return doc_id
except Exception as e:
print(
Expand Down

0 comments on commit 9875d3a

Please sign in to comment.