Skip to content

Commit

Permalink
tasks: add margin time window for search to purge deleted documents
Browse files Browse the repository at this point in the history
  • Loading branch information
zzacharo committed Aug 25, 2023
1 parent 5740a61 commit 9e51951
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 11 deletions.
15 changes: 11 additions & 4 deletions invenio_drafts_resources/records/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"""

import uuid
from datetime import datetime
from datetime import datetime, timedelta

from invenio_pidstore.models import PIDStatus
from invenio_pidstore.providers.recordid_v2 import RecordIdProviderV2
Expand Down Expand Up @@ -232,12 +232,19 @@ def edit(cls, record):
return draft

@classmethod
def cleanup_drafts(cls, td):
def cleanup_drafts(cls, td, search_gc_deletes=60):
"""Clean up (hard delete) all the soft deleted drafts.
The drafts in the last timedelta span of time won't be deleted.
The soft-deleted drafts in the last timedelta span of time won't be deleted,
including `search_gc_deletes` seconds timedelta. This ensures that only
drafts fully removed from the search cluster can be hard-deleted (e.g. when
`td` is very short), avoiding search conflicts.
:param int search_gc_deletes: time in seconds, corresponding to the search cluster
setting `index.gc_deletes` (see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-delete.html#delete-versioning),
default to 60 seconds. Search cluster caches deleted documents for `index.gc_deletes` seconds.
"""
timestamp = datetime.utcnow() - td
timestamp = datetime.utcnow() - td - timedelta(seconds=search_gc_deletes)
draft_model = cls.model_cls
models = draft_model.query.filter(
draft_model.is_deleted == True, # noqa
Expand Down
7 changes: 5 additions & 2 deletions invenio_drafts_resources/services/records/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -630,10 +630,13 @@ def _index_related_records(self, record, parent, uow=None):
uow.register(RecordIndexOp(sibling, indexer=self.indexer))

@unit_of_work()
def cleanup_drafts(self, timedelta, uow=None):
def cleanup_drafts(self, timedelta, uow=None, search_gc_deletes=60):
"""Hard delete of soft deleted drafts.
:param int timedelta: timedelta that should pass since
the last update of the draft in order to be hard deleted.
:param int search_gc_deletes: time in seconds, corresponding to the search cluster
setting `index.gc_deletes` (see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-delete.html#delete-versioning),
default to 60 seconds. Search cluster caches deleted documents for `index.gc_deletes` seconds.
"""
self.draft_cls.cleanup_drafts(timedelta)
self.draft_cls.cleanup_drafts(timedelta, search_gc_deletes=search_gc_deletes)
7 changes: 5 additions & 2 deletions invenio_drafts_resources/services/records/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,15 @@


@shared_task(ignore_result=True)
def cleanup_drafts(seconds=3600):
def cleanup_drafts(seconds=3600, search_gc_deletes=60):
"""Hard delete of soft deleted drafts.
:param int seconds: numbers of seconds that should pass since the
last update of the draft in order to be hard deleted.
:param int search_gc_deletes: time in seconds, corresponding to the search cluster
setting `index.gc_deletes` (see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-delete.html#delete-versioning),
default to 60 seconds. Search cluster caches deleted documents for `index.gc_deletes` seconds.
"""
timedelta_param = timedelta(seconds=seconds)
service = current_service_registry.get("records")
service.cleanup_drafts(timedelta_param)
service.cleanup_drafts(timedelta_param, search_gc_deletes=search_gc_deletes)
2 changes: 1 addition & 1 deletion tests/services/test_record_service_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_hard_delete_soft_deleted_task(app, service, identity_simple, input_data
assert (
len(draft_model.query.filter(draft_model.is_deleted == True).all()) == 1 # noqa
)
cleanup_drafts(seconds=0)
cleanup_drafts(seconds=0, search_gc_deletes=0)

assert (
len(draft_model.query.filter(draft_model.is_deleted == True).all()) == 0 # noqa
Expand Down
4 changes: 2 additions & 2 deletions tests/services/test_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_hard_delete_soft_deleted(app, service, identity_simple, input_data):
assert (
len(draft_model.query.filter(draft_model.is_deleted == True).all()) == 1 # noqa
)
service.cleanup_drafts(timedelta(seconds=0))
service.cleanup_drafts(timedelta(seconds=0), search_gc_deletes=0)

assert (
len(draft_model.query.filter(draft_model.is_deleted == True).all()) == 0 # noqa
Expand All @@ -55,7 +55,7 @@ def test_hard_delete_soft_deleted_not_enough_time(
assert (
len(draft_model.query.filter(draft_model.is_deleted == True).all()) == 1 # noqa
)
service.cleanup_drafts(timedelta(seconds=10))
service.cleanup_drafts(timedelta(seconds=10), search_gc_deletes=0)

assert (
len(draft_model.query.filter(draft_model.is_deleted == True).all()) == 1 # noqa
Expand Down

0 comments on commit 9e51951

Please sign in to comment.