From f42a5d56a25b34c43823a35753b7dc2dd60a4fac Mon Sep 17 00:00:00 2001 From: Nicola Tarocco Date: Sat, 2 Sep 2023 10:38:27 +0200 Subject: [PATCH 1/2] uow: add ParentRecordCommitOp uow - adds a new uow to commit a parent record and bulk index all records and drafts --- .github/workflows/tests.yml | 25 +++-- invenio_drafts_resources/records/api.py | 38 +++++-- .../services/records/service.py | 29 +---- .../services/records/uow.py | 69 ++++++++++++ tests/records/test_api.py | 40 ++++++- tests/services/test_record_service.py | 32 ------ tests/services/test_records_uow.py | 101 ++++++++++++++++++ 7 files changed, 259 insertions(+), 75 deletions(-) create mode 100644 invenio_drafts_resources/services/records/uow.py create mode 100644 tests/services/test_records_uow.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 81d7efe..6ddce0a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -29,28 +29,39 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [3.8, 3.9] + python-version: [3.7, 3.8, 3.9] requirements-level: [pypi] - db-service: [postgresql11, postgresql13] + db-service: [postgresql12, postgresql14] search-service: [opensearch2, elasticsearch7] exclude: + - python-version: 3.7 + db-service: postgresql14 + search-service: opensearch2 + + - python-version: 3.7 + db-service: postgresql14 + search-service: elasticsearch7 + - python-version: 3.8 - db-service: postgresql11 + db-service: postgresql12 search-service: opensearch2 - python-version: 3.8 - db-service: postgresql11 + db-service: postgresql12 search-service: elasticsearch7 - python-version: 3.9 - db-service: postgresql11 + db-service: postgresql12 search-service: opensearch2 - python-version: 3.9 - db-service: postgresql11 + db-service: postgresql12 search-service: elasticsearch7 include: - - db-service: postgresql13 + - db-service: postgresql12 + DB_EXTRAS: "postgresql" + + - db-service: postgresql14 DB_EXTRAS: "postgresql" - search-service: opensearch2 diff --git a/invenio_drafts_resources/records/api.py b/invenio_drafts_resources/records/api.py index f03915a..2694008 100644 --- a/invenio_drafts_resources/records/api.py +++ b/invenio_drafts_resources/records/api.py @@ -90,16 +90,38 @@ class Record(RecordBase): versions = VersionsField(create=True, set_latest=True) @classmethod - def get_records_by_parent(cls, parent, include_deleted=True): + def get_records_by_parent(cls, parent, with_deleted=True, ids_only=False): """Get all sibling records for the specified parent record.""" - versions = cls.model_cls.query.filter_by(parent_id=parent.id) - if not include_deleted: - versions = versions.filter_by(is_deleted=False) + rec_models = cls.model_cls.query.filter_by(parent_id=parent.id) + if not with_deleted: + rec_models = rec_models.filter_by(is_deleted=False) - return ( - cls(rec_model.data, model=rec_model, parent=parent) - for rec_model in versions - ) + if ids_only: + return (rec_model.id for rec_model in rec_models) + else: + return ( + cls(rec_model.data, model=rec_model, parent=parent) + for rec_model in rec_models + ) + + @classmethod + def get_latest_by_parent(cls, parent, id_only=False): + """Get the latest record for the specified parent record. + + It might return None if there is no latest published version yet. + """ + version = cls.versions_model_cls.query.filter_by( + parent_id=parent.id + ).one_or_none() + has_latest = version and version.latest_id + if not has_latest: + return None + + rec_model = cls.model_cls.query.filter_by(id=version.latest_id).one() + if id_only: + return rec_model.id + else: + return cls(rec_model.data, model=rec_model, parent=parent) @classmethod def publish(cls, draft): diff --git a/invenio_drafts_resources/services/records/service.py b/invenio_drafts_resources/services/records/service.py index cb4562e..af65196 100644 --- a/invenio_drafts_resources/services/records/service.py +++ b/invenio_drafts_resources/services/records/service.py @@ -9,8 +9,6 @@ """Primary service for working with records and drafts.""" -import itertools - from flask import current_app from invenio_db import db from invenio_records_resources.services import LinksTemplate @@ -28,6 +26,8 @@ from sqlalchemy.orm.exc import NoResultFound from werkzeug.local import LocalProxy +from .uow import ParentRecordCommitOp + class RecordService(RecordServiceBase): """Record and draft service interface. @@ -291,7 +291,7 @@ def create(self, identity, data, uow=None, expand=False): uow=uow, expand=expand, ) - uow.register(RecordCommitOp(res._record.parent)) + uow.register(ParentRecordCommitOp(res._record.parent)) return res @unit_of_work() @@ -612,29 +612,6 @@ def _get_draft_and_parent_by_id(self, id_): return draft, parent - @unit_of_work() - def _index_related_records(self, record, parent, uow=None): - """Index all records that are related to the specified ones. - - Soft deleted records (including published drafts) will not be indexed - because the JSON payload is empty. - """ - _parent = parent or record.parent - siblings = self.record_cls.get_records_by_parent(_parent, include_deleted=False) - - if self.draft_cls is not None: - # if drafts are available, reindex them as well - drafts = self.draft_cls.get_records_by_parent( - _parent, include_deleted=False - ) - siblings = itertools.chain(siblings, drafts) - - # TODO only index the current record immediately; - # all siblings should be sent to a high-priority celery task - # instead (requires bulk indexing to work) - for sibling in siblings: - uow.register(RecordIndexOp(sibling, indexer=self.indexer)) - @unit_of_work() def cleanup_drafts(self, timedelta, uow=None, search_gc_deletes=60): """Hard delete of soft deleted drafts. diff --git a/invenio_drafts_resources/services/records/uow.py b/invenio_drafts_resources/services/records/uow.py new file mode 100644 index 0000000..163fdd7 --- /dev/null +++ b/invenio_drafts_resources/services/records/uow.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2023 CERN. +# +# Invenio-Drafts-Resources is free software; you can redistribute it and/or +# modify it under the terms of the MIT License; see LICENSE file for more +# details. + +"""Unit of work.""" + +from invenio_records_resources.services.uow import RecordCommitOp + + +class ParentRecordCommitOp(RecordCommitOp): + """Parent record commit operation, bulk indexing records and drafts.""" + + def __init__( + self, + parent, + indexer_context=None, + ): + """Initialize the parent record commit operation. + + :params indexer_context: a dict containing record/draft cls and indexers, + or service. Expected keys: `record_cls, draft_cls, indexer, draft_indexer`. + `None` if indexing disabled. + """ + super().__init__(parent, indexer=None) + self._indexer_context = indexer_context + if indexer_context: + _service = indexer_context.get("service", None) + if _service is not None: + self._record_cls = getattr(_service, "record_cls") + self._draft_cls = getattr(_service, "draft_cls") + self._record_indexer = getattr(_service, "indexer") + self._draft_indexer = getattr(_service, "draft_indexer") + else: + self._record_cls = indexer_context["record_cls"] + self._draft_cls = indexer_context["draft_cls"] + self._record_indexer = indexer_context["indexer"] + self._draft_indexer = indexer_context["draft_indexer"] + + def _get_siblings(self): + """Get all record and draft versions by parent. + + This operation might be slow when a record has a high number of versions and drafts. + However, given that often versions should be re-indexed as soon as possible when the parent + is committed, the fetching operation is done synchronously. + """ + records_ids = self._record_cls.get_records_by_parent( + self._record, with_deleted=False, ids_only=True + ) + drafts_ids = self._draft_cls.get_records_by_parent( + self._record, with_deleted=False, ids_only=True + ) + return records_ids, drafts_ids + + def on_commit(self, uow): + """No commit operation.""" + pass + + def on_post_commit(self, uow): + """Bulk index as last operation.""" + if self._indexer_context is not None: + records_ids, drafts_ids = self._get_siblings() + if records_ids: + self._record_indexer.bulk_index(records_ids) + if drafts_ids: + self._draft_indexer.bulk_index(drafts_ids) diff --git a/tests/records/test_api.py b/tests/records/test_api.py index 48f94c0..fa564a7 100644 --- a/tests/records/test_api.py +++ b/tests/records/test_api.py @@ -360,7 +360,7 @@ def test_get_records_by_parent(app, db, location): drafts = list(Draft.get_records_by_parent(parent)) assert len(drafts) == 1 assert id(parent) == id(drafts[0].parent) - drafts = list(Draft.get_records_by_parent(parent, include_deleted=False)) + drafts = list(Draft.get_records_by_parent(parent, with_deleted=False)) assert len(drafts) == 0 records = list(Record.get_records_by_parent(parent)) assert len(records) == 1 @@ -395,7 +395,7 @@ def test_get_records_by_parent(app, db, location): parent = record_v2.parent drafts = list(Draft.get_records_by_parent(parent)) assert len(drafts) == 2 - drafts = list(Draft.get_records_by_parent(parent, include_deleted=False)) + drafts = list(Draft.get_records_by_parent(parent, with_deleted=False)) assert len(drafts) == 0 records = list(Record.get_records_by_parent(parent)) assert len(records) == 2 @@ -406,3 +406,39 @@ def test_get_records_by_parent(app, db, location): assert state.latest_id == record_v2.id assert state.latest_index == 2 assert state.next_draft_id is None + + +# +# Get latest by parent +# +def test_get_latest_by_parent(app, db, location): + """Test get latest by parent.""" + draft_v1 = Draft.create({}) + db.session.commit() + parent = draft_v1.parent + + # nothing published yet + assert not Draft.get_latest_by_parent(parent) + assert not Record.get_latest_by_parent(parent) + + record_v1 = Record.publish(draft_v1) + draft_v1.delete() # simulate service `publish`, will soft delete drafts + db.session.commit() + parent = record_v1.parent + + assert not Draft.get_latest_by_parent(parent) + assert Record.get_latest_by_parent(parent).id == record_v1.id + + draft_v2 = Draft.new_version(record_v1) + draft_v2.commit() + db.session.commit() + + assert Draft.get_latest_by_parent(parent).id == record_v1.id + assert Record.get_latest_by_parent(parent).id == record_v1.id + + record_v2 = Record.publish(draft_v2) + draft_v2.delete() # simulate service `publish`, will soft delete drafts + db.session.commit() + + assert not Draft.get_latest_by_parent(parent) + assert Record.get_latest_by_parent(parent).id == record_v2.id diff --git a/tests/services/test_record_service.py b/tests/services/test_record_service.py index f0610e0..dcb4010 100644 --- a/tests/services/test_record_service.py +++ b/tests/services/test_record_service.py @@ -15,11 +15,8 @@ - Read with missing pid """ -from io import BytesIO - import pytest from invenio_db import db -from invenio_files_rest.errors import InvalidOperationError from invenio_pidstore.errors import PIDDoesNotExistError, PIDUnregistered from invenio_pidstore.models import PIDStatus from marshmallow.exceptions import ValidationError @@ -360,32 +357,3 @@ def test_read_latest_version(app, service, identity_simple, input_data): # Check that parent returns latest version latest = service.read_latest(identity_simple, parent_recid) assert latest["id"] == recid_2 - - -def test_reindexing_all_siblings(app, service, identity_simple, input_data): - """Test if reindexing sibling records includes drafts.""" - record = create_and_publish(service, identity_simple, input_data)._obj - recid = record["id"] - - # Edit the record, and make it somehow out of sync with the index - draft = service.edit(identity_simple, recid)._obj - draft.metadata["title"] = "Test (draft)" - draft.commit() - db.session.commit() - draft.index.refresh() - - # Make sure that the DB is correct, but the index is out of date - # (note: service.search_draft(...) would always give me 0 results here) - metadata = service.read_draft(identity_simple, recid).data["metadata"] - assert metadata["title"] == "Test (draft)" - hits = [hit for hit in service.draft_cls.index.search() if hit["id"] == recid] - assert hits - assert hits[0]["metadata"]["title"] == record.metadata["title"] - - # Reindex all sibling records and drafts - # and make sure that the draft is updated now - service._index_related_records(record, parent=record.parent) - draft.index.refresh() - hits = [hit for hit in service.draft_cls.index.search() if hit["id"] == recid] - assert hits - assert hits[0]["metadata"]["title"] == draft.metadata["title"] diff --git a/tests/services/test_records_uow.py b/tests/services/test_records_uow.py new file mode 100644 index 0000000..4617559 --- /dev/null +++ b/tests/services/test_records_uow.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2020-2023 CERN. +# +# Invenio-Drafts-Resources is free software; you can redistribute it and/or +# modify it under the terms of the MIT License; see LICENSE file for more +# details. + +"""UOW tests.""" + +from unittest.mock import MagicMock + +import pytest +from invenio_records_resources.services.uow import UnitOfWork + +from invenio_drafts_resources.services.records.uow import ParentRecordCommitOp + +from .utils import create_and_publish + + +def test_parent_record_uow_params(app, db, service): + """Test constructors params.""" + # test that it does not raise when params are passed correctly + assert ParentRecordCommitOp({}, dict(service=service)) + assert ParentRecordCommitOp( + {}, + indexer_context=dict( + record_cls=service.record_cls, + draft_cls=service.draft_cls, + indexer=MagicMock(), + draft_indexer=MagicMock(), + ), + ) + + # test that it raises when one of the expected attr of the service is missing + for missing_attr in ["record_cls", "draft_cls", "indexer", "draft_indexer"]: + _service = type( + "service", + (object,), + dict( + record_cls=service.record_cls, + draft_cls=service.draft_cls, + indexer=service.indexer, + draft_indexer=service.draft_indexer, + ), + ) + with pytest.raises(AttributeError): + delattr(_service, missing_attr) + assert ParentRecordCommitOp({}, indexer_context=dict(service=_service)) + + # test that it raises when one of the expected constructor params is missing + for missing_param in [ + "record_cls", + "draft_cls", + "indexer", + "draft_indexer", + ]: + params = dict( + record_cls=service.record_cls, + draft_cls=service.draft_cls, + indexer=service.indexer, + draft_indexer=service.draft_indexer, + ) + with pytest.raises(KeyError): + params.pop(missing_param) + assert ParentRecordCommitOp({}, indexer_context=dict(**params)) + + +def test_parent_record_commit(app, db, service, identity_simple, input_data): + """Test ParentRecordCommit unit of work.""" + record_v1 = create_and_publish(service, identity_simple, input_data) + recid = record_v1.id + draft_v1 = service.edit(identity_simple, recid) + draft_v2 = service.new_version(identity_simple, recid) + record_v2 = service.publish(identity_simple, draft_v2.id) + + parent = record_v1._record.parent + + indexer = MagicMock() + draft_indexer = MagicMock() + + with UnitOfWork(db.session) as uow: + op = ParentRecordCommitOp( + parent, + indexer_context=dict( + record_cls=service.record_cls, + draft_cls=service.draft_cls, + indexer=indexer, + draft_indexer=draft_indexer, + ), + ) + uow.register(op) + uow.commit() + + assert indexer.bulk_index.call_count == 1 + args, _ = indexer.bulk_index.call_args + assert list(args[0]) == [record_v1._record.id, record_v2._record.id] + + assert draft_indexer.bulk_index.call_count == 1 + args, _ = draft_indexer.bulk_index.call_args + assert list(args[0]) == [draft_v1._record.id] From 0069b399311fbf52282e62c9d5f14959ead4fd15 Mon Sep 17 00:00:00 2001 From: Zacharias Zacharodimos Date: Tue, 5 Sep 2023 23:38:40 +0200 Subject: [PATCH 2/2] release: v1.10.0 --- .github/workflows/tests.yml | 14 +------------- CHANGES.rst | 4 ++++ invenio_drafts_resources/__init__.py | 2 +- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6ddce0a..65c10f3 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -29,19 +29,11 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [3.7, 3.8, 3.9] + python-version: [3.8, 3.9] requirements-level: [pypi] db-service: [postgresql12, postgresql14] search-service: [opensearch2, elasticsearch7] exclude: - - python-version: 3.7 - db-service: postgresql14 - search-service: opensearch2 - - - python-version: 3.7 - db-service: postgresql14 - search-service: elasticsearch7 - - python-version: 3.8 db-service: postgresql12 search-service: opensearch2 @@ -58,9 +50,6 @@ jobs: db-service: postgresql12 search-service: elasticsearch7 include: - - db-service: postgresql12 - DB_EXTRAS: "postgresql" - - db-service: postgresql14 DB_EXTRAS: "postgresql" @@ -69,7 +58,6 @@ jobs: - search-service: elasticsearch7 SEARCH_EXTRAS: "elasticsearch7" - env: DB: ${{ matrix.db-service }} SEARCH: ${{ matrix.search-service }} diff --git a/CHANGES.rst b/CHANGES.rst index 695506e..0b4ec98 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -9,6 +9,10 @@ Changes ======= +Version 1.10.0 (2023-09-05) + +- uow: add ParentRecordCommitOp uow + Version 1.9.0 (2023-09-05) - service: allow passing of 'extra_filter' via kwargs to searches diff --git a/invenio_drafts_resources/__init__.py b/invenio_drafts_resources/__init__.py index 38a5a36..c870f2d 100644 --- a/invenio_drafts_resources/__init__.py +++ b/invenio_drafts_resources/__init__.py @@ -10,6 +10,6 @@ """Invenio Drafts Resources module to create REST APIs.""" -__version__ = "1.9.0" +__version__ = "1.10.0" __all__ = ("__version__",)