inveniosoftware · kpsherva · Oct 10, 2024 · Aug 23, 2024 · Sep 10, 2024 · Sep 11, 2024
diff --git a/invenio_vocabularies/contrib/common/ror/datastreams.py b/invenio_vocabularies/contrib/common/ror/datastreams.py
@@ -21,7 +21,11 @@
 
 
 class RORHTTPReader(BaseReader):
-    """ROR HTTP Reader returning an in-memory binary stream of the latest ROR data dump ZIP file."""
+    """ROR HTTP Reader.
+
+    Returns an in-memory binary stream
+    of the latest ROR data dump ZIP file.
+    """
 
     def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
         """Constructor."""
@@ -30,7 +34,8 @@ def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
 
     def _iter(self, fp, *args, **kwargs):
         raise NotImplementedError(
-            "RORHTTPReader downloads one file and therefore does not iterate through items"
+            "RORHTTPReader downloads one file "
+            "and therefore does not iterate through items"
         )
 
     def _get_last_dump_date(self, linksets):
@@ -53,11 +58,16 @@ def _get_last_dump_date(self, linksets):
                     return last_dump_date
         else:
             raise ReaderError(
-                "Couldn't find JSON-LD in publisher's linkset to determine last dump date."
+                "Couldn't find JSON-LD in publisher's linkset "
+                "to determine last dump date."
             )
 
     def read(self, item=None, *args, **kwargs):
-        """Reads the latest ROR data dump ZIP file from Zenodo and yields an in-memory binary stream of it."""
+        """Reads the latest ROR data dump.
+
+        Downloads the ZIP file from
+        Zenodo and yields an in-memory binary stream of it.
+        """
         if item:
             raise NotImplementedError(
                 "RORHTTPReader does not support being chained after another reader"
@@ -68,7 +78,8 @@ def read(self, item=None, *args, **kwargs):
         landing_page = requests.get(dataset_doi_link, allow_redirects=True)
         landing_page.raise_for_status()
 
-        # Call the signposting `linkset+json` endpoint for the Concept DOI (i.e. latest version) of the ROR data dump.
+        # Call the signposting `linkset+json` endpoint for
+        # the Concept DOI (i.e. latest version) of the ROR data dump.
         # See: https://github.com/inveniosoftware/rfcs/blob/master/rfcs/rdm-0071-signposting.md#provide-an-applicationlinksetjson-endpoint
         if "linkset" not in landing_page.links:
             raise ReaderError("Linkset not found in the ROR dataset record.")
@@ -94,8 +105,10 @@ def read(self, item=None, *args, **kwargs):
                 raise ReaderError(f"Expected 1 ZIP item but got {len(zip_files)}")
 
         # Download the ZIP file and fully load the response bytes content in memory.
-        # The bytes content are then wrapped by a BytesIO to be file-like object (as required by `zipfile.ZipFile`).
-        # Using directly `file_resp.raw` is not possible since `zipfile.ZipFile` requires the file-like object to be seekable.
+        # The bytes content is then wrapped in a BytesIO to get a
+        # file-like object (as required by `zipfile.ZipFile`).
+        # Using `file_resp.raw` directly is not possible since
+        # `zipfile.ZipFile` requires the file-like object to be seekable.
         file_resp = requests.get(file_url)
         file_resp.raise_for_status()
         yield io.BytesIO(file_resp.content)

diff --git a/invenio_vocabularies/jobs.py b/invenio_vocabularies/jobs.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2021-2022 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""Jobs module."""
+
+import datetime
+from datetime import timezone
+
+from invenio_i18n import gettext as _
+from invenio_jobs.jobs import JobType
+from marshmallow import Schema, fields
+from marshmallow_utils.fields import TZDateTime
+
+from invenio_vocabularies.services.tasks import process_datastream
+
+
+class ArgsSchema(Schema):
+    """Input arguments accepted by the datastream job tasks."""
+
+    since = TZDateTime(  # datetime field declared with ISO format and UTC
+        format="iso",
+        timezone=timezone.utc,
+        metadata={
+            "description": _(
+                "YYYY-MM-DD HH:mm format. "
+                "Leave field empty if it should continue since last successful run."
+            )
+        },
+    )
+    job_arg_schema = fields.String(  # hidden; defaults to this schema's name
+        load_default="ArgsSchema",
+        dump_default="ArgsSchema",
+        metadata={"type": "hidden"},
+    )
+
+
+class ProcessDataStreamJob(JobType):
+    """Base job type running ``process_datastream`` with ``ArgsSchema`` args."""
+
+    id = None  # no identifier at this generic level
+    task = process_datastream
+    arguments_schema = ArgsSchema
+
+
+class ProcessRORAffiliationsJob(ProcessDataStreamJob):
+    """Job type that loads ROR affiliations through a datastream."""
+
+    description = "Process ROR affiliations"
+    title = "Load ROR affiliations"
+    id = "process_ror_affiliations"
+
+    @classmethod
+    def default_args(cls, job_obj, since=None, **kwargs):
+        """Build the default datastream configuration for a job run.
+
+        :param job_obj: job whose ``last_runs["success"]`` is looked up.
+        :param since: explicit start datetime; used as given when not None.
+        :returns: dict holding the readers, writers and transformers config.
+        """
+        # Derive ``since`` only when the caller passed none, so an explicit
+        # value is never overwritten with the current time.
+        if since is None:
+            last_ok = job_obj.last_runs["success"]
+            # Without a successful run the current local time is used.
+            since = last_ok.started_at if last_ok else datetime.datetime.now()
+
+        affiliations_writer = {
+            "type": "affiliations-service",
+            "args": {"update": True},
+        }
+        return {
+            "config": {
+                "readers": [
+                    {"args": {"since": since}, "type": "ror-http"},
+                    {"args": {"regex": "_schema_v2\\.json$"}, "type": "zip"},
+                    {"type": "json"},
+                ],
+                "writers": [
+                    {"args": {"writer": affiliations_writer}, "type": "async"}
+                ],
+                "transformers": [{"type": "ror-affiliations"}],
+            }
+        }
diff --git a/setup.cfg b/setup.cfg
@@ -30,6 +30,7 @@ install_requires =
     invenio-i18n>=2.0.0,<3.0.0
     invenio-records-resources>=6.0.0,<7.0.0
     invenio-administration>=2.0.0,<3.0.0
+    invenio-jobs>=1.0.0,<2.0.0
     lxml>=4.5.0
     PyYAML>=5.4.1
     regex>=2024.7.24
@@ -110,7 +111,8 @@ invenio_i18n.translations =
     invenio_vocabularies = invenio_vocabularies
 invenio_celery.tasks =
     invenio_vocabularies_services = invenio_vocabularies.services.tasks
-
+invenio_jobs.jobs =
+    process_ror_affiliations = invenio_vocabularies.jobs:ProcessRORAffiliationsJob
 
 [build_sphinx]
 source-dir = docs/