Skip to content

Commit

Permalink
rules: implement more fields
Browse files Browse the repository at this point in the history
  • Loading branch information
kpsherva committed Oct 24, 2024
1 parent c551e0e commit ebc1938
Show file tree
Hide file tree
Showing 10 changed files with 202 additions and 155 deletions.
7 changes: 6 additions & 1 deletion cds_migrator_kit/migration_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ def _(x): # needed to avoid start time failure with lazy strings
CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir
CDS_MIGRATOR_KIT_STREAM_CONFIG = "cds_migrator_kit/rdm/migration/streams.yaml"

from invenio_rdm_records.config import RDM_RECORDS_IDENTIFIERS_SCHEMES, always_valid
from invenio_rdm_records.config import RDM_RECORDS_IDENTIFIERS_SCHEMES, always_valid, RDM_RECORDS_PERSONORG_SCHEMES
from cds_rdm import schemes

RDM_RECORDS_IDENTIFIERS_SCHEMES = {**RDM_RECORDS_IDENTIFIERS_SCHEMES,
Expand All @@ -402,6 +402,11 @@ def _(x): # needed to avoid start time failure with lazy strings
"validator": schemes.is_inspire,
"datacite": "INSPIRE"}}}

# Extend InvenioRDM's person/organisation identifier schemes with the
# "inspire" author-id scheme, validated by cds_rdm's is_inspire check
# (mirrors the "inspire" entry added to RDM_RECORDS_IDENTIFIERS_SCHEMES above).
RDM_RECORDS_PERSONORG_SCHEMES = {**RDM_RECORDS_PERSONORG_SCHEMES,
**{"inspire": {"label": _("Inspire"),
"validator": schemes.is_inspire,
"datacite": "INSPIRE"}}}


CDS_MIGRATOR_KIT_RECORD_STATS_STREAM_CONFIG = dict(
####### Search ##############
Expand Down
12 changes: 5 additions & 7 deletions cds_migrator_kit/rdm/migration/load/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,21 +154,19 @@ def _load_versions(self, entry, logger):
"""Load other versions of the record."""
versions = entry["versions"]
legacy_recid = entry["record"]["recid"]
record = entry["record"]
parent = entry["parent"]

def publish_and_mint_recid(draft, version):
    """Publish ``draft`` and return the published record item.

    For the first version only, the original creation date of the legacy
    record is restored on the model and the legacy recid is minted on the
    parent so old CDS URLs redirect to the new record.

    NOTE(review): the scraped diff carried both the pre- and post-change
    lines (``record`` vs ``record_item``), which would have published the
    draft twice; only the post-change version is kept here.
    """
    record_item = current_rdm_records_service.publish(system_identity, draft["id"])
    # mint legacy ids for redirections
    if version == 1:
        # keep the creation date of the original legacy record
        record_item._record.model.created = arrow.get(
            entry["record"]["created"]
        ).datetime
        record_item._record.commit()
        # it seems more intuitive if we mint the lrecid for parent
        # but then we get a double redirection
        legacy_recid_minter(legacy_recid, record_item._record.parent.model.id)
    return record_item

identity = system_identity # TODO: load users instead ?

Expand Down
2 changes: 1 addition & 1 deletion cds_migrator_kit/rdm/migration/streams.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ records:
transform:
files_dump_dir: cds_migrator_kit/rdm/migration/data/summer_student_reports/files/
missing_users: cds_migrator_kit/rdm/migration/data/users
community_id: 94ae20a9-57f1-4a4e-bcac-4ef802b88e5d
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,17 @@ class CMSSummerStudent(CdsOverdo):

__ignore_keys__ = {
# decided to ignore
# "0247_2", # DOI, summer student notes do not have it
# "0247_a", # DOI
"0248_a", # oai identifier, not needed to migrate, TBD
"0248_p", # oai identifier, not needed to migrate, TBD
"0248_q", # does appear in data, what is this field recid 2778897
"0248_q", # full text tag 2778897
"100__m", # author's email <-- decided not to keep in RDM,
"260__c", # Redundant (more detailed value is in 269__c imprint.pub_date)
"270__m", # document contact email
"595__a", # always value CERN EDS, not displayed, TODO: do we keep?
"595__z", # SOME RECORD HAVE UNCL as value, do we keep it? what does UNCL mean
"700__m", # author's email <-- decided not to keep in RDM,
"710__5", # department / organisation author
"710__a", # organisation author
"700__m", # Contributors (email)
"700__m", # author's email <-- decided not to keep in RDM,
"8564_8", # Files system field
"8564_s", # Files system field
"8564_u", # Files
Expand All @@ -56,22 +53,18 @@ class CMSSummerStudent(CdsOverdo):
"937__s", # modification person
"960__a", # collection id? usually value 12, to confirm if we ignore
"980__a", # collection tag
# "980__c", # MIGRATED/DELETED - it shouldn't even make it here

# TO Implement (to remove from here)
"690C_a", # collection name, not needed values(to drop: INTNOTE, CERN, otherwise parse PUBL to retrieve the department, if department not present in any other field)
# "562__c", # note
# "700__0", # Contributors (cds author id) - TBD if we keep, same with INSPIRE ID
# "693__b", # beams recid: 2640381

# TO DECIDE
# IMPLEMENTED
# "001"
# "003"
# "035__9", # Inspire schema
# "035__a", # Inspire id value
# "037__a", # (Report number) alternative identifiers -> scheme "CDS REFERENCE"
# "041__a", # languages
# "088__a", # RN (manual introduced?) second report number (so the identifiers schemas are not unique!)
# "100__9", # #BEARD# tag
# "100__a",
# "100__u", # Author affiliation
# "100__9", # #BEARD# tag
# "246__a",
# "246__i", # abbreviation
# "246__i", # abbreviation tag, applies to value of 246__A
Expand All @@ -83,30 +76,32 @@ class CMSSummerStudent(CdsOverdo):
# "65017a", # subject value
# "6531_9", # keyword provenance
# "6531_a", # keyword
# "690C_a", # collection name, not needed values(to drop: INTNOTE, CERN, otherwise parse PUBL to retrieve the department, if department not present in any other field)
# "6931_9", # keyword
# "6931_a", # keyword
# "693__a", # accelerator, do we create a custom field?
# "693__b", # beams recid: 2640381
# "693__e", # custom_fields.cern:experiments
# "693__f", # facility, do we create a custom field?
# "693__p", # project, do we create a custom field?
# "693__s", # study, do we create a custom field?
# "700__0", # Contributors (cds author id) - TBD if we keep, same with INSPIRE ID
# "700__9", # #BEARD# tag
# "700__a", # Contributors (full name)
# "700__u", # Contributors (affiliation)
# "710__g", # Collaboration, OK to migrate as corporate contributor (not creator)?
# "700__9", # #BEARD# tag
# "859__f", # creator's email, to be used to determine the owner
# "906__p", # names, is it supervisor?
# "916__n",
# "916__s",
# "916__w",
# "963__a",
# "693__a", # accelerator, do we create a custom field?
# "693__f", # facility, do we create a custom field?
# "693__p", # project, do we create a custom field?
# "693__s", # study, do we create a custom field?
# "906__p", # names, is it supervisor?
# "970__a", # alternative identifier, scheme ALEPH
# "037__a", # (Report number) alternative identifiers -> scheme "CDS REFERENCE"
# "088__a", # RN (manual introduced?) second report number (so the identifiers schemas are not unique!)
# "035__9", # Inspire schema
# "035__a", # Inspire id value
"269__a", # imprint place # TODO
}
# Default metadata applied when the source record does not provide a value.
# Summer student reports are all loaded as technical notes; the transform's
# _resource_type reads this key, so it must always be present.
# NOTE(review): the scraped diff also carried the pre-change
# `_default_fields = None`; only the post-change dict assignment is kept,
# since two conflicting assignments would leave the attribute as None.
_default_fields = {
    "resource_type": {"id": "publication-technicalnote"}
}


model = CMSSummerStudent(
Expand Down
102 changes: 61 additions & 41 deletions cds_migrator_kit/rdm/migration/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,6 @@ def _recid(self, record_dump):
"""Returns the recid of the record."""
return str(record_dump.data["recid"])

def _pids(self, json_entry):
return {}

def _bucket_id(self, json_entry):
return

Expand Down Expand Up @@ -194,61 +191,85 @@ def get_person_old_db(email):
return user.id

def _metadata(self, json_entry):
    """Build the RDM ``metadata`` dictionary from a dumped legacy record.

    Creators and contributors are deep-copied and enriched (affiliations
    resolved against the vocabulary service, identifiers filtered to
    supported schemes). Keys with empty values are dropped from the result.

    NOTE(review): the scraped diff interleaved the pre- and post-change
    bodies (two ``affiliations`` loops, two ``creators`` defs, two
    ``_resource_type`` defs); only the post-change version is kept here.

    :param json_entry: transformed JSON of one legacy record.
    :returns: dict of non-empty RDM metadata fields.
    :raises UnexpectedValue: when an affiliation cannot be matched in the
        affiliations vocabulary.
    """

    def affiliations(creator):
        """Replace affiliation names with vocabulary ``{name, id}`` entries."""
        vocab_type = "affiliations"
        service = current_service_registry.get(vocab_type)
        extra_filter = dsl.Q("term", type__id=vocab_type)
        affiliations = creator.get("affiliations", [])
        transformed_aff = []
        for affiliation_name in affiliations:
            # match either the full title or the exact acronym
            title = dsl.Q("match", **{f"title": affiliation_name})
            acronym = dsl.Q(
                "match_phrase", **{f"acronym.keyword": affiliation_name}
            )
            title_filter = dsl.query.Bool("should", should=[title, acronym])

            vocabulary_result = service.search(
                system_identity, extra_filter=title_filter | extra_filter
            ).to_dict()
            if vocabulary_result["hits"]["total"]:
                transformed_aff.append(
                    {
                        "name": affiliation_name,
                        "id": vocabulary_result["hits"]["hits"][0]["id"],
                    }
                )
            else:
                raise UnexpectedValue(
                    subfield="u",
                    value=affiliation_name,
                    field="author",
                    message=f"Affiliation {affiliation_name} not found.",
                    stage="vocabulary match",
                )
        creator["affiliations"] = transformed_aff

    def creator_identifiers(creator):
        """Keep only identifiers of supported schemes (currently inspire)."""
        processed_identifiers = []
        inner_dict = creator.get("person_or_org", {})
        identifiers = inner_dict.get("identifiers", [])
        for identifier in identifiers:
            # TODO process CDS and CERN Ids when names vocabulary ready
            if identifier["scheme"] == "inspire":
                processed_identifiers.append(identifier)
        if processed_identifiers:
            inner_dict["identifiers"] = processed_identifiers
        else:
            # drop the key entirely rather than leave an empty list
            inner_dict.pop("identifiers", None)

    def creators(json, key="creators"):
        """Return transformed creators/contributors, skipping null entries."""
        _creators = deepcopy(json.get(key, []))
        _creators = list(filter(lambda x: x is not None, _creators))
        for creator in _creators:
            affiliations(creator)
            creator_identifiers(creator)
        return _creators

    def _resource_type(entry):
        # resource_type is guaranteed by the model's _default_fields
        return entry["resource_type"]

    metadata = {
        "creators": creators(json_entry),
        "title": json_entry["title"],
        "resource_type": _resource_type(json_entry),
        "description": json_entry.get("description"),
        "publication_date": json_entry.get("publication_date"),
        "contributors": creators(json_entry, key="contributors"),
        "subjects": json_entry.get("subjects"),
        "publisher": json_entry.get("publisher"),
        "additional_descriptions": json_entry.get("additional_descriptions"),
        "identifiers": json_entry.get("identifiers"),
        "languages": json_entry.get("languages"),
        # presumably renamed from "notes" in this commit — TODO confirm
        "_internal_notes": json_entry.get("internal_notes"),
        # "imprint": json_entry.get("imprint"),  # TODO
    }
    # filter empty keys
    return {k: v for k, v in metadata.items() if v}

def _custom_fields(self, json_entry):

Expand Down Expand Up @@ -377,7 +398,6 @@ def transform(self, entry):
json_output = {
"created": self._created(json_data),
"updated": self._updated(record_dump),
"pids": self._pids(json_data),
"files": self._files(record_dump),
"metadata": self._metadata(json_data),
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,6 @@ def get_contributor_role(subfield, role, raise_unexpected=False):


def get_contributor_affiliations(info):
aff_results = []
u = info.get("u", "")
if not u:
return
Expand All @@ -155,15 +154,12 @@ def extract_json_contributor_ids(info):
author_ids = force_list(info.get("0", ""))
for author_id in author_ids:
match = regex.match(author_id)
# if match:
# ids.append(
# {"identifier": match.group(3), "scheme": SOURCES[match.group(1)]}
# )
# pass
try:
ids.append({"identifier": info["inspireid"], "scheme": "inspire"})
except KeyError:
pass
if match:
identifier = match.group(3)
identifier = identifier.replace("INSPIRE-", "")
ids.append(
{"identifier": identifier, "scheme": SOURCES[match.group(1)]}
)

author_orcid = info.get("k")
if author_orcid:
Expand Down
Loading

0 comments on commit ebc1938

Please sign in to comment.