Skip to content

Commit

Permalink
rules: implement more fields
Browse files Browse the repository at this point in the history
  • Loading branch information
kpsherva committed Oct 24, 2024
1 parent c551e0e commit ebc1938
Show file tree
Hide file tree
Showing 10 changed files with 202 additions and 155 deletions.
7 changes: 6 additions & 1 deletion cds_migrator_kit/migration_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ def _(x): # needed to avoid start time failure with lazy strings
CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir
CDS_MIGRATOR_KIT_STREAM_CONFIG = "cds_migrator_kit/rdm/migration/streams.yaml"

from invenio_rdm_records.config import RDM_RECORDS_IDENTIFIERS_SCHEMES, always_valid
from invenio_rdm_records.config import RDM_RECORDS_IDENTIFIERS_SCHEMES, always_valid, RDM_RECORDS_PERSONORG_SCHEMES
from cds_rdm import schemes

RDM_RECORDS_IDENTIFIERS_SCHEMES = {**RDM_RECORDS_IDENTIFIERS_SCHEMES,
Expand All @@ -402,6 +402,11 @@ def _(x): # needed to avoid start time failure with lazy strings
"validator": schemes.is_inspire,
"datacite": "INSPIRE"}}}

# Extend InvenioRDM's person/organisation identifier schemes with the
# "inspire" author-id scheme, validated by cds_rdm's is_inspire check
# (mirrors the "inspire" entry added to RDM_RECORDS_IDENTIFIERS_SCHEMES above).
RDM_RECORDS_PERSONORG_SCHEMES = {**RDM_RECORDS_PERSONORG_SCHEMES,
**{"inspire": {"label": _("Inspire"),
"validator": schemes.is_inspire,
"datacite": "INSPIRE"}}}


CDS_MIGRATOR_KIT_RECORD_STATS_STREAM_CONFIG = dict(
####### Search ##############
Expand Down
12 changes: 5 additions & 7 deletions cds_migrator_kit/rdm/migration/load/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,21 +154,19 @@ def _load_versions(self, entry, logger):
"""Load other versions of the record."""
versions = entry["versions"]
legacy_recid = entry["record"]["recid"]
record = entry["record"]
parent = entry["parent"]

def publish_and_mint_recid(draft, version):
    """Publish ``draft`` and return the published record item.

    For the first version only, the original creation date of the legacy
    record is restored on the model and the legacy recid is minted on the
    parent so old CDS URLs redirect to the new record.

    NOTE(review): the scraped diff carried both the pre- and post-change
    lines (``record`` vs ``record_item``), which would have published the
    draft twice; only the post-change version is kept here.
    """
    record_item = current_rdm_records_service.publish(system_identity, draft["id"])
    # mint legacy ids for redirections
    if version == 1:
        # keep the creation date of the original legacy record
        record_item._record.model.created = arrow.get(
            entry["record"]["created"]
        ).datetime
        record_item._record.commit()
        # it seems more intuitive if we mint the lrecid for parent
        # but then we get a double redirection
        legacy_recid_minter(legacy_recid, record_item._record.parent.model.id)
    return record_item

identity = system_identity # TODO: load users instead ?

Expand Down
2 changes: 1 addition & 1 deletion cds_migrator_kit/rdm/migration/streams.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ records:
transform:
files_dump_dir: cds_migrator_kit/rdm/migration/data/summer_student_reports/files/
missing_users: cds_migrator_kit/rdm/migration/data/users
community_id: 94ae20a9-57f1-4a4e-bcac-4ef802b88e5d
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,17 @@ class CMSSummerStudent(CdsOverdo):

__ignore_keys__ = {
# decided to ignore
# "0247_2", # DOI, summer student notes do not have it
# "0247_a", # DOI
"0248_a", # oai identifier, not needed to migrate, TBD
"0248_p", # oai identifier, not needed to migrate, TBD
"0248_q", # does appear in data, what is this field recid 2778897
"0248_q", # full text tag 2778897
"100__m", # author's email <-- decided not to keep in RDM,
"260__c", # Redundant (more detailed value is in 269__c imprint.pub_date)
"270__m", # document contact email
"595__a", # always value CERN EDS, not displayed, TODO: do we keep?
"595__z", # SOME RECORD HAVE UNCL as value, do we keep it? what does UNCL mean
"700__m", # author's email <-- decided not to keep in RDM,
"710__5", # department / organisation author
"710__a", # organisation author
"700__m", # Contributors (email)
"700__m", # author's email <-- decided not to keep in RDM,
"8564_8", # Files system field
"8564_s", # Files system field
"8564_u", # Files
Expand All @@ -56,22 +53,18 @@ class CMSSummerStudent(CdsOverdo):
"937__s", # modification person
"960__a", # collection id? usually value 12, to confirm if we ignore
"980__a", # collection tag
# "980__c", # MIGRATED/DELETED - it shouldn't even make it here

# TO Implement (to remove from here)
"690C_a", # collection name, not needed values(to drop: INTNOTE, CERN, otherwise parse PUBL to retrieve the department, if department not present in any other field)
# "562__c", # note
# "700__0", # Contributors (cds author id) - TBD if we keep, same with INSPIRE ID
# "693__b", # beams recid: 2640381

# TO DECIDE
# IMPLEMENTED
# "001"
# "003"
# "035__9", # Inspire schema
# "035__a", # Inspire id value
# "037__a", # (Report number) alternative identifiers -> scheme "CDS REFERENCE"
# "041__a", # languages
# "088__a", # RN (manual introduced?) second report number (so the identifiers schemas are not unique!)
# "100__9", # #BEARD# tag
# "100__a",
# "100__u", # Author affiliation
# "100__9", # #BEARD# tag
# "246__a",
# "246__i", # abbreviation
# "246__i", # abbreviation tag, applies to value of 246__A
Expand All @@ -83,30 +76,32 @@ class CMSSummerStudent(CdsOverdo):
# "65017a", # subject value
# "6531_9", # keyword provenance
# "6531_a", # keyword
# "690C_a", # collection name, not needed values(to drop: INTNOTE, CERN, otherwise parse PUBL to retrieve the department, if department not present in any other field)
# "6931_9", # keyword
# "6931_a", # keyword
# "693__a", # accelerator, do we create a custom field?
# "693__b", # beams recid: 2640381
# "693__e", # custom_fields.cern:experiments
# "693__f", # facility, do we create a custom field?
# "693__p", # project, do we create a custom field?
# "693__s", # study, do we create a custom field?
# "700__0", # Contributors (cds author id) - TBD if we keep, same with INSPIRE ID
# "700__9", # #BEARD# tag
# "700__a", # Contributors (full name)
# "700__u", # Contributors (affiliation)
# "710__g", # Collaboration, OK to migrate as corporate contributor (not creator)?
# "700__9", # #BEARD# tag
# "859__f", # creator's email, to be used to determine the owner
# "906__p", # names, is it supervisor?
# "916__n",
# "916__s",
# "916__w",
# "963__a",
# "693__a", # accelerator, do we create a custom field?
# "693__f", # facility, do we create a custom field?
# "693__p", # project, do we create a custom field?
# "693__s", # study, do we create a custom field?
# "906__p", # names, is it supervisor?
# "970__a", # alternative identifier, scheme ALEPH
# "037__a", # (Report number) alternative identifiers -> scheme "CDS REFERENCE"
# "088__a", # RN (manual introduced?) second report number (so the identifiers schemas are not unique!)
# "035__9", # Inspire schema
# "035__a", # Inspire id value
"269__a", # imprint place # TODO
}
# Default metadata applied when the source record does not provide a value.
# Summer student reports are all loaded as technical notes; the transform's
# _resource_type reads this key, so it must always be present.
# NOTE(review): the scraped diff also carried the pre-change
# `_default_fields = None`; only the post-change dict assignment is kept,
# since two conflicting assignments would leave the attribute as None.
_default_fields = {
    "resource_type": {"id": "publication-technicalnote"}
}


model = CMSSummerStudent(
Expand Down
102 changes: 61 additions & 41 deletions cds_migrator_kit/rdm/migration/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,6 @@ def _recid(self, record_dump):
"""Returns the recid of the record."""
return str(record_dump.data["recid"])

def _pids(self, json_entry):
return {}

def _bucket_id(self, json_entry):
return

Expand Down Expand Up @@ -194,61 +191,85 @@ def get_person_old_db(email):
return user.id

def _metadata(self, json_entry):
    """Build the RDM ``metadata`` dictionary from a dumped legacy record.

    Creators and contributors are deep-copied and enriched (affiliations
    resolved against the vocabulary service, identifiers filtered to
    supported schemes). Keys with empty values are dropped from the result.

    NOTE(review): the scraped diff interleaved the pre- and post-change
    bodies (two ``affiliations`` loops, two ``creators`` defs, two
    ``_resource_type`` defs); only the post-change version is kept here.

    :param json_entry: transformed JSON of one legacy record.
    :returns: dict of non-empty RDM metadata fields.
    :raises UnexpectedValue: when an affiliation cannot be matched in the
        affiliations vocabulary.
    """

    def affiliations(creator):
        """Replace affiliation names with vocabulary ``{name, id}`` entries."""
        vocab_type = "affiliations"
        service = current_service_registry.get(vocab_type)
        extra_filter = dsl.Q("term", type__id=vocab_type)
        affiliations = creator.get("affiliations", [])
        transformed_aff = []
        for affiliation_name in affiliations:
            # match either the full title or the exact acronym
            title = dsl.Q("match", **{f"title": affiliation_name})
            acronym = dsl.Q(
                "match_phrase", **{f"acronym.keyword": affiliation_name}
            )
            title_filter = dsl.query.Bool("should", should=[title, acronym])

            vocabulary_result = service.search(
                system_identity, extra_filter=title_filter | extra_filter
            ).to_dict()
            if vocabulary_result["hits"]["total"]:
                transformed_aff.append(
                    {
                        "name": affiliation_name,
                        "id": vocabulary_result["hits"]["hits"][0]["id"],
                    }
                )
            else:
                raise UnexpectedValue(
                    subfield="u",
                    value=affiliation_name,
                    field="author",
                    message=f"Affiliation {affiliation_name} not found.",
                    stage="vocabulary match",
                )
        creator["affiliations"] = transformed_aff

    def creator_identifiers(creator):
        """Keep only identifiers of supported schemes (currently inspire)."""
        processed_identifiers = []
        inner_dict = creator.get("person_or_org", {})
        identifiers = inner_dict.get("identifiers", [])
        for identifier in identifiers:
            # TODO process CDS and CERN Ids when names vocabulary ready
            if identifier["scheme"] == "inspire":
                processed_identifiers.append(identifier)
        if processed_identifiers:
            inner_dict["identifiers"] = processed_identifiers
        else:
            # drop the key entirely rather than leave an empty list
            inner_dict.pop("identifiers", None)

    def creators(json, key="creators"):
        """Return transformed creators/contributors, skipping null entries."""
        _creators = deepcopy(json.get(key, []))
        _creators = list(filter(lambda x: x is not None, _creators))
        for creator in _creators:
            affiliations(creator)
            creator_identifiers(creator)
        return _creators

    def _resource_type(entry):
        # resource_type is guaranteed by the model's _default_fields
        return entry["resource_type"]

    metadata = {
        "creators": creators(json_entry),
        "title": json_entry["title"],
        "resource_type": _resource_type(json_entry),
        "description": json_entry.get("description"),
        "publication_date": json_entry.get("publication_date"),
        "contributors": creators(json_entry, key="contributors"),
        "subjects": json_entry.get("subjects"),
        "publisher": json_entry.get("publisher"),
        "additional_descriptions": json_entry.get("additional_descriptions"),
        "identifiers": json_entry.get("identifiers"),
        "languages": json_entry.get("languages"),
        # presumably renamed from "notes" in this commit — TODO confirm
        "_internal_notes": json_entry.get("internal_notes"),
        # "imprint": json_entry.get("imprint"),  # TODO
    }
    # filter empty keys
    return {k: v for k, v in metadata.items() if v}

def _custom_fields(self, json_entry):

Expand Down Expand Up @@ -377,7 +398,6 @@ def transform(self, entry):
json_output = {
"created": self._created(json_data),
"updated": self._updated(record_dump),
"pids": self._pids(json_data),
"files": self._files(record_dump),
"metadata": self._metadata(json_data),
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,6 @@ def get_contributor_role(subfield, role, raise_unexpected=False):


def get_contributor_affiliations(info):
aff_results = []
u = info.get("u", "")
if not u:
return
Expand All @@ -155,15 +154,12 @@ def extract_json_contributor_ids(info):
author_ids = force_list(info.get("0", ""))
for author_id in author_ids:
match = regex.match(author_id)
# if match:
# ids.append(
# {"identifier": match.group(3), "scheme": SOURCES[match.group(1)]}
# )
# pass
try:
ids.append({"identifier": info["inspireid"], "scheme": "inspire"})
except KeyError:
pass
if match:
identifier = match.group(3)
identifier = identifier.replace("INSPIRE-", "")
ids.append(
{"identifier": identifier, "scheme": SOURCES[match.group(1)]}
)

author_orcid = info.get("k")
if author_orcid:
Expand Down
Loading

0 comments on commit ebc1938

Please sign in to comment.