From 86a0601e1a05ecbbec2c48d238bde6d33f0f1770 Mon Sep 17 00:00:00 2001 From: Hans-Chrstian Date: Mon, 9 Sep 2024 15:38:45 +0200 Subject: [PATCH 1/3] Add creator support for DCAT datasets - Introduced creator fields (URI, name, email, URL, type, identifier) alongside publisher fields. - Updated the RDF graph serialization to include creator details. - Extended existing tests to validate the creator functionality. - Applied similar fallback logic for creator as used for publisher. --- ckanext/dcat/converters.py | 44 +++++++++- ckanext/dcat/profiles/base.py | 49 ++++------- ckanext/dcat/profiles/euro_dcat_ap_base.py | 10 ++- ckanext/dcat/profiles/schemaorg.py | 82 ++++++++++--------- ckanext/dcat/schemas/dcat_ap_recommended.yaml | 35 +++++++- .../dcat_ap/test_euro_dcatap_profile_parse.py | 6 ++ .../test_schemaorg_profile_serialize.py | 45 +++++++--- ckanext/dcat/utils.py | 6 ++ examples/dcat/dataset.rdf | 9 ++ 9 files changed, 196 insertions(+), 90 deletions(-) diff --git a/ckanext/dcat/converters.py b/ckanext/dcat/converters.py index 2e27a0ee..dca72865 100644 --- a/ckanext/dcat/converters.py +++ b/ckanext/dcat/converters.py @@ -39,6 +39,22 @@ def dcat_to_ckan(dcat_dict): 'value': dcat_publisher.get('identifier') # This could be a URI like https://ror.org/05wg1m734 }) + dcat_creator = dcat_dict.get('creator') + if isinstance(dcat_creator, basestring): + package_dict['extras'].append({'key': 'dcat_creator_name', 'value': dcat_creator}) + elif isinstance(dcat_creator, dict) and dcat_creator.get('name'): + if dcat_creator.get('name'): + package_dict['extras'].append({'key': 'dcat_creator_name', 'value': dcat_creator.get('name')}) + + if dcat_creator.get('mbox'): + package_dict['extras'].append({'key': 'dcat_creator_email', 'value': dcat_creator.get('mbox')}) + + if dcat_creator.get('identifier'): + package_dict['extras'].append({ + 'key': 'dcat_creator_id', + 'value': dcat_creator.get('identifier') + }) + package_dict['extras'].append({ 'key': 'language', 'value': 
','.join(dcat_dict.get('language', [])) @@ -64,20 +80,20 @@ def dcat_to_ckan(dcat_dict): def ckan_to_dcat(package_dict): - dcat_dict = {} dcat_dict['title'] = package_dict.get('title') dcat_dict['description'] = package_dict.get('notes') dcat_dict['landingPage'] = package_dict.get('url') - + # Keywords dcat_dict['keyword'] = [] for tag in package_dict.get('tags', []): dcat_dict['keyword'].append(tag['name']) - + # Publisher dcat_dict['publisher'] = {} + dcat_dict['creator'] = {} for extra in package_dict.get('extras', []): if extra['key'] in ['dcat_issued', 'dcat_modified']: @@ -86,20 +102,42 @@ def ckan_to_dcat(package_dict): elif extra['key'] == 'language': dcat_dict['language'] = extra['value'].split(',') + # Publisher fields elif extra['key'] == 'dcat_publisher_name': dcat_dict['publisher']['name'] = extra['value'] elif extra['key'] == 'dcat_publisher_email': dcat_dict['publisher']['mbox'] = extra['value'] + elif extra['key'] == 'dcat_publisher_id': + dcat_dict['publisher']['identifier'] = extra['value'] + + # Creator fields + elif extra['key'] == 'dcat_creator_name': + dcat_dict['creator']['name'] = extra['value'] + + elif extra['key'] == 'dcat_creator_email': + dcat_dict['creator']['mbox'] = extra['value'] + + elif extra['key'] == 'dcat_creator_id': + dcat_dict['creator']['identifier'] = extra['value'] + + # Identifier elif extra['key'] == 'guid': dcat_dict['identifier'] = extra['value'] + # Fallback for publisher (if no name in extras, use maintainer) if not dcat_dict['publisher'].get('name') and package_dict.get('maintainer'): dcat_dict['publisher']['name'] = package_dict.get('maintainer') if package_dict.get('maintainer_email'): dcat_dict['publisher']['mbox'] = package_dict.get('maintainer_email') + # Fallback for creator (if no name in extras, optionally use author) + if not dcat_dict['creator'].get('name') and package_dict.get('author'): + dcat_dict['creator']['name'] = package_dict.get('author') + if package_dict.get('author_email'): + 
dcat_dict['creator']['mbox'] = package_dict.get('author_email') + dcat_dict['distribution'] = [] for resource in package_dict.get('resources', []): distribution = { diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index fd5af492..9169abbf 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -419,58 +419,37 @@ def _insert_or_update_temporal(self, dataset_dict, key, value): else: dataset_dict["extras"].append({"key": key, "value": value}) - def _publisher(self, subject, predicate): + def _agent_details(self, subject, predicate): """ - Returns a dict with details about a dct:publisher entity, a foaf:Agent + Returns a dict with details about a dct:publisher or dct:creator entity, a foaf:Agent Both subject and predicate must be rdflib URIRef or BNode objects Examples: - + or Publishing Organization for dataset 1 contact@some.org http://some.org - - - { - 'uri': 'http://orgs.vocab.org/some-org', - 'name': 'Publishing Organization for dataset 1', - 'email': 'contact@some.org', - 'url': 'http://some.org', - 'type': 'http://purl.org/adms/publishertype/NonProfitOrganisation', - } - - - - { - 'uri': 'http://publications.europa.eu/resource/authority/corporate-body/EURCOU' - } - Returns keys for uri, name, email, url and type with the values set to - an empty string if they could not be found + Returns keys for uri, name, email, url, type, and identifier with the values set to + an empty string if they could not be found. 
""" - publisher = {} + agent_details = {} for agent in self.g.objects(subject, predicate): + agent_details["uri"] = str(agent) if isinstance(agent, term.URIRef) else "" + agent_details["name"] = self._object_value(agent, FOAF.name) + agent_details["email"] = self._object_value(agent, FOAF.mbox) + agent_details["url"] = self._object_value(agent, FOAF.homepage) + agent_details["type"] = self._object_value(agent, DCT.type) + agent_details['identifier'] = self._object_value(agent, DCT.identifier) - publisher["uri"] = str(agent) if isinstance(agent, term.URIRef) else "" - - publisher["name"] = self._object_value(agent, FOAF.name) - - publisher["email"] = self._object_value(agent, FOAF.mbox) - - publisher["url"] = self._object_value(agent, FOAF.homepage) - - publisher["type"] = self._object_value(agent, DCT.type) - - publisher['identifier'] = self._object_value(agent, DCT.identifier) - - return publisher + return agent_details def _contact_details(self, subject, predicate): """ @@ -1136,7 +1115,7 @@ def _extract_catalog_dict(self, catalog_ref): out.append( { "key": "source_catalog_publisher", - "value": json.dumps(self._publisher(catalog_ref, DCT.publisher)), + "value": json.dumps(self._agent_details(catalog_ref, DCT.publisher)), } ) return out diff --git a/ckanext/dcat/profiles/euro_dcat_ap_base.py b/ckanext/dcat/profiles/euro_dcat_ap_base.py index 110e04dd..9d7fa9ce 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_base.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_base.py @@ -122,13 +122,21 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): ) # Publisher - publisher = self._publisher(dataset_ref, DCT.publisher) + publisher = self._agent_details(dataset_ref, DCT.publisher) for key in ("uri", "name", "email", "url", "type", "identifier"): if publisher.get(key): dataset_dict["extras"].append( {"key": "publisher_{0}".format(key), "value": publisher.get(key)} ) + # Creator + creator = self._agent_details(dataset_ref, DCT.creator) + for key in ("uri", "name", "email", 
"url", "type", "identifier"): + if creator.get(key): + dataset_dict["extras"].append( + {"key": "creator_{0}".format(key), "value": creator.get(key)} + ) + # Temporal start, end = self._time_interval(dataset_ref, DCT.temporal) if start: diff --git a/ckanext/dcat/profiles/schemaorg.py b/ckanext/dcat/profiles/schemaorg.py index 88e30be6..3ad53a7b 100644 --- a/ckanext/dcat/profiles/schemaorg.py +++ b/ckanext/dcat/profiles/schemaorg.py @@ -50,7 +50,10 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): self._list_fields_graph(dataset_ref, dataset_dict) # Publisher - self._publisher_graph(dataset_ref, dataset_dict) + self._agent_graph(dataset_ref, dataset_dict, SCHEMA.publisher, "publisher") + + # Creator + self._agent_graph(dataset_ref, dataset_dict, SCHEMA.creator, "creator") # Temporal self._temporal_graph(dataset_ref, dataset_dict) @@ -156,74 +159,73 @@ def _list_fields_graph(self, dataset_ref, dataset_dict): ] self._add_list_triples_from_dict(dataset_dict, dataset_ref, items) - def _publisher_graph(self, dataset_ref, dataset_dict): + def _agent_graph(self, dataset_ref, dataset_dict, agent_type, schema_property_prefix): + uri_key = f"{schema_property_prefix}_uri" + name_key = f"{schema_property_prefix}_name" + url_key = f"{schema_property_prefix}_url" + email_key = f"{schema_property_prefix}_email" + identifier_key = f"{schema_property_prefix}_identifier" + if any( - [ - self._get_dataset_value(dataset_dict, "publisher_uri"), - self._get_dataset_value(dataset_dict, "publisher_name"), - dataset_dict.get("organization"), - ] + [ + self._get_dataset_value(dataset_dict, uri_key), + self._get_dataset_value(dataset_dict, name_key), + dataset_dict.get("organization"), + ] ): - - publisher_uri = self._get_dataset_value(dataset_dict, "publisher_uri") - publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict) - publisher_name = self._get_dataset_value(dataset_dict, "publisher_name") - if publisher_uri: - publisher_details = 
CleanedURIRef(publisher_uri) - elif not publisher_name and publisher_uri_fallback: - # neither URI nor name are available, use organization as fallback - publisher_details = CleanedURIRef(publisher_uri_fallback) + agent_uri = self._get_dataset_value(dataset_dict, uri_key) + agent_uri_fallback = publisher_uri_organization_fallback(dataset_dict) + agent_name = self._get_dataset_value(dataset_dict, name_key) + + if agent_uri: + agent_details = CleanedURIRef(agent_uri) + elif not agent_name and agent_uri_fallback: + agent_details = CleanedURIRef(agent_uri_fallback) else: - # No publisher_uri - publisher_details = BNode() + agent_details = BNode() - self.g.add((publisher_details, RDF.type, SCHEMA.Organization)) - self.g.add((dataset_ref, SCHEMA.publisher, publisher_details)) + self.g.add((agent_details, RDF.type, SCHEMA.Organization)) + self.g.add((dataset_ref, agent_type, agent_details)) - # In case no name and URI are available, again fall back to organization. - # If no name but an URI is available, the name literal remains empty to - # avoid mixing organization and dataset values. 
if ( - not publisher_name - and not publisher_uri - and dataset_dict.get("organization") + not agent_name + and not agent_uri + and dataset_dict.get("organization") ): - publisher_name = dataset_dict["organization"]["title"] - self.g.add((publisher_details, SCHEMA.name, Literal(publisher_name))) + agent_name = dataset_dict["organization"]["title"] + self.g.add((agent_details, SCHEMA.name, Literal(agent_name))) contact_point = BNode() self.g.add((contact_point, RDF.type, SCHEMA.ContactPoint)) - self.g.add((publisher_details, SCHEMA.contactPoint, contact_point)) - + self.g.add((agent_details, SCHEMA.contactPoint, contact_point)) self.g.add((contact_point, SCHEMA.contactType, Literal("customer service"))) - publisher_url = self._get_dataset_value(dataset_dict, "publisher_url") - if not publisher_url and dataset_dict.get("organization"): - publisher_url = dataset_dict["organization"].get("url") or config.get( + agent_url = self._get_dataset_value(dataset_dict, url_key) + if not agent_url and dataset_dict.get("organization"): + agent_url = dataset_dict["organization"].get("url") or config.get( "ckan.site_url" ) + self.g.add((contact_point, SCHEMA.url, Literal(agent_url))) - self.g.add((contact_point, SCHEMA.url, Literal(publisher_url))) items = [ ( - "publisher_email", + email_key, SCHEMA.email, ["contact_email", "maintainer_email", "author_email"], Literal, ), ( - "publisher_name", + name_key, SCHEMA.name, ["contact_name", "maintainer", "author"], Literal, ), ] - self._add_triples_from_dict(dataset_dict, contact_point, items) - publisher_identifier = self._get_dataset_value(dataset_dict, "publisher_identifier") - if publisher_identifier: - self.g.add((publisher_details, SCHEMA.identifier, Literal(publisher_identifier))) + agent_identifier = self._get_dataset_value(dataset_dict, identifier_key) + if agent_identifier: + self.g.add((agent_details, SCHEMA.identifier, Literal(agent_identifier))) def _temporal_graph(self, dataset_ref, dataset_dict): start = 
self._get_dataset_value(dataset_dict, "temporal_start") diff --git a/ckanext/dcat/schemas/dcat_ap_recommended.yaml b/ckanext/dcat/schemas/dcat_ap_recommended.yaml index 883f337d..0b3a9254 100644 --- a/ckanext/dcat/schemas/dcat_ap_recommended.yaml +++ b/ckanext/dcat/schemas/dcat_ap_recommended.yaml @@ -70,9 +70,42 @@ dataset_fields: - field_name: identifier label: Identifier help_text: Unique identifier for the publisher, such as a ROR ID. - help_text: Entity responsible for making the dataset available. +- field_name: creator + label: Creator + repeating_label: Creator + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + help_text: URI of the creator, if available. + + - field_name: name + label: Name + help_text: Name of the entity or person who created the dataset. + + - field_name: email + label: Email + display_snippet: email.html + help_text: Contact email of the creator. + + - field_name: url + label: URL + display_snippet: link.html + help_text: URL for more information about the creator. + + - field_name: type + label: Type + help_text: Type of creator (e.g., Organization, Person). + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the creator, such as an ORCID or ROR ID. + + help_text: Entity responsible for creating the dataset. 
+ - field_name: license_id label: License form_snippet: license.html diff --git a/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_parse.py index d2b84ae4..3300f8c3 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap/test_euro_dcatap_profile_parse.py @@ -114,6 +114,12 @@ def _get_extra_value_as_list(key): assert _get_extra_value('publisher_url') == 'http://some.org' assert _get_extra_value('publisher_type') == 'http://purl.org/adms/publishertype/NonProfitOrganisation' assert _get_extra_value('publisher_identifier') == 'https://ror.org/05wg1m734' + assert _get_extra_value('creator_uri') == 'http://example.org/creator-org' + assert _get_extra_value('creator_name') == 'Creating Organization for dataset 1' + assert _get_extra_value('creator_email') == 'creator@example.org' + assert _get_extra_value('creator_url') == 'http://example.org' + assert _get_extra_value('creator_type') == 'http://purl.org/adms/publishertype/NonProfitOrganisation' + assert _get_extra_value('creator_identifier') == 'https://ror.org/05wg1m735' assert _get_extra_value('contact_name') == 'Point of Contact' # mailto gets removed for storage and is added again on output assert _get_extra_value('contact_email') == 'contact@some.org' diff --git a/ckanext/dcat/tests/profiles/schemaorg/test_schemaorg_profile_serialize.py b/ckanext/dcat/tests/profiles/schemaorg/test_schemaorg_profile_serialize.py index eb9a5eb4..dec37644 100644 --- a/ckanext/dcat/tests/profiles/schemaorg/test_schemaorg_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/schemaorg/test_schemaorg_profile_serialize.py @@ -90,7 +90,7 @@ def test_graph_from_dataset(self): for value in values: assert self._triple(g, dataset_ref, item[1], item[2](value)) - def test_publisher_extras(self): + def test_publisher_and_creator_extras(self): dataset = { 'id': 
'4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', 'name': 'test-dataset', @@ -100,16 +100,24 @@ def test_publisher_extras(self): 'title': 'Example Publisher from Org', }, 'extras': [ + # Publisher fields {'key': 'publisher_uri', 'value': 'http://example.com/publisher'}, {'key': 'publisher_name', 'value': 'Example Publisher'}, {'key': 'publisher_email', 'value': 'publisher@example.com'}, {'key': 'publisher_url', 'value': 'http://example.com/publisher/home'}, {'key': 'publisher_type', 'value': 'http://purl.org/adms/publishertype/Company'}, {'key': 'publisher_identifier', 'value': 'https://ror.org/05wg1m734'}, - ] - + # Creator fields + {'key': 'creator_uri', 'value': 'http://example.com/creator'}, + {'key': 'creator_name', 'value': 'Example Creator'}, + {'key': 'creator_email', 'value': 'creator@example.com'}, + {'key': 'creator_url', 'value': 'http://example.com/creator/home'}, + {'key': 'creator_type', 'value': 'http://purl.org/adms/publishertype/NonProfitOrganisation'}, + {'key': 'creator_identifier', 'value': 'https://ror.org/05wg1m735'}, + ] } + extras = self._extras(dataset) s = RDFSerializer(profiles=['schemaorg']) @@ -117,6 +125,7 @@ def test_publisher_extras(self): dataset_ref = s.graph_from_dataset(dataset) + # Publisher validation publisher = self._triple(g, dataset_ref, SCHEMA.publisher, None)[2] assert publisher assert str(publisher) == extras['publisher_uri'] @@ -124,13 +133,29 @@ def test_publisher_extras(self): assert self._triple(g, publisher, SCHEMA.name, extras['publisher_name']) assert self._triple(g, publisher, SCHEMA.identifier, extras['publisher_identifier']) - contact_point = self._triple(g, publisher, SCHEMA.contactPoint, None)[2] - assert contact_point - assert self._triple(g, contact_point, RDF.type, SCHEMA.ContactPoint) - assert self._triple(g, contact_point, SCHEMA.name, extras['publisher_name']) - assert self._triple(g, contact_point, SCHEMA.email, extras['publisher_email']) - assert self._triple(g, contact_point, SCHEMA.url, 
extras['publisher_url']) - assert self._triple(g, contact_point, SCHEMA.contactType, 'customer service') + contact_point_publisher = self._triple(g, publisher, SCHEMA.contactPoint, None)[2] + assert contact_point_publisher + assert self._triple(g, contact_point_publisher, RDF.type, SCHEMA.ContactPoint) + assert self._triple(g, contact_point_publisher, SCHEMA.name, extras['publisher_name']) + assert self._triple(g, contact_point_publisher, SCHEMA.email, extras['publisher_email']) + assert self._triple(g, contact_point_publisher, SCHEMA.url, extras['publisher_url']) + assert self._triple(g, contact_point_publisher, SCHEMA.contactType, 'customer service') + + # Creator validation + creator = self._triple(g, dataset_ref, SCHEMA.creator, None)[2] + assert creator + assert str(creator) == extras['creator_uri'] + assert self._triple(g, creator, RDF.type, SCHEMA.Organization) + assert self._triple(g, creator, SCHEMA.name, extras['creator_name']) + assert self._triple(g, creator, SCHEMA.identifier, extras['creator_identifier']) + + contact_point_creator = self._triple(g, creator, SCHEMA.contactPoint, None)[2] + assert contact_point_creator + assert self._triple(g, contact_point_creator, RDF.type, SCHEMA.ContactPoint) + assert self._triple(g, contact_point_creator, SCHEMA.name, extras['creator_name']) + assert self._triple(g, contact_point_creator, SCHEMA.email, extras['creator_email']) + assert self._triple(g, contact_point_creator, SCHEMA.url, extras['creator_url']) + assert self._triple(g, contact_point_creator, SCHEMA.contactType, 'customer service') def test_publisher_no_uri(self): dataset = { diff --git a/ckanext/dcat/utils.py b/ckanext/dcat/utils.py index de17e9ad..d5fd1749 100644 --- a/ckanext/dcat/utils.py +++ b/ckanext/dcat/utils.py @@ -79,6 +79,12 @@ def field_labels(): 'publisher_url': _('Publisher URL'), 'publisher_type': _('Publisher type'), 'publisher_identifier': _('Publisher identifier'), + 'creator_uri': _('Creator URI'), + 'creator_name': _('Creator 
name'), + 'creator_email': _('Creator email'), + 'creator_url': _('Creator URL'), + 'creator_type': _('Creator type'), + 'creator_identifier': _('Creator identifier'), 'contact_name': _('Contact name'), 'contact_email': _('Contact email'), 'contact_uri': _('Contact URI'), diff --git a/examples/dcat/dataset.rdf b/examples/dcat/dataset.rdf index 9e117752..8cd9619f 100644 --- a/examples/dcat/dataset.rdf +++ b/examples/dcat/dataset.rdf @@ -78,6 +78,15 @@ + + + Creating Organization for dataset 1 + creator@example.org + http://example.org + + + + Some website From 8bfce4b687c0984f6f308a48f32eb26fd5dd0116 Mon Sep 17 00:00:00 2001 From: Hans-Chrstian Date: Thu, 12 Sep 2024 13:48:10 +0200 Subject: [PATCH 2/3] Add missing mappings --- ckanext/dcat/profiles/euro_dcat_ap_base.py | 152 +++++++++++------- .../dcat/profiles/euro_dcat_ap_scheming.py | 110 +++++++------ ckanext/dcat/schemas/dcat_ap_full.yaml | 32 ++++ .../dcat_ap_2/test_scheming_support.py | 41 +++++ docs/mapping.md | 6 + 5 files changed, 234 insertions(+), 107 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_base.py b/ckanext/dcat/profiles/euro_dcat_ap_base.py index 6774c01c..e82e4d33 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_base.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_base.py @@ -34,7 +34,6 @@ config = toolkit.config - DISTRIBUTION_LICENSE_FALLBACK_CONFIG = "ckanext.dcat.resource.inherit.license" @@ -51,10 +50,10 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): # Basic fields for key, predicate in ( - ("title", DCT.title), - ("notes", DCT.description), - ("url", DCAT.landingPage), - ("version", OWL.versionInfo), + ("title", DCT.title), + ("notes", DCT.description), + ("url", DCAT.landingPage), + ("version", OWL.versionInfo), ): value = self._object_value(dataset_ref, predicate) if value: @@ -79,13 +78,13 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): # Simple values for key, predicate in ( - ("issued", DCT.issued), - ("modified", DCT.modified), - ("identifier", 
DCT.identifier), - ("version_notes", ADMS.versionNotes), - ("frequency", DCT.accrualPeriodicity), - ("provenance", DCT.provenance), - ("dcat_type", DCT.type), + ("issued", DCT.issued), + ("modified", DCT.modified), + ("identifier", DCT.identifier), + ("version_notes", ADMS.versionNotes), + ("frequency", DCT.accrualPeriodicity), + ("provenance", DCT.provenance), + ("dcat_type", DCT.type), ): value = self._object_value(dataset_ref, predicate) if value: @@ -93,16 +92,16 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): # Lists for key, predicate, in ( - ("language", DCT.language), - ("theme", DCAT.theme), - ("alternate_identifier", ADMS.identifier), - ("conforms_to", DCT.conformsTo), - ("documentation", FOAF.page), - ("related_resource", DCT.relation), - ("has_version", DCT.hasVersion), - ("is_version_of", DCT.isVersionOf), - ("source", DCT.source), - ("sample", ADMS.sample), + ("language", DCT.language), + ("theme", DCAT.theme), + ("alternate_identifier", ADMS.identifier), + ("conforms_to", DCT.conformsTo), + ("documentation", FOAF.page), + ("related_resource", DCT.relation), + ("has_version", DCT.hasVersion), + ("is_version_of", DCT.isVersionOf), + ("source", DCT.source), + ("sample", ADMS.sample), ): values = self._object_value_list(dataset_ref, predicate) if values: @@ -178,14 +177,14 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): # Simple values for key, predicate in ( - ("name", DCT.title), - ("description", DCT.description), - ("access_url", DCAT.accessURL), - ("download_url", DCAT.downloadURL), - ("issued", DCT.issued), - ("modified", DCT.modified), - ("status", ADMS.status), - ("license", DCT.license), + ("name", DCT.title), + ("description", DCT.description), + ("access_url", DCAT.accessURL), + ("download_url", DCAT.downloadURL), + ("issued", DCT.issued), + ("modified", DCT.modified), + ("status", ADMS.status), + ("license", DCT.license), ): value = self._object_value(distribution, predicate) if value: @@ -196,9 +195,9 @@ def 
_parse_dataset_base(self, dataset_dict, dataset_ref): ) or self._object_value(distribution, DCAT.accessURL) # Lists for key, predicate in ( - ("language", DCT.language), - ("documentation", FOAF.page), - ("conforms_to", DCT.conformsTo), + ("language", DCT.language), + ("documentation", FOAF.page), + ("conforms_to", DCT.conformsTo), ): values = self._object_value_list(distribution, predicate) if values: @@ -253,12 +252,11 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): # versions of the ckanext-dcat parsers for extra in dataset_dict["extras"]: if extra["key"] in ( - "issued", - "modified", - "publisher_name", - "publisher_email", + "issued", + "modified", + "publisher_name", + "publisher_email", ): - extra["key"] = "dcat_" + extra["key"] if extra["key"] == "language": @@ -317,15 +315,15 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref): # Contact details if any( - [ - self._get_dataset_value(dataset_dict, "contact_uri"), - self._get_dataset_value(dataset_dict, "contact_name"), - self._get_dataset_value(dataset_dict, "contact_email"), - self._get_dataset_value(dataset_dict, "maintainer"), - self._get_dataset_value(dataset_dict, "maintainer_email"), - self._get_dataset_value(dataset_dict, "author"), - self._get_dataset_value(dataset_dict, "author_email"), - ] + [ + self._get_dataset_value(dataset_dict, "contact_uri"), + self._get_dataset_value(dataset_dict, "contact_name"), + self._get_dataset_value(dataset_dict, "contact_email"), + self._get_dataset_value(dataset_dict, "maintainer"), + self._get_dataset_value(dataset_dict, "maintainer_email"), + self._get_dataset_value(dataset_dict, "author"), + self._get_dataset_value(dataset_dict, "author_email"), + ] ): contact_uri = self._get_dataset_value(dataset_dict, "contact_uri") @@ -362,10 +360,10 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref): # Scheming publisher field: will be handled in a separate profile pass elif any( - [ - self._get_dataset_value(dataset_dict,
"publisher_uri"), - self._get_dataset_value(dataset_dict, "publisher_name"), - ] + [ + self._get_dataset_value(dataset_dict, "publisher_uri"), + self._get_dataset_value(dataset_dict, "publisher_name"), + ] ): # Legacy publisher_* extras publisher_uri = self._get_dataset_value(dataset_dict, "publisher_uri") @@ -420,6 +418,48 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref): ] self._add_triples_from_dict(publisher_details, publisher_ref, items) + # Creator + creator_ref = None + + if dataset_dict.get("creator"): + # Scheming creator field: will be handled in a separate profile + pass + elif any( + [ + self._get_dataset_value(dataset_dict, "creator_uri"), + self._get_dataset_value(dataset_dict, "creator_name"), + ] + ): + # Legacy creator_* extras + creator_uri = self._get_dataset_value(dataset_dict, "creator_uri") + creator_name = self._get_dataset_value(dataset_dict, "creator_name") + if creator_uri: + creator_ref = CleanedURIRef(creator_uri) + else: + # No creator_uri + creator_ref = BNode() + + creator_details = { + "name": creator_name, + "email": self._get_dataset_value(dataset_dict, "creator_email"), + "url": self._get_dataset_value(dataset_dict, "creator_url"), + "type": self._get_dataset_value(dataset_dict, "creator_type"), + "identifier": self._get_dataset_value(dataset_dict, "creator_identifier"), + } + + # Add to graph + if creator_ref: + g.add((creator_ref, RDF.type, FOAF.Agent)) + g.add((dataset_ref, DCT.creator, creator_ref)) # Use DCT.creator for creator + items = [ + ("name", FOAF.name, None, Literal), + ("email", FOAF.mbox, None, Literal), + ("url", FOAF.homepage, None, URIRef), + ("type", DCT.type, None, URIRefOrLiteral), + ("identifier", DCT.identifier, None, URIRefOrLiteral), + ] + self._add_triples_from_dict(creator_details, creator_ref, items) + # Temporal start = self._get_dataset_value(dataset_dict, "temporal_start") end = self._get_dataset_value(dataset_dict, "temporal_end") @@ -452,11 +492,11 @@ def
_graph_from_dataset_base(self, dataset_dict, dataset_ref): resource_license_fallback = None if toolkit.asbool(config.get(DISTRIBUTION_LICENSE_FALLBACK_CONFIG, False)): if "license_id" in dataset_dict and isinstance( - URIRefOrLiteral(dataset_dict["license_id"]), URIRef + URIRefOrLiteral(dataset_dict["license_id"]), URIRef ): resource_license_fallback = dataset_dict["license_id"] elif "license_url" in dataset_dict and isinstance( - URIRefOrLiteral(dataset_dict["license_url"]), URIRef + URIRefOrLiteral(dataset_dict["license_url"]), URIRef ): resource_license_fallback = dataset_dict["license_url"] @@ -519,9 +559,9 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref): # check which type is appropriate. if fmt and (not mimetype or mimetype == fmt): if ( - "iana.org/assignments/media-types" in fmt - or not fmt.startswith("http") - and "/" in fmt + "iana.org/assignments/media-types" in fmt + or not fmt.startswith("http") + and "/" in fmt ): # output format value as dcat:mediaType instead of dct:format mimetype = fmt diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index 8d0ffb79..62dcf61d 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -123,12 +123,8 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): """ Add triples to the graph from new repeating subfields """ - - def _not_empty_dict(data_dict): - return any(data_dict.values()) - contact = dataset_dict.get("contact") - if isinstance(contact, list) and len(contact) and _not_empty_dict(contact[0]): + if isinstance(contact, list) and len(contact) and self._not_empty_dict(contact[0]): for item in contact: contact_uri = item.get("uri") if contact_uri: @@ -150,56 +146,14 @@ def _not_empty_dict(data_dict): value_modifier=self._add_mailto, ) - publisher = dataset_dict.get("publisher") - if ( - isinstance(publisher, list) - and len(publisher) - and 
_not_empty_dict(publisher[0]) - ): - publisher = publisher[0] - publisher_uri = publisher.get("uri") - if publisher_uri: - publisher_ref = CleanedURIRef(publisher_uri) - else: - publisher_ref = BNode() - - self.g.add((publisher_ref, RDF.type, FOAF.Agent)) - self.g.add((dataset_ref, DCT.publisher, publisher_ref)) - - self._add_triple_from_dict(publisher, publisher_ref, FOAF.name, "name") - self._add_triple_from_dict( - publisher, publisher_ref, FOAF.homepage, "url", _type=URIRef - ) - self._add_triple_from_dict( - publisher, - publisher_ref, - DCT.type, - "type", - _type=URIRefOrLiteral, - # TODO: fix prefLabel stuff - # _class=SKOS.Concept, - ) - self._add_triple_from_dict( - publisher, - publisher_ref, - VCARD.hasEmail, - "email", - _type=URIRef, - value_modifier=self._add_mailto, - ) - self._add_triple_from_dict( - publisher, - publisher_ref, - DCT.identifier, - "identifier", - _type=URIRefOrLiteral - ) + self._add_agent(dataset_ref, dataset_dict, "publisher", DCT.publisher) + self._add_agent(dataset_ref, dataset_dict, "creator", DCT.creator) temporal = dataset_dict.get("temporal_coverage") if ( isinstance(temporal, list) and len(temporal) - and _not_empty_dict(temporal[0]) + and self._not_empty_dict(temporal[0]) ): for item in temporal: temporal_ref = BNode() @@ -211,7 +165,7 @@ def _not_empty_dict(data_dict): self.g.add((dataset_ref, DCT.temporal, temporal_ref)) spatial = dataset_dict.get("spatial_coverage") - if isinstance(spatial, list) and len(spatial) and _not_empty_dict(spatial[0]): + if isinstance(spatial, list) and len(spatial) and self._not_empty_dict(spatial[0]): for item in spatial: if item.get("uri"): spatial_ref = CleanedURIRef(item["uri"]) @@ -243,3 +197,57 @@ def _not_empty_dict(data_dict): ) except ValueError: pass + + def _add_agent(self, dataset_ref, dataset_dict, agent_key, rdf_predicate): + """ + Adds an agent (publisher or creator) to the RDF graph. 
+ + :param dataset_ref: The RDF reference of the dataset + :param dataset_dict: The dataset dictionary containing agent information + :param agent_key: 'publisher' or 'creator' to specify the agent + :param rdf_predicate: The RDF predicate (DCT.publisher or DCT.creator) + """ + agent = dataset_dict.get(agent_key) + if ( + isinstance(agent, list) + and len(agent) + and self._not_empty_dict(agent[0]) + ): + agent = agent[0] + agent_uri = agent.get("uri") + if agent_uri: + agent_ref = CleanedURIRef(agent_uri) + else: + agent_ref = BNode() + + self.g.add((agent_ref, RDF.type, FOAF.Agent)) + self.g.add((dataset_ref, rdf_predicate, agent_ref)) + + self._add_triple_from_dict(agent, agent_ref, FOAF.name, "name") + self._add_triple_from_dict(agent, agent_ref, FOAF.homepage, "url", _type=URIRef) + self._add_triple_from_dict( + agent, + agent_ref, + DCT.type, + "type", + _type=URIRefOrLiteral, + ) + self._add_triple_from_dict( + agent, + agent_ref, + VCARD.hasEmail, + "email", + _type=URIRef, + value_modifier=self._add_mailto, + ) + self._add_triple_from_dict( + agent, + agent_ref, + DCT.identifier, + "identifier", + _type=URIRefOrLiteral + ) + + @staticmethod + def _not_empty_dict(data_dict): + return any(data_dict.values()) diff --git a/ckanext/dcat/schemas/dcat_ap_full.yaml b/ckanext/dcat/schemas/dcat_ap_full.yaml index 36508d9a..8a831852 100644 --- a/ckanext/dcat/schemas/dcat_ap_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_full.yaml @@ -72,6 +72,38 @@ dataset_fields: help_text: Unique identifier for the publisher, such as a ROR ID. help_text: Entity responsible for making the dataset available. +- field_name: creator + label: Creator + repeating_label: Creator + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + help_text: URI of the creator, if available. + + - field_name: name + label: Name + help_text: Name of the entity or person who created the dataset. 
+ + - field_name: email + label: Email + display_snippet: email.html + help_text: Contact email of the creator. + + - field_name: url + label: URL + display_snippet: link.html + help_text: URL for more information about the creator. + + - field_name: type + label: Type + help_text: Type of creator (e.g., Organization, Person). + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the creator, such as an ORCID or ROR ID. + - field_name: license_id label: License form_snippet: license.html diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py index b249d600..8095202f 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py @@ -97,6 +97,15 @@ def test_e2e_ckan_to_dcat(self): "identifier": "http://example.org/publisher-id", }, ], + "creator": [ + { + "name": "Test Creator", + "email": "creator@example.org", + "url": "https://example.org/creator", + "type": "person", + "identifier": "http://example.org/creator-id", + } + ], "temporal_coverage": [ {"start": "1905-03-01", "end": "2013-01-05"}, {"start": "2024-04-10", "end": "2024-05-29"}, @@ -309,6 +318,38 @@ def test_e2e_ckan_to_dcat(self): URIRef(dataset_dict["publisher"][0]["identifier"]) ) + creator = [t for t in g.triples((dataset_ref, DCT.creator, None))] + + assert len(creator) == 1 + assert self._triple( + g, creator[0][2], FOAF.name, dataset_dict["creator"][0]["name"] + ) + assert self._triple( + g, + creator[0][2], + VCARD.hasEmail, + URIRef("mailto:" + dataset_dict["creator"][0]["email"]), + ) + assert self._triple( + g, + creator[0][2], + FOAF.homepage, + URIRef(dataset_dict["creator"][0]["url"]), + ) + assert self._triple( + g, + creator[0][2], + DCT.type, + dataset_dict["creator"][0]["type"], + ) + assert self._triple( + g, + creator[0][2], + DCT.identifier, + 
URIRef(dataset_dict["creator"][0]["identifier"]) + ) + + temporal = [t for t in g.triples((dataset_ref, DCT.temporal, None))] assert len(temporal) == len(dataset["temporal_coverage"]) diff --git a/docs/mapping.md b/docs/mapping.md index 89057ab2..ce4048c4 100644 --- a/docs/mapping.md +++ b/docs/mapping.md @@ -54,6 +54,12 @@ some cases the way metadata is stored internally and presented at the CKAN API l | foaf:Agent | foaf:homepage | custom:publisher_url | | text | | | foaf:Agent | dct:type | custom:publisher_type | | text | | | foaf:Agent | dct:identifier | custom:publisher_id | | text | +| dcat:Dataset | dct:creator | custom:creator_uri | | text | See [URIs](mapping.md#uris) and [Publisher](#contact-points-and-publisher) | +| foaf:Agent | foaf:name | custom:creator_name | | text | | +| foaf:Agent | foaf:mbox | custom:creator_email | organization:title | text | | +| foaf:Agent | foaf:homepage | custom:creator_url | | text | | +| foaf:Agent | dct:type | custom:creator_type | | text | | +| foaf:Agent | dct:identifier | custom:creator_id | | text | | dcat:Dataset | dcat:contactPoint | custom:contact_uri | | text | See [URIs](mapping.md#uris) and [Contact points](#contact-points-and-publisher) | | vcard:Kind | vcard:fn | custom:contact_name | maintainer, author | text | | | vcard:Kind | vcard:hasEmail | custom:contact_email | maintainer_email, author_email | text | | From d8461e2877fa94b28ccefcc49c6c977e9b1b1fbb Mon Sep 17 00:00:00 2001 From: Hans-Chrstian Date: Thu, 12 Sep 2024 21:17:39 +0200 Subject: [PATCH 3/3] feat: Add support for DCAT creator field in dataset metadata - Added fields to store creator details (name, email, URL, and identifier) in the DCAT profile. - Implemented functionality to serialize and deserialize creator information similar to the publisher. - Updated RDF generation logic to include creator fields in the output graph. - Enhanced unit tests to verify proper handling and serialization of creator metadata. 
--- ckanext/dcat/converters.py | 16 +-- ckanext/dcat/profiles/euro_dcat_ap_base.py | 116 +++++++++--------- ckanext/dcat/profiles/schemaorg.py | 16 +-- ckanext/dcat/schemas/dcat_ap_recommended.yaml | 34 ----- .../tests/profiles/base/test_base_profile.py | 4 +- examples/ckan/ckan_dataset.json | 2 + examples/ckan/dataset.json | 18 ++- examples/ckan/full_ckan_dataset.json | 4 +- 8 files changed, 92 insertions(+), 118 deletions(-) diff --git a/ckanext/dcat/converters.py b/ckanext/dcat/converters.py index 899b72dc..8129f7ad 100644 --- a/ckanext/dcat/converters.py +++ b/ckanext/dcat/converters.py @@ -29,8 +29,8 @@ def dcat_to_ckan(dcat_dict): elif isinstance(dcat_publisher, dict) and dcat_publisher.get('name'): package_dict['extras'].append({'key': 'dcat_publisher_name', 'value': dcat_publisher.get('name')}) - if dcat_publisher.get('mbox'): - package_dict['extras'].append({'key': 'dcat_publisher_email', 'value': dcat_publisher.get('mbox')}) + if dcat_publisher.get('email'): + package_dict['extras'].append({'key': 'dcat_publisher_email', 'value': dcat_publisher.get('email')}) if dcat_publisher.get('identifier'): package_dict['extras'].append({ @@ -45,8 +45,8 @@ def dcat_to_ckan(dcat_dict): if dcat_creator.get('name'): package_dict['extras'].append({'key': 'dcat_creator_name', 'value': dcat_creator.get('name')}) - if dcat_creator.get('mbox'): - package_dict['extras'].append({'key': 'dcat_creator_email', 'value': dcat_creator.get('mbox')}) + if dcat_creator.get('email'): + package_dict['extras'].append({'key': 'dcat_creator_email', 'value': dcat_creator.get('email')}) if dcat_creator.get('identifier'): package_dict['extras'].append({ @@ -106,7 +106,7 @@ def ckan_to_dcat(package_dict): dcat_dict['publisher']['name'] = extra['value'] elif extra['key'] == 'dcat_publisher_email': - dcat_dict['publisher']['mbox'] = extra['value'] + dcat_dict['publisher']['email'] = extra['value'] elif extra['key'] == 'dcat_publisher_id': dcat_dict['publisher']['identifier'] = extra['value'] @@ 
-116,7 +116,7 @@ def ckan_to_dcat(package_dict): dcat_dict['creator']['name'] = extra['value'] elif extra['key'] == 'dcat_creator_email': - dcat_dict['creator']['mbox'] = extra['value'] + dcat_dict['creator']['email'] = extra['value'] elif extra['key'] == 'dcat_creator_id': dcat_dict['creator']['identifier'] = extra['value'] @@ -129,13 +129,13 @@ def ckan_to_dcat(package_dict): if not dcat_dict['publisher'].get('name') and package_dict.get('maintainer'): dcat_dict['publisher']['name'] = package_dict.get('maintainer') if package_dict.get('maintainer_email'): - dcat_dict['publisher']['mbox'] = package_dict.get('maintainer_email') + dcat_dict['publisher']['email'] = package_dict.get('maintainer_email') # Fallback for creator (if no name in extras, optionally use author) if not dcat_dict['creator'].get('name') and package_dict.get('author'): dcat_dict['creator']['name'] = package_dict.get('author') if package_dict.get('author_email'): - dcat_dict['creator']['mbox'] = package_dict.get('author_email') + dcat_dict['creator']['email'] = package_dict.get('author_email') dcat_dict['distribution'] = [] for resource in package_dict.get('resources', []): diff --git a/ckanext/dcat/profiles/euro_dcat_ap_base.py b/ckanext/dcat/profiles/euro_dcat_ap_base.py index e82e4d33..f1db48b6 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_base.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_base.py @@ -50,10 +50,10 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): # Basic fields for key, predicate in ( - ("title", DCT.title), - ("notes", DCT.description), - ("url", DCAT.landingPage), - ("version", OWL.versionInfo), + ("title", DCT.title), + ("notes", DCT.description), + ("url", DCAT.landingPage), + ("version", OWL.versionInfo), ): value = self._object_value(dataset_ref, predicate) if value: @@ -78,13 +78,13 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): # Simple values for key, predicate in ( - ("issued", DCT.issued), - ("modified", DCT.modified), - ("identifier", 
DCT.identifier), - ("version_notes", ADMS.versionNotes), - ("frequency", DCT.accrualPeriodicity), - ("provenance", DCT.provenance), - ("dcat_type", DCT.type), + ("issued", DCT.issued), + ("modified", DCT.modified), + ("identifier", DCT.identifier), + ("version_notes", ADMS.versionNotes), + ("frequency", DCT.accrualPeriodicity), + ("provenance", DCT.provenance), + ("dcat_type", DCT.type), ): value = self._object_value(dataset_ref, predicate) if value: @@ -92,16 +92,16 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): # Lists for key, predicate, in ( - ("language", DCT.language), - ("theme", DCAT.theme), - ("alternate_identifier", ADMS.identifier), - ("conforms_to", DCT.conformsTo), - ("documentation", FOAF.page), - ("related_resource", DCT.relation), - ("has_version", DCT.hasVersion), - ("is_version_of", DCT.isVersionOf), - ("source", DCT.source), - ("sample", ADMS.sample), + ("language", DCT.language), + ("theme", DCAT.theme), + ("alternate_identifier", ADMS.identifier), + ("conforms_to", DCT.conformsTo), + ("documentation", FOAF.page), + ("related_resource", DCT.relation), + ("has_version", DCT.hasVersion), + ("is_version_of", DCT.isVersionOf), + ("source", DCT.source), + ("sample", ADMS.sample), ): values = self._object_value_list(dataset_ref, predicate) if values: @@ -177,14 +177,14 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): # Simple values for key, predicate in ( - ("name", DCT.title), - ("description", DCT.description), - ("access_url", DCAT.accessURL), - ("download_url", DCAT.downloadURL), - ("issued", DCT.issued), - ("modified", DCT.modified), - ("status", ADMS.status), - ("license", DCT.license), + ("name", DCT.title), + ("description", DCT.description), + ("access_url", DCAT.accessURL), + ("download_url", DCAT.downloadURL), + ("issued", DCT.issued), + ("modified", DCT.modified), + ("status", ADMS.status), + ("license", DCT.license), ): value = self._object_value(distribution, predicate) if value: @@ -195,9 +195,9 @@ def 
_parse_dataset_base(self, dataset_dict, dataset_ref): ) or self._object_value(distribution, DCAT.accessURL) # Lists for key, predicate in ( - ("language", DCT.language), - ("documentation", FOAF.page), - ("conforms_to", DCT.conformsTo), + ("language", DCT.language), + ("documentation", FOAF.page), + ("conforms_to", DCT.conformsTo), ): values = self._object_value_list(distribution, predicate) if values: @@ -252,10 +252,10 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): # versions of the ckanext-dcat parsers for extra in dataset_dict["extras"]: if extra["key"] in ( - "issued", - "modified", - "publisher_name", - "publisher_email", + "issued", + "modified", + "publisher_name", + "publisher_email", ): extra["key"] = "dcat_" + extra["key"] @@ -315,15 +315,15 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref): # Contact details if any( - [ - self._get_dataset_value(dataset_dict, "contact_uri"), - self._get_dataset_value(dataset_dict, "contact_name"), - self._get_dataset_value(dataset_dict, "contact_email"), - self._get_dataset_value(dataset_dict, "maintainer"), - self._get_dataset_value(dataset_dict, "maintainer_email"), - self._get_dataset_value(dataset_dict, "author"), - self._get_dataset_value(dataset_dict, "author_email"), - ] + [ + self._get_dataset_value(dataset_dict, "contact_uri"), + self._get_dataset_value(dataset_dict, "contact_name"), + self._get_dataset_value(dataset_dict, "contact_email"), + self._get_dataset_value(dataset_dict, "maintainer"), + self._get_dataset_value(dataset_dict, "maintainer_email"), + self._get_dataset_value(dataset_dict, "author"), + self._get_dataset_value(dataset_dict, "author_email"), + ] ): contact_uri = self._get_dataset_value(dataset_dict, "contact_uri") @@ -360,10 +360,10 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref): # Scheming publisher field: will be handled in a separate profile pass elif any( - [ - self._get_dataset_value(dataset_dict, "publisher_uri"), - 
self._get_dataset_value(dataset_dict, "publisher_name"), - ] + [ + self._get_dataset_value(dataset_dict, "publisher_uri"), + self._get_dataset_value(dataset_dict, "publisher_name"), + ] ): # Legacy publisher_* extras publisher_uri = self._get_dataset_value(dataset_dict, "publisher_uri") @@ -425,10 +425,10 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref): # Scheming publisher field: will be handled in a separate profile pass elif any( - [ - self._get_dataset_value(dataset_dict, "creator_uri"), - self._get_dataset_value(dataset_dict, "creator_name"), - ] + [ + self._get_dataset_value(dataset_dict, "creator_uri"), + self._get_dataset_value(dataset_dict, "creator_name"), + ] ): # Legacy creator_* extras creator_uri = self._get_dataset_value(dataset_dict, "creator_uri") @@ -492,11 +492,11 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref): resource_license_fallback = None if toolkit.asbool(config.get(DISTRIBUTION_LICENSE_FALLBACK_CONFIG, False)): if "license_id" in dataset_dict and isinstance( - URIRefOrLiteral(dataset_dict["license_id"]), URIRef + URIRefOrLiteral(dataset_dict["license_id"]), URIRef ): resource_license_fallback = dataset_dict["license_id"] elif "license_url" in dataset_dict and isinstance( - URIRefOrLiteral(dataset_dict["license_url"]), URIRef + URIRefOrLiteral(dataset_dict["license_url"]), URIRef ): resource_license_fallback = dataset_dict["license_url"] @@ -559,9 +559,9 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref): # check which type is appropriate. 
if fmt and (not mimetype or mimetype == fmt): if ( - "iana.org/assignments/media-types" in fmt - or not fmt.startswith("http") - and "/" in fmt + "iana.org/assignments/media-types" in fmt + or not fmt.startswith("http") + and "/" in fmt ): # output format value as dcat:mediaType instead of dct:format mimetype = fmt diff --git a/ckanext/dcat/profiles/schemaorg.py b/ckanext/dcat/profiles/schemaorg.py index 3ad53a7b..b397f44d 100644 --- a/ckanext/dcat/profiles/schemaorg.py +++ b/ckanext/dcat/profiles/schemaorg.py @@ -167,11 +167,11 @@ def _agent_graph(self, dataset_ref, dataset_dict, agent_type, schema_property_pr identifier_key = f"{schema_property_prefix}_identifier" if any( - [ - self._get_dataset_value(dataset_dict, uri_key), - self._get_dataset_value(dataset_dict, name_key), - dataset_dict.get("organization"), - ] + [ + self._get_dataset_value(dataset_dict, uri_key), + self._get_dataset_value(dataset_dict, name_key), + dataset_dict.get("organization"), + ] ): agent_uri = self._get_dataset_value(dataset_dict, uri_key) agent_uri_fallback = publisher_uri_organization_fallback(dataset_dict) @@ -188,9 +188,9 @@ def _agent_graph(self, dataset_ref, dataset_dict, agent_type, schema_property_pr self.g.add((dataset_ref, agent_type, agent_details)) if ( - not agent_name - and not agent_uri - and dataset_dict.get("organization") + not agent_name + and not agent_uri + and dataset_dict.get("organization") ): agent_name = dataset_dict["organization"]["title"] self.g.add((agent_details, SCHEMA.name, Literal(agent_name))) diff --git a/ckanext/dcat/schemas/dcat_ap_recommended.yaml b/ckanext/dcat/schemas/dcat_ap_recommended.yaml index 0b3a9254..daa7bce4 100644 --- a/ckanext/dcat/schemas/dcat_ap_recommended.yaml +++ b/ckanext/dcat/schemas/dcat_ap_recommended.yaml @@ -72,40 +72,6 @@ dataset_fields: help_text: Unique identifier for the publisher, such as a ROR ID. help_text: Entity responsible for making the dataset available. 
-- field_name: creator - label: Creator - repeating_label: Creator - repeating_once: true - repeating_subfields: - - - field_name: uri - label: URI - help_text: URI of the creator, if available. - - - field_name: name - label: Name - help_text: Name of the entity or person who created the dataset. - - - field_name: email - label: Email - display_snippet: email.html - help_text: Contact email of the creator. - - - field_name: url - label: URL - display_snippet: link.html - help_text: URL for more information about the creator. - - - field_name: type - label: Type - help_text: Type of creator (e.g., Organization, Person). - - - field_name: identifier - label: Identifier - help_text: Unique identifier for the creator, such as an ORCID or ROR ID. - - help_text: Entity responsible for creating the dataset. - - field_name: license_id label: License form_snippet: license.html diff --git a/ckanext/dcat/tests/profiles/base/test_base_profile.py b/ckanext/dcat/tests/profiles/base/test_base_profile.py index 221c772c..52bf8bc1 100644 --- a/ckanext/dcat/tests/profiles/base/test_base_profile.py +++ b/ckanext/dcat/tests/profiles/base/test_base_profile.py @@ -660,7 +660,7 @@ def test_publisher_foaf(self): p = RDFProfile(g) - publisher = p._publisher(URIRef('http://example.org'), DCT.publisher) + publisher = p._agent_details(URIRef('http://example.org'), DCT.publisher) assert publisher['uri'] == 'http://orgs.vocab.org/some-org' assert publisher['name'] == 'Publishing Organization for dataset 1' @@ -688,7 +688,7 @@ def test_publisher_ref(self): p = RDFProfile(g) - publisher = p._publisher(URIRef('http://example.org'), DCT.publisher) + publisher = p._agent_details(URIRef('http://example.org'), DCT.publisher) assert publisher['uri'] == 'http://orgs.vocab.org/some-org' diff --git a/examples/ckan/ckan_dataset.json b/examples/ckan/ckan_dataset.json index a05f989a..ccdcb2d9 100644 --- a/examples/ckan/ckan_dataset.json +++ b/examples/ckan/ckan_dataset.json @@ -8,6 +8,8 @@ {"key": "guid", 
"value": "9df8df51-63db-37a8-e044-0003ba9b0d98"}, {"key": "dcat_publisher_name", "value": "Geological Society"}, {"key": "dcat_publisher_email", "value": "info@gs.org"}, + {"key": "dcat_creator_name", "value": "John Doe"}, + {"key": "dcat_creator_email", "value": "johndoe@example.com"}, {"key": "language", "value": "en,es,ca"} ], "resources": [{"id": "b1e0b666-b7f4-44c1-9b16-56c78e86b66a", diff --git a/examples/ckan/dataset.json b/examples/ckan/dataset.json index c1d887d1..ebcc1055 100644 --- a/examples/ckan/dataset.json +++ b/examples/ckan/dataset.json @@ -9,11 +9,17 @@ "keyword" : ["exploration", "geochemical-exploration", "geochemical-maps", "geochemistry", "geology", "nercddc", "regional-geology"], "publisher": { "name": "Geological Society", - "mbox": "info@gs.org" + "email": "info@gs.org" }, - "distribution": [{"accessURL": "http://www.bgs.ac.uk/gbase/geochemcd/home.html", - "byteSize": null, - "description": "Resource locator", - "format": "text/html", - "title": ""}] + "creator": { + "name": "John Doe", + "email": "johndoe@example.com" + }, + "distribution": [{ + "accessURL": "http://www.bgs.ac.uk/gbase/geochemcd/home.html", + "byteSize": null, + "description": "Resource locator", + "format": "text/html", + "title": "" + }] } diff --git a/examples/ckan/full_ckan_dataset.json b/examples/ckan/full_ckan_dataset.json index 24a17bcc..e62927c7 100644 --- a/examples/ckan/full_ckan_dataset.json +++ b/examples/ckan/full_ckan_dataset.json @@ -1,6 +1,6 @@ { - "author": null, - "author_email": null, + "author": "John Doe", + "author_email": "johndoe@example.com", "extras": [ { "__extras": {