From 1bce8344793ff92bb172d17f4ab4cc7383e18d11 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 29 May 2024 11:10:06 +0200 Subject: [PATCH] Fix merge errors --- ckanext/dcat/profiles/__init__.py | 1 + ckanext/dcat/profiles/base.py | 93 ++++++++- ckanext/dcat/profiles/euro_dcat_ap.py | 83 ++++---- ckanext/dcat/profiles/euro_dcat_ap_2.py | 188 +++++++++--------- .../dcat/profiles/euro_dcat_ap_scheming.py | 166 ++++++++++++++++ ckanext/dcat/tests/test_scheming_support.py | 3 - 6 files changed, 388 insertions(+), 146 deletions(-) create mode 100644 ckanext/dcat/profiles/euro_dcat_ap_scheming.py diff --git a/ckanext/dcat/profiles/__init__.py b/ckanext/dcat/profiles/__init__.py index 92266c72..a80a48c6 100644 --- a/ckanext/dcat/profiles/__init__.py +++ b/ckanext/dcat/profiles/__init__.py @@ -20,4 +20,5 @@ from .euro_dcat_ap import EuropeanDCATAPProfile from .euro_dcat_ap_2 import EuropeanDCATAP2Profile +from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile from .schemaorg import SchemaOrgProfile diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 4b652b91..c91a1e8e 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -7,7 +7,7 @@ from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS from geomet import wkt, InvalidGeoJSONException -from ckantoolkit import config, url_for, asbool, get_action +from ckantoolkit import config, url_for, asbool, get_action, ObjectNotFound from ckan.model.license import LicenseRegister from ckan.lib.helpers import resource_formats from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS @@ -41,7 +41,7 @@ "spdx": SPDX, } -PREFIX_MAILTO = u"mailto:" +PREFIX_MAILTO = "mailto:" GEOJSON_IMT = "https://www.iana.org/assignments/media-types/application/vnd.geo+json" @@ -105,11 +105,20 @@ class RDFProfile(object): custom profiles """ - def __init__(self, graph, compatibility_mode=False): - """Class constructor + _dataset_schema = None - Graph is an rdflib.Graph instance. + # Cache for mappings of licenses URL/title to ID built when needed in + # _license(). + _licenceregister_cache = None + # Cache for organization_show details (used for publisher fallback) + _org_cache: dict = {} + + def __init__(self, graph, dataset_type="dataset", compatibility_mode=False): + """Class constructor + Graph is an rdflib.Graph instance. + A scheming dataset type can be provided, in which case the scheming schema + will be loaded so it can be used by profiles. In compatibility mode, some fields are modified to maintain compatibility with previous versions of the ckanext-dcat parsers (eg adding the `dcat_` prefix or storing comma separated lists instead @@ -120,9 +129,17 @@ def __init__(self, graph, compatibility_mode=False): self.compatibility_mode = compatibility_mode - # Cache for mappings of licenses URL/title to ID built when needed in - # _license(). - self._licenceregister_cache = None + try: + schema_show = get_action("scheming_dataset_schema_show") + try: + schema = schema_show({}, {"type": dataset_type}) + except ObjectNotFound: + raise ObjectNotFound(f"Unknown dataset schema: {dataset_type}") + + self._dataset_schema = schema + + except KeyError: + pass def _datasets(self): """ @@ -707,6 +724,64 @@ def _add_spatial_to_dict(self, dataset_dict, key, spatial): } ) + def _schema_field(self, key): + """ + Returns the schema field information if the provided key exists as a field in + the dataset schema (if one was provided) + """ + if not self._dataset_schema: + return None + + for field in self._dataset_schema["dataset_fields"]: + if field["field_name"] == key: + return field + + def _schema_resource_field(self, key): + """ + Returns the schema field information if the provided key exists as a field in + the resources fields of the dataset schema (if one was provided) + """ + if not self._dataset_schema: + return None + + for field in self._dataset_schema["resource_fields"]: + if field["field_name"] == key: + return field + + def _set_dataset_value(self, dataset_dict, key, value): + """ + Sets the value for a given key in a CKAN dataset dict + If a dataset schema was provided, the schema will be checked to see if + a custom field is present for the key. If so the key will be stored at + the dict root level, otherwise it will be stored as an extra. + Standard CKAN fields (defined in ROOT_DATASET_FIELDS) are always stored + at the root level. + """ + if self._schema_field(key) or key in ROOT_DATASET_FIELDS: + dataset_dict[key] = value + else: + if not dataset_dict.get("extras"): + dataset_dict["extras"] = [] + dataset_dict["extras"].append({"key": key, "value": value}) + + return dataset_dict + + def _set_list_dataset_value(self, dataset_dict, key, value): + schema_field = self._schema_field(key) + if schema_field and "scheming_multiple_text" in schema_field["validators"]: + return self._set_dataset_value(dataset_dict, key, value) + else: + return self._set_dataset_value(dataset_dict, key, json.dumps(value)) + + def _set_list_resource_value(self, resource_dict, key, value): + schema_field = self._schema_resource_field(key) + if schema_field and "scheming_multiple_text" in schema_field["validators"]: + resource_dict[key] = value + else: + resource_dict[key] = json.dumps(value) + + return resource_dict + def _get_dataset_value(self, dataset_dict, key, default=None): """ Returns the value for the given key on a CKAN dict @@ -880,7 +955,7 @@ def _without_mailto(self, mail_addr): Ensures that the mail address string has no mailto: prefix. """ if mail_addr: - return str(mail_addr).replace(PREFIX_MAILTO, u"") + return str(mail_addr).replace(PREFIX_MAILTO, "") else: return mail_addr diff --git a/ckanext/dcat/profiles/euro_dcat_ap.py b/ckanext/dcat/profiles/euro_dcat_ap.py index 9a4c853b..b7e4cae4 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_dcat_ap.py @@ -20,11 +20,9 @@ DCAT, DCT, ADMS, - XSD, VCARD, FOAF, SCHEMA, - SKOS, LOCN, GSP, OWL, @@ -354,51 +352,66 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ) # Publisher - if any( + publisher_ref = None + + if dataset_dict.get("publisher"): + # Scheming publisher field: will be handled in a separate profile + pass + elif any( [ self._get_dataset_value(dataset_dict, "publisher_uri"), self._get_dataset_value(dataset_dict, "publisher_name"), - dataset_dict.get("organization"), ] ): - + # Legacy publisher_* extras publisher_uri = self._get_dataset_value(dataset_dict, "publisher_uri") - publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict) publisher_name = self._get_dataset_value(dataset_dict, "publisher_name") if publisher_uri: - publisher_details = CleanedURIRef(publisher_uri) - elif not publisher_name and publisher_uri_fallback: - # neither URI nor name are available, use organization as fallback - publisher_details = CleanedURIRef(publisher_uri_fallback) + publisher_ref = CleanedURIRef(publisher_uri) else: # No publisher_uri - publisher_details = BNode() - - g.add((publisher_details, RDF.type, FOAF.Organization)) - g.add((dataset_ref, DCT.publisher, publisher_details)) - - # In case no name and URI are available, again fall back to organization. - # If no name but an URI is available, the name literal remains empty to - # avoid mixing organization and dataset values. - if ( - not publisher_name - and not publisher_uri - and dataset_dict.get("organization") - ): - publisher_name = dataset_dict["organization"]["title"] - - g.add((publisher_details, FOAF.name, Literal(publisher_name))) - # TODO: It would make sense to fallback these to organization - # fields but they are not in the default schema and the - # `organization` object in the dataset_dict does not include - # custom fields + publisher_ref = BNode() + publisher_details = { + "name": publisher_name, + "email": self._get_dataset_value(dataset_dict, "publisher_email"), + "url": self._get_dataset_value(dataset_dict, "publisher_url"), + "type": self._get_dataset_value(dataset_dict, "publisher_type"), + } + elif dataset_dict.get("organization"): + # Fall back to dataset org + org_id = dataset_dict["organization"]["id"] + org_dict = None + if org_id in self._org_cache: + org_dict = self._org_cache[org_id] + else: + try: + org_dict = toolkit.get_action("organization_show")( + {"ignore_auth": True}, {"id": org_id} + ) + self._org_cache[org_id] = org_dict + except toolkit.ObjectNotFound: + pass + if org_dict: + publisher_ref = CleanedURIRef( + publisher_uri_organization_fallback(dataset_dict) + ) + publisher_details = { + "name": org_dict.get("title"), + "email": org_dict.get("email"), + "url": org_dict.get("url"), + "type": org_dict.get("dcat_type"), + } + # Add to graph + if publisher_ref: + g.add((publisher_ref, RDF.type, FOAF.Organization)) + g.add((dataset_ref, DCT.publisher, publisher_ref)) items = [ - ("publisher_email", FOAF.mbox, None, Literal), - ("publisher_url", FOAF.homepage, None, URIRef), - ("publisher_type", DCT.type, None, URIRefOrLiteral), + ("name", FOAF.name, None, Literal), + ("email", FOAF.mbox, None, Literal), + ("url", FOAF.homepage, None, URIRef), + ("type", DCT.type, None, URIRefOrLiteral), ] - - self._add_triples_from_dict(dataset_dict, publisher_details, items) + self._add_triples_from_dict(publisher_details, publisher_ref, items) # Temporal start = self._get_dataset_value(dataset_dict, "temporal_start") diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index 6f40e3ab..3db77d61 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -91,56 +91,52 @@ def parse_dataset(self, dataset_dict, dataset_ref): if values: resource_dict[key] = json.dumps(values) - # Access services - access_service_list = [] + # Access services + access_service_list = [] - for access_service in self.g.objects( - distribution, DCAT.accessService + for access_service in self.g.objects( + distribution, DCAT.accessService + ): + access_service_dict = {} + + # Simple values + for key, predicate in ( + ("availability", DCATAP.availability), + ("title", DCT.title), + ("endpoint_description", DCAT.endpointDescription), + ("license", DCT.license), + ("access_rights", DCT.accessRights), + ("description", DCT.description), + ): + value = self._object_value(access_service, predicate) + if value: + access_service_dict[key] = value + # List + for key, predicate in ( + ("endpoint_url", DCAT.endpointURL), + ("serves_dataset", DCAT.servesDataset), ): - access_service_dict = {} - - # Simple values - for key, predicate in ( - ("availability", DCATAP.availability), - ("title", DCT.title), - ("endpoint_description", DCAT.endpointDescription), - ("license", DCT.license), - ("access_rights", DCT.accessRights), - ("description", DCT.description), - ): - value = self._object_value(access_service, predicate) - if value: - access_service_dict[key] = value - # List - for key, predicate in ( - ("endpoint_url", DCAT.endpointURL), - ("serves_dataset", DCAT.servesDataset), - ): - values = self._object_value_list( - access_service, predicate - ) - if values: - access_service_dict[key] = values - - # Access service URI (explicitly show the missing ones) - access_service_dict["uri"] = ( - str(access_service) - if isinstance(access_service, URIRef) - else "" - ) - - # Remember the (internal) access service reference for referencing in - # further profiles, e.g. for adding more properties - access_service_dict["access_service_ref"] = str( - access_service - ) - - access_service_list.append(access_service_dict) - - if access_service_list: - resource_dict["access_services"] = json.dumps( - access_service_list - ) + values = self._object_value_list(access_service, predicate) + if values: + access_service_dict[key] = values + + # Access service URI (explicitly show the missing ones) + access_service_dict["uri"] = ( + str(access_service) + if isinstance(access_service, URIRef) + else "" + ) + + # Remember the (internal) access service reference for referencing in + # further profiles, e.g. for adding more properties + access_service_dict["access_service_ref"] = str(access_service) + + access_service_list.append(access_service_dict) + + if access_service_list: + resource_dict["access_services"] = json.dumps( + access_service_list + ) return dataset_dict @@ -253,60 +249,54 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ] self._add_list_triples_from_dict(resource_dict, distribution, items) - try: - access_service_list = json.loads( - resource_dict.get("access_services", "[]") + # Access services + access_service_list = resource_dict.get("access_services", []) + if isinstance(access_service_list, str): + try: + access_service_list = json.loads(access_service_list) + except ValueError: + access_service_list = [] + + for access_service_dict in access_service_list: + + access_service_uri = access_service_dict.get("uri") + if access_service_uri: + access_service_node = CleanedURIRef(access_service_uri) + else: + access_service_node = BNode() + # Remember the (internal) access service reference for referencing in + # further profiles + access_service_dict["access_service_ref"] = str(access_service_node) + + self.g.add((distribution, DCAT.accessService, access_service_node)) + + self.g.add((access_service_node, RDF.type, DCAT.DataService)) + + # Simple values + items = [ + ("availability", DCATAP.availability, None, URIRefOrLiteral), + ("license", DCT.license, None, URIRefOrLiteral), + ("access_rights", DCT.accessRights, None, URIRefOrLiteral), + ("title", DCT.title, None, Literal), + ("endpoint_description", DCAT.endpointDescription, None, Literal), + ("description", DCT.description, None, Literal), + ] + + self._add_triples_from_dict( + access_service_dict, access_service_node, items ) - # Access service - for access_service_dict in access_service_list: - - access_service_uri = access_service_dict.get("uri") - if access_service_uri: - access_service_node = CleanedURIRef(access_service_uri) - else: - access_service_node = BNode() - # Remember the (internal) access service reference for referencing in - # further profiles - access_service_dict["access_service_ref"] = str( - access_service_node - ) - - self.g.add((distribution, DCAT.accessService, access_service_node)) - - self.g.add((access_service_node, RDF.type, DCAT.DataService)) - - # Simple values - items = [ - ("availability", DCATAP.availability, None, URIRefOrLiteral), - ("license", DCT.license, None, URIRefOrLiteral), - ("access_rights", DCT.accessRights, None, URIRefOrLiteral), - ("title", DCT.title, None, Literal), - ( - "endpoint_description", - DCAT.endpointDescription, - None, - Literal, - ), - ("description", DCT.description, None, Literal), - ] - - self._add_triples_from_dict( - access_service_dict, access_service_node, items - ) - # Lists - items = [ - ("endpoint_url", DCAT.endpointURL, None, URIRefOrLiteral), - ("serves_dataset", DCAT.servesDataset, None, URIRefOrLiteral), - ] - self._add_list_triples_from_dict( - access_service_dict, access_service_node, items - ) + # Lists + items = [ + ("endpoint_url", DCAT.endpointURL, None, URIRefOrLiteral), + ("serves_dataset", DCAT.servesDataset, None, URIRefOrLiteral), + ] + self._add_list_triples_from_dict( + access_service_dict, access_service_node, items + ) - if access_service_list: - resource_dict["access_services"] = json.dumps(access_service_list) - except ValueError: - pass + if access_service_list: + resource_dict["access_services"] = json.dumps(access_service_list) def graph_from_catalog(self, catalog_dict, catalog_ref): diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py new file mode 100644 index 00000000..6bd570a9 --- /dev/null +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -0,0 +1,166 @@ +import json + +from rdflib import URIRef, BNode +from .base import RDFProfile, CleanedURIRef, URIRefOrLiteral +from .base import ( + RDF, + XSD, + DCAT, + DCT, + VCARD, + FOAF, +) + + +class EuropeanDCATAPSchemingProfile(RDFProfile): + """ + This is a compatibilty profile meant to add support for ckanext-scheming to the existing + `euro_dcat_ap` and `euro_dcat_ap_2` profiles. + It does not add or remove any properties from these profiles, it just transforms the + resulting dataset_dict so it is compatible with a ckanext-scheming schema + TODO: summarize changes and link to docs + """ + + def parse_dataset(self, dataset_dict, dataset_ref): + + if not self._dataset_schema: + # Not using scheming + return dataset_dict + + # Move extras to root + + extras_to_remove = [] + extras = dataset_dict.get("extras", []) + for extra in extras: + if self._schema_field(extra["key"]): + # This is a field defined in the dataset schema + dataset_dict[extra["key"]] = extra["value"] + extras_to_remove.append(extra["key"]) + + dataset_dict["extras"] = [e for e in extras if e["key"] not in extras_to_remove] + + # Parse lists + def _parse_list_value(data_dict, field_name): + schema_field = self._schema_field( + field_name + ) or self._schema_resource_field(field_name) + + if schema_field and "scheming_multiple_text" in schema_field.get( + "validators", [] + ): + if isinstance(data_dict[field_name], str): + try: + data_dict[field_name] = json.loads(data_dict[field_name]) + except ValueError: + pass + + for field_name in dataset_dict.keys(): + _parse_list_value(dataset_dict, field_name) + + for resource_dict in dataset_dict.get("resources", []): + for field_name in resource_dict.keys(): + _parse_list_value(resource_dict, field_name) + + # Repeating subfields + for schema_field in self._dataset_schema["dataset_fields"]: + if "repeating_subfields" in schema_field: + # Check if existing extras need to be migrated + field_name = schema_field["field_name"] + new_extras = [] + new_dict = {} + for extra in dataset_dict.get("extras", []): + if extra["key"].startswith(f"{field_name}_"): + subfield = extra["key"][extra["key"].index("_") + 1 :] + if subfield in [ + f["field_name"] for f in schema_field["repeating_subfields"] + ]: + new_dict[subfield] = extra["value"] + else: + new_extras.append(extra) + else: + new_extras.append(extra) + if new_dict: + dataset_dict[field_name] = [new_dict] + dataset_dict["extras"] = new_extras + + for schema_field in self._dataset_schema["resource_fields"]: + if "repeating_subfields" in schema_field: + # Check if value needs to be load from JSON + field_name = schema_field["field_name"] + for resource_dict in dataset_dict.get("resources", []): + if resource_dict.get(field_name) and isinstance( + resource_dict[field_name], str + ): + try: + # TODO: load only subfields in schema? + resource_dict[field_name] = json.loads( + resource_dict[field_name] + ) + except ValueError: + pass + + return dataset_dict + + def graph_from_dataset(self, dataset_dict, dataset_ref): + + contact = dataset_dict.get("contact") + if isinstance(contact, list) and len(contact): + for item in contact: + contact_uri = item.get("uri") + if contact_uri: + contact_details = CleanedURIRef(contact_uri) + else: + contact_details = BNode() + + self.g.add((contact_details, RDF.type, VCARD.Organization)) + self.g.add((dataset_ref, DCAT.contactPoint, contact_details)) + + self._add_triple_from_dict(item, contact_details, VCARD.fn, "name") + # Add mail address as URIRef, and ensure it has a mailto: prefix + self._add_triple_from_dict( + item, + contact_details, + VCARD.hasEmail, + "email", + _type=URIRef, + value_modifier=self._add_mailto, + ) + + publisher = dataset_dict.get("publisher") + if isinstance(publisher, list) and len(publisher): + publisher = publisher[0] + publisher_uri = publisher.get("uri") + if publisher_uri: + publisher_ref = CleanedURIRef(publisher_uri) + else: + publisher_ref = BNode() + + self.g.add((publisher_ref, RDF.type, FOAF.Organization)) + self.g.add((dataset_ref, DCT.publisher, publisher_ref)) + + self._add_triple_from_dict(publisher, publisher_ref, FOAF.name, "name") + self._add_triple_from_dict( + publisher, publisher_ref, FOAF.homepage, "url", URIRef + ) + self._add_triple_from_dict( + publisher, publisher_ref, DCT.type, "type", URIRefOrLiteral + ) + self._add_triple_from_dict( + publisher, + publisher_ref, + VCARD.hasEmail, + "email", + _type=URIRef, + value_modifier=self._add_mailto, + ) + + resources = dataset_dict.get("resources", []) + for resource in resources: + if resource.get("access_services"): + if isinstance(resource["access_services"], str): + try: + resource["access_services"] = json.loads( + resource["access_services"] + ) + except ValueError: + pass diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index ffa682b7..e2508e0f 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -20,9 +20,6 @@ LOCN, GSP, OWL, - SPDX, - GEOJSON_IMT, - DISTRIBUTION_LICENSE_FALLBACK_CONFIG, ) from ckanext.dcat.tests.utils import BaseSerializeTest, BaseParseTest