From 65abb1f7f535b27792ff2a8cd62f8b82aecb2603 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 8 May 2024 22:18:43 +0200 Subject: [PATCH 01/52] [#56] Allow to provide a dataset schema to profiles This allows to check if a field should be stored as a custom field or an extra --- ckanext/dcat/processors.py | 15 ++++++--- ckanext/dcat/profiles.py | 62 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 7 deletions(-) diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index e6093443..c45c5d1c 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py @@ -33,7 +33,7 @@ class RDFProcessor(object): - def __init__(self, profiles=None, compatibility_mode=False): + def __init__(self, profiles=None, dataset_schema='dataset', compatibility_mode=False): ''' Creates a parser or serializer instance @@ -56,6 +56,8 @@ def __init__(self, profiles=None, compatibility_mode=False): raise RDFProfileException( 'No suitable RDF profiles could be loaded') + self.dataset_schema = dataset_schema + if not compatibility_mode: compatibility_mode = p.toolkit.asbool( config.get(COMPAT_MODE_CONFIG_OPTION, False)) @@ -177,11 +179,16 @@ def datasets(self): for dataset_ref in self._datasets(): dataset_dict = {} for profile_class in self._profiles: - profile = profile_class(self.g, self.compatibility_mode) + profile = profile_class( + self.g, + dataset_schema=self.dataset_schema, + compatibility_mode=self.compatibility_mode + ) profile.parse_dataset(dataset_dict, dataset_ref) yield dataset_dict + class RDFSerializer(RDFProcessor): ''' A CKAN to RDF serializer based on rdflib @@ -245,7 +252,7 @@ def graph_from_dataset(self, dataset_dict): dataset_ref = URIRef(dataset_uri(dataset_dict)) for profile_class in self._profiles: - profile = profile_class(self.g, self.compatibility_mode) + profile = profile_class(self.g, compatibility_mode=self.compatibility_mode) profile.graph_from_dataset(dataset_dict, dataset_ref) return dataset_ref @@ -263,7 +270,7 
@@ def graph_from_catalog(self, catalog_dict=None): catalog_ref = URIRef(catalog_uri()) for profile_class in self._profiles: - profile = profile_class(self.g, self.compatibility_mode) + profile = profile_class(self.g, compatibility_mode=self.compatibility_mode) profile.graph_from_catalog(catalog_dict, catalog_ref) return catalog_ref diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index c1ced34f..efe62dca 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -5,6 +5,7 @@ from dateutil.parser import parse as parse_date +import ckantoolkit as toolkit from ckantoolkit import config from ckantoolkit import url_for @@ -15,7 +16,6 @@ from geomet import wkt, InvalidGeoJSONException from ckan.model.license import LicenseRegister -from ckan.plugins import toolkit from ckan.lib.munge import munge_tag from ckan.lib.helpers import resource_formats from ckanext.dcat.utils import resource_uri, publisher_uri_organization_fallback, DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS @@ -55,6 +55,19 @@ DISTRIBUTION_LICENSE_FALLBACK_CONFIG = 'ckanext.dcat.resource.inherit.license' +ROOT_DATASET_FIELDS = [ + 'name', + 'title', + 'url', + 'version', + 'tags', + 'license_id', + 'maintainer', + 'maintainer_email', + 'author', + 'author_email', +] + class URIRefOrLiteral(object): '''Helper which creates an URIRef if the value appears to be an http URL, @@ -111,7 +124,9 @@ class RDFProfile(object): custom profiles ''' - def __init__(self, graph, compatibility_mode=False): + _dataset_schema = None + + def __init__(self, graph, dataset_schema='dataset', compatibility_mode=False): '''Class constructor Graph is an rdflib.Graph instance. @@ -130,6 +145,15 @@ def __init__(self, graph, compatibility_mode=False): # _license(). 
self._licenceregister_cache = None + schema_show = toolkit.get_action("scheming_dataset_schema_show") + if schema_show: + try: + schema = schema_show({}, {"type": dataset_schema}) + except toolkit.ObjectNotFound: + raise toolkit.ObjectNotFound(f"Unknown dataset schema: {dataset_schema}") + + self._dataset_schema = schema + def _datasets(self): ''' Generator that returns all DCAT datasets on the graph @@ -695,6 +719,38 @@ def _add_spatial_to_dict(self, dataset_dict, key, spatial): {'key': 'spatial_{0}'.format(key) if key != 'geom' else 'spatial', 'value': spatial.get(key)}) + def _schema_field(self, key): + ''' + Returns the schema field information if the provided key exists as a field in + the dataset schema (if one was provided) + ''' + if not self._dataset_schema: + return None + + for field in self._dataset_schema['dataset_fields']: + if field['field_name'] == key: + return field + + def _set_dataset_value(self, dataset_dict, key, value): + ''' + Sets the value for a given key in a CKAN dataset dict + + If a dataset schema was provided, the schema will be checked to see if + a custom field is present for the key. If so the key will be stored at + the dict root level, otherwise it will be stored as an extra. + + Standard CKAN fields (defined in ROOT_DATASET_FIELDS) are always stored + at the root level. 
+ ''' + if self._schema_field(key) or key in ROOT_DATASET_FIELDS: + dataset_dict[key] = value + else: + if not dataset_dict.get('extras'): + dataset_dict['extras'] = [] + dataset_dict['extras'].append({'key': key, 'value': value}) + + return dataset_dict + def _get_dataset_value(self, dataset_dict, key, default=None): ''' Returns the value for the given key on a CKAN dict @@ -1021,7 +1077,7 @@ def parse_dataset(self, dataset_dict, dataset_ref): ): value = self._object_value(dataset_ref, predicate) if value: - dataset_dict['extras'].append({'key': key, 'value': value}) + self._set_dataset_value(dataset_dict, key, value) # Lists for key, predicate, in ( From 9faf5f5428f3e23a8824ea8836536bae67e1ccb2 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 8 May 2024 22:20:18 +0200 Subject: [PATCH 02/52] [#56] Handle list values --- ckanext/dcat/profiles.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index efe62dca..e5abab45 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -751,6 +751,13 @@ def _set_dataset_value(self, dataset_dict, key, value): return dataset_dict + def _set_list_dataset_value(self, dataset_dict, key, value): + schema_field = self._schema_field(key) + if schema_field and 'scheming_multiple_text' in schema_field['validators']: + return self._set_dataset_value(dataset_dict, key, value) + else: + return self._set_dataset_value(dataset_dict, key, json.dumps(value)) + def _get_dataset_value(self, dataset_dict, key, default=None): ''' Returns the value for the given key on a CKAN dict @@ -1094,8 +1101,7 @@ def parse_dataset(self, dataset_dict, dataset_ref): ): values = self._object_value_list(dataset_ref, predicate) if values: - dataset_dict['extras'].append({'key': key, - 'value': json.dumps(values)}) + self._set_list_dataset_value(dataset_dict, key, values) # Contact details contact = self._contact_details(dataset_ref, DCAT.contactPoint) From 
a808f7230279e1c1f04b5800856a7d90ef56e50f Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 8 May 2024 22:20:44 +0200 Subject: [PATCH 03/52] [#56] Handle repeating subfields --- ckanext/dcat/plugins/__init__.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index 617e8d4b..4a40d88b 100644 --- a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -132,6 +132,26 @@ def set_titles(object_dict): return data_dict + def before_dataset_index(self, dataset_dict): + schema = None + schema_show = toolkit.get_action("scheming_dataset_schema_show") + if schema_show: + try: + schema = schema_show({}, {"type": dataset_dict["type"]}) + except toolkit.ObjectNotFound: + pass + + if schema: + for field in schemas[dataset_dict['type']]['dataset_fields']: + if field['field_name'] not in dataset_dict and 'repeating_subfields' in field: + for key in dataset_dict[field['field_name']]: + # Index a flattened version + new_key = f'{field["field_name"]}_{key}' + dataset_dict[new_key] = dataset_dict[field['field_name']][key] + dataset_dict.pop(field['field_name'], None) + + return dataset_dict + class DCATJSONInterface(p.SingletonPlugin): p.implements(p.IActions) From d0b219e338dfdc9aff2cf75855d4f9b49f45ab79 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 8 May 2024 22:21:01 +0200 Subject: [PATCH 04/52] [#56] Add draft schema --- ckanext/dcat/schemas/__init__.py | 0 ckanext/dcat/schemas/dcat_ap_2.1.yaml | 116 ++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 ckanext/dcat/schemas/__init__.py create mode 100644 ckanext/dcat/schemas/dcat_ap_2.1.yaml diff --git a/ckanext/dcat/schemas/__init__.py b/ckanext/dcat/schemas/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml new file mode 100644 index 00000000..aff9ecce --- /dev/null +++ 
b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -0,0 +1,116 @@ +scheming_version: 2 +dataset_type: dataset +about: A reimplementation of the default CKAN dataset schema +about_url: http://github.com/ckan/ckanext-dcat + + +dataset_fields: + +- field_name: title + label: Title + preset: title + required: true + form_placeholder: eg. A descriptive title + +- field_name: name + label: URL + preset: dataset_slug + form_placeholder: eg. my-dataset + +- field_name: notes + label: Description + required: true + form_snippet: markdown.html + form_placeholder: eg. Some useful notes about the data + +- field_name: tag_string + label: Tags + preset: tag_string_autocomplete + form_placeholder: eg. economy, mental health, government + +- field_name: license_id + label: License + form_snippet: license.html + help_text: License definitions and additional information can be found at http://opendefinition.org/ + +- field_name: owner_org + label: Organization + preset: dataset_organization + +- field_name: url + label: Source + form_placeholder: http://example.com/dataset.json + display_property: foaf:homepage + display_snippet: link.html + +- field_name: version + label: Version + validators: ignore_missing unicode_safe package_version_validator + form_placeholder: '1.0' + +- field_name: author + label: Author + form_placeholder: Joe Bloggs + display_property: dc:creator + +- field_name: author_email + label: Author Email + form_placeholder: joe@example.com + display_property: dc:creator + display_snippet: email.html + display_email_name_field: author + +- field_name: maintainer + label: Maintainer + form_placeholder: Joe Bloggs + display_property: dc:contributor + +- field_name: maintainer_email + label: Maintainer Email + form_placeholder: joe@example.com + display_property: dc:contributor + display_snippet: email.html + display_email_name_field: maintainer + +- field_name: contact + label: Contact points + repeating_label: Contact point + repeating_subfields: + + - field_name: uri + 
label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + +- field_name: version_notes + label: Version notes + validators: ignore_missing unicode_safe + form_snippet: markdown.html + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + + +resource_fields: + +- field_name: url + label: URL + preset: resource_url_upload + +- field_name: name + label: Name + form_placeholder: eg. January 2011 Gold Prices + +- field_name: description + label: Description + form_snippet: markdown.html + form_placeholder: Some useful notes about the data + +- field_name: format + label: Format + preset: resource_format_autocomplete From 7ee354ac31cefefd2b2995a01c8054c48bfb82fb Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 8 May 2024 22:21:16 +0200 Subject: [PATCH 05/52] [#56] Add some examples --- examples/dataset_gob_es.ttl | 52 +++++++ examples/dataset_gov_de.rdf | 263 ++++++++++++++++++++++++++++++++++++ 2 files changed, 315 insertions(+) create mode 100644 examples/dataset_gob_es.ttl create mode 100644 examples/dataset_gov_de.rdf diff --git a/examples/dataset_gob_es.ttl b/examples/dataset_gob_es.ttl new file mode 100644 index 00000000..70742f62 --- /dev/null +++ b/examples/dataset_gob_es.ttl @@ -0,0 +1,52 @@ +@prefix adms: . +@prefix dcat: . +@prefix dct: . +@prefix foaf: . +@prefix gsp: . +@prefix locn: . +@prefix owl: . +@prefix rdf: . +@prefix rdfs: . +@prefix schema: . +@prefix skos: . +@prefix time: . +@prefix vcard: . +@prefix xml: . +@prefix xsd: . 
+ + a dcat:Dataset ; + dct:accrualPeriodicity ; + dct:description "Estudio de satisfacción de las personas directivas de los centros educativos con las escuelas municipales de promoción deportiva en horario extraescolar, organizadas desde la Dirección General de Deporte en colaboración con las diferentes federaciones deportivas de las diferentes modalidades deportivas, mediante el análisis de los resultados obtenidos al realizar una encuesta enviada al personal directivo de cada centro escolar a su correo electrónico invitándoles a participar. Las Escuelas Municipales de Promoción Deportiva en centros escolares tienen como finalidad fomentar una práctica deportiva estable y continuada entre los escolares de diferentes modalidades deportivas y la posterior iniciación a la competición con la participación de las escuelas en los Juegos Deportivos Municipales. El objeto de este estudio es conocer el grado de satisfacción del personal directivo con las Escuelas Municipales de Promoción Deportiva (EMPD) impartidas en los centros escolares de la ciudad de Madrid, mediante la realización de diversas preguntas relacionadas con los servicios ofrecidos (organización, instalaciones y profesorado). 
De esta manera se pueden detectar ámbitos o actuaciones que precisen intervenciones de mejora para alcanzar el objetivo general del Plan de Calidad establecido en el Ayuntamiento de Madrid: garantizar la calidad de los servicios prestados a la ciudadanía y su mejora continua, logrando la satisfacción ciudadana y alcanzando una gestión pública cada vez más eficaz y eficiente, participativa y transparente."@es ; + dct:identifier "https://datos.madrid.es/egob/catalogo/300676-0-deporte-encuesta-escuelas" ; + dct:issued "2024-04-15T09:08:02+02:00"^^xsd:dateTime ; + dct:language "es" ; + dct:license ; + dct:modified "2024-04-15T09:08:10+02:00"^^xsd:dateTime ; + dct:publisher ; + dct:title "Estudio de Satisfacción con el Programa de Escuelas Municipales de Promoción Deportiva en centros escolares"@es ; + dcat:distribution ; + dcat:keyword "deporte escolar"@es, + "escuelas"@es, + "promoción"@es ; + dcat:theme . + + a skos:Concept ; + skos:notation "L01280796" ; + skos:prefLabel "Ayuntamiento de Madrid" . + + a time:DurationDescription ; + time:years 1.0 . + + a dct:Frequency ; + rdf:value . + + a dcat:Distribution ; + dct:format ; + dct:title "Estudio de satisfacción temporada 2023-2024"@es ; + dcat:accessURL "https://datos.madrid.es/egob/catalogo/300676-0-deporte-encuesta-escuelas.xlsx" ; + dcat:byteSize 43008.0 . + + a dct:IMT ; + rdfs:label "XLS" ; + rdf:value "application/vnd.ms-excel" . + diff --git a/examples/dataset_gov_de.rdf b/examples/dataset_gov_de.rdf new file mode 100644 index 00000000..f2640e94 --- /dev/null +++ b/examples/dataset_gov_de.rdf @@ -0,0 +1,263 @@ + + + + Liefer- und Abholservices (Gastronomie) + Dieser Datensatz umfasst die Standorte der Liefer- und Abholservices (Gastronomie) in der Hanse- und Universitätsstadt Rostock mit Informationen zu Adresse, Art, Bezeichnung, Barrierefreiheit, Öffnungszeiten und Kontaktdaten. Die Ressourcen werden nur bei Bedarf aktualisiert. 
+ + c1be4007-d811-48fb-8818-2cb358b06a63 + + gastronomiebetriebe + handel + handel-und-verbrauch + lebensmittel + nahrung + nahrungsmittelgewerbe + wirtschaft + ökonomie + 2020-04-21T11:59:39.642528 + 2024-05-08T04:11:24.309486 + + + http://dcat-ap.de/def/dcatde/ + + + Hanse- und Universitätsstadt Rostock – Kataster-, Vermessungs- und Liegenschaftsamt + + + + + + Hanse- und Universitätsstadt Rostock + + + + + {"type":"Polygon","coordinates":[[[11.9936, 54.0486], [11.9936, 54.2466], [12.3045, 54.2466], [12.3045, 54.0486], [11.9936, 54.0486]]]} + POLYGON ((11.9936 54.0486, 11.9936 54.2466, 12.3045 54.2466, 12.3045 54.0486, 11.9936 54.0486)) + + + + + + + + + + Liefer- und Abholservices (Gastronomie) + + + + + + + http://dcat-ap.de/def/dcatde/ + + + 2020-04-21T11:59:53 + 2024-05-07T11:58:19 + 147253.0 + + + 1d5a3926327046c8fcd75652c6e44d3c74004c6727939f33fb05e16e324df241 + + + + + + + + + + Liefer- und Abholservices (Gastronomie) + + + + + + http://dcat-ap.de/def/dcatde/ + + 2020-04-21T11:59:53 + 2024-05-07T11:58:19 + + + + + + + Liefer- und Abholservices (Gastronomie) + + + + + + http://dcat-ap.de/def/dcatde/ + + 2020-04-21T11:59:53 + 2024-05-07T11:58:19 + + + + + + + Liefer- und Abholservices (Gastronomie) + Diese Daten umfassen die Standorte der Liefer- und Abholservices (Gastronomie) in der Hanse- und Universitätsstadt Rostock mit Informationen zu Adresse, Art, Bezeichnung, Öffnungszeiten und Kontaktdaten. + + + + + + http://dcat-ap.de/def/dcatde/ + 2023-02-08T12:25:41 + 2024-05-07T11:58:19 + + + + + + + Liefer- und Abholservices (Gastronomie) + Diese Karte umfasst die Standorte der Liefer- und Abholservices (Gastronomie) in der Hanse- und Universitätsstadt Rostock. 
+ + + + + + http://dcat-ap.de/def/dcatde/ + 2023-02-08T12:25:41 + 2024-05-07T11:58:19 + + + + + + + Liefer- und Abholservices (Gastronomie) + + + + + + + http://dcat-ap.de/def/dcatde/ + + + 2020-04-21T11:59:53 + 2024-05-07T11:58:19 + 27256.0 + + + f2f17b94406047f148bfa41fb5945eb9f2023d6661a3806031baba519de7ba73 + + + + + + + + + + Liefer- und Abholservices (Gastronomie) + + + + + + + http://dcat-ap.de/def/dcatde/ + + + 2020-04-21T11:59:53 + 2024-05-07T11:58:19 + 60744.0 + + + 7cc6d258c20d5b5b507c8466900a381de8a349bb1a8118361c1a613e1b59fb14 + + + + + + + + + + Liefer- und Abholservices (Gastronomie) + + + + + + + http://dcat-ap.de/def/dcatde/ + + + 2020-04-21T11:59:53 + 2024-05-07T11:58:19 + 308724.0 + + + a407dd69e91861d98589ea9b14c39aa6972d56c2ce52d2dd3f0e0be8922fc6b7 + + + + + + + + + + Liefer- und Abholservices (Gastronomie) + + + + + + + http://dcat-ap.de/def/dcatde/ + + + 2020-04-21T11:59:53 + 2024-05-07T11:58:19 + 241844.0 + + + 788e8c33012ee44fbbfa1589277579fdc34211a6326962624789e182680307f2 + + + + + + + + + + + + + Rostock + + + Hanse- und Universitätsstadt Rostock – Kataster-, Vermessungs- und Liegenschaftsamt + geodienste@rostock.de + + + + + Hanse- und Universitätsstadt Rostock – Kataster-, Vermessungs- und Liegenschaftsamt + geodienste@rostock.de + + + + From 9b847e99c68d0790cbf0e0861e2b4af2cf9ea2f3 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 9 May 2024 13:04:41 +0200 Subject: [PATCH 06/52] [#56] Fix repeating subfields index logic --- ckanext/dcat/plugins/__init__.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index 4a40d88b..6ff59f14 100644 --- a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -134,20 +134,22 @@ def set_titles(object_dict): def before_dataset_index(self, dataset_dict): schema = None - schema_show = toolkit.get_action("scheming_dataset_schema_show") + schema_show = 
p.toolkit.get_action("scheming_dataset_schema_show") if schema_show: try: schema = schema_show({}, {"type": dataset_dict["type"]}) - except toolkit.ObjectNotFound: + except p.toolkit.ObjectNotFound: pass if schema: - for field in schemas[dataset_dict['type']]['dataset_fields']: - if field['field_name'] not in dataset_dict and 'repeating_subfields' in field: - for key in dataset_dict[field['field_name']]: - # Index a flattened version - new_key = f'{field["field_name"]}_{key}' - dataset_dict[new_key] = dataset_dict[field['field_name']][key] + for field in schema['dataset_fields']: + if field['field_name'] in dataset_dict and 'repeating_subfields' in field: + for index, item in enumerate(dataset_dict[field['field_name']]): + for key in item: + # Index a flattened version + new_key = f'{field["field_name"]}_{index}_{key}' + + dataset_dict[new_key] = dataset_dict[field['field_name']][index][key] dataset_dict.pop(field['field_name'], None) return dataset_dict From e6583aad52f44deb229277d03cb4f74bcd9bc5ba Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 9 May 2024 16:24:02 +0200 Subject: [PATCH 07/52] [#56] [#56] Initial e2e scheming support test --- ckanext/dcat/tests/test_scheming_support.py | 94 +++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 ckanext/dcat/tests/test_scheming_support.py diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py new file mode 100644 index 00000000..cbba044e --- /dev/null +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -0,0 +1,94 @@ +import pytest + +from rdflib.namespace import RDF + +from ckan.tests.helpers import call_action + +from ckanext.dcat import utils +from ckanext.dcat.processors import RDFSerializer +from ckanext.dcat.profiles import ( + DCAT, + DCT, + ADMS, + XSD, + VCARD, + FOAF, + SCHEMA, + SKOS, + LOCN, + GSP, + OWL, + SPDX, + GEOJSON_IMT, + DISTRIBUTION_LICENSE_FALLBACK_CONFIG, +) +from ckanext.dcat.tests.utils import BaseSerializeTest + + 
+@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" +) +@pytest.mark.ckan_config("scheming.presets", "ckanext.scheming:presets.json") +class TestSchemingSupport(BaseSerializeTest): + def test_e2e_ckan_to_dcat(self): + """ + Create a dataset using the scheming schema, check that fields + are exposed in the DCAT RDF graph + """ + + dataset_dict = { + # Core fields + "name": "test-dataset", + "title": "Test DCAT dataset", + "notes": "Lorem ipsum", + "url": "http://example.org/ds1", + "version": "1.0b", + "tags": [{"name": "Tag 1"}, {"name": "Tag 2"}], + # Standard fields + "version_notes": "Some version notes", + # List fields (lists) + "conforms_to": ["Standard 1", "Standard 2"], + # Repeating subfields + "contact": [ + {"name": "Contact 1", "email": "contact1@example.org"}, + {"name": "Contact 2", "email": "contact2@example.org"}, + ], + # TODO: resources + } + + dataset = call_action("package_create", **dataset_dict) + + # Make sure schema was used + assert dataset["conforms_to"][0] == "Standard 1" + assert dataset["contact"][0]["name"] == "Contact 1" + + s = RDFSerializer(profiles=["euro_dcat_ap"]) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + assert str(dataset_ref) == utils.dataset_uri(dataset) + + # Core fields + assert self._triple(g, dataset_ref, RDF.type, DCAT.Dataset) + assert self._triple(g, dataset_ref, DCT.title, dataset["title"]) + assert self._triple(g, dataset_ref, DCT.description, dataset["notes"]) + + # Standard fields + assert self._triple(g, dataset_ref, ADMS.versionNotes, dataset["version_notes"]) + + # List fields + # TODO helper function + conforms = [t for t in g.triples((dataset_ref, DCT.conformsTo, None))] + assert len(conforms) == len(dataset["conforms_to"]) + for index, item in enumerate(conforms): + assert str(item[2]) == dataset["conforms_to"][index] + + # 
Repeating subfields + + contact_details = [t for t in g.triples((dataset_ref, DCAT.contactPoint, None))] + + # TODO this will fail + assert len(contact_details) == len(dataset["contact"]) From d86f46783b168e26acd4238995258cb00fa5fc39 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 14 May 2024 14:44:25 +0200 Subject: [PATCH 08/52] [#56] Serialize repeating subfields --- ckanext/dcat/profiles.py | 26 +++++++++++++++++++++ ckanext/dcat/tests/test_scheming_support.py | 5 +++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index e5abab45..26001285 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -1336,6 +1336,32 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): _type=URIRef, value_modifier=self._add_mailto ) + # TODO: this will go into a separate profile + contact = dataset_dict.get("contact") + if isinstance(contact, list) and len(contact): + for item in contact: + contact_uri = item.get('uri') + if contact_uri: + contact_details = CleanedURIRef(contact_uri) + else: + contact_details = BNode() + + g.add((contact_details, RDF.type, VCARD.Organization)) + g.add((dataset_ref, DCAT.contactPoint, contact_details)) + + self._add_triple_from_dict( + item, contact_details, + VCARD.fn, 'name' + ) + # Add mail address as URIRef, and ensure it has a mailto: prefix + self._add_triple_from_dict( + item, contact_details, + VCARD.hasEmail, 'email', + _type=URIRef, value_modifier=self._add_mailto + ) + + + # Publisher if any([ self._get_dataset_value(dataset_dict, 'publisher_uri'), diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index cbba044e..3769f361 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -90,5 +90,8 @@ def test_e2e_ckan_to_dcat(self): contact_details = [t for t in g.triples((dataset_ref, DCAT.contactPoint, None))] - # TODO this will fail assert 
len(contact_details) == len(dataset["contact"]) + self._triple(g, contact_details[0][2], VCARD.fn, dataset_dict["contact"][0]["name"]) + self._triple(g, contact_details[0][2], VCARD.hasEmail, dataset_dict["contact"][0]["email"]) + self._triple(g, contact_details[1][2], VCARD.fn, dataset_dict["contact"][1]["name"]) + self._triple(g, contact_details[1][2], VCARD.hasEmail, dataset_dict["contact"][1]["email"]) From 000baa46273a853c92290b8885b9f45a73653cfc Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 15 May 2024 16:02:09 +0200 Subject: [PATCH 09/52] [#56] Add sample of resource fields --- ckanext/dcat/profiles.py | 86 ++++++++++---------- ckanext/dcat/schemas/dcat_ap_2.1.yaml | 25 ++++++ ckanext/dcat/tests/test_scheming_support.py | 88 ++++++++++++++++++--- 3 files changed, 149 insertions(+), 50 deletions(-) diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index 26001285..02bd395c 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -1752,47 +1752,53 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ] self._add_list_triples_from_dict(resource_dict, distribution, items) - try: - access_service_list = json.loads(resource_dict.get('access_services', '[]')) - # Access service - for access_service_dict in access_service_list: - - access_service_uri = access_service_dict.get('uri') - if access_service_uri: - access_service_node = CleanedURIRef(access_service_uri) - else: - access_service_node = BNode() - # Remember the (internal) access service reference for referencing in - # further profiles - access_service_dict['access_service_ref'] = str(access_service_node) - - self.g.add((distribution, DCAT.accessService, access_service_node)) - - self.g.add((access_service_node, RDF.type, DCAT.DataService)) - - # Simple values - items = [ - ('availability', DCATAP.availability, None, URIRefOrLiteral), - ('license', DCT.license, None, URIRefOrLiteral), - ('access_rights', DCT.accessRights, None, URIRefOrLiteral), - ('title', 
DCT.title, None, Literal), - ('endpoint_description', DCAT.endpointDescription, None, Literal), - ('description', DCT.description, None, Literal), - ] - - self._add_triples_from_dict(access_service_dict, access_service_node, items) + # TODO: this will go into a separate profile - # Lists - items = [ - ('endpoint_url', DCAT.endpointURL, None, URIRefOrLiteral), - ('serves_dataset', DCAT.servesDataset, None, URIRefOrLiteral), - ] - self._add_list_triples_from_dict(access_service_dict, access_service_node, items) - - if access_service_list: - resource_dict['access_services'] = json.dumps(access_service_list) - except ValueError: - pass + access_service_list = resource_dict.get('access_services', []) + if isinstance(access_service_list, str): + try: + access_service_list = json.loads(access_service_list) + except ValueError: + access_service_list = [] + + # Access service + for access_service_dict in access_service_list: + + access_service_uri = access_service_dict.get('uri') + if access_service_uri: + access_service_node = CleanedURIRef(access_service_uri) + else: + access_service_node = BNode() + # Remember the (internal) access service reference for referencing in + # further profiles + access_service_dict['access_service_ref'] = str(access_service_node) + + self.g.add((distribution, DCAT.accessService, access_service_node)) + + self.g.add((access_service_node, RDF.type, DCAT.DataService)) + + # Simple values + items = [ + ('availability', DCATAP.availability, None, URIRefOrLiteral), + ('license', DCT.license, None, URIRefOrLiteral), + ('access_rights', DCT.accessRights, None, URIRefOrLiteral), + ('title', DCT.title, None, Literal), + ('endpoint_description', DCAT.endpointDescription, None, Literal), + ('description', DCT.description, None, Literal), + ] + + self._add_triples_from_dict(access_service_dict, access_service_node, items) + + # Lists + items = [ + ('endpoint_url', DCAT.endpointURL, None, URIRefOrLiteral), + ('serves_dataset', DCAT.servesDataset, None, 
URIRefOrLiteral), + ] + self._add_list_triples_from_dict(access_service_dict, access_service_node, items) + + # TODO: re-enable when separating into a profile + # if access_service_list: + # resource_dict['access_services'] = json.dumps(access_service_list) def graph_from_catalog(self, catalog_dict, catalog_ref): diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index aff9ecce..20edc599 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -94,6 +94,7 @@ dataset_fields: - field_name: conforms_to label: Conforms to preset: multiple_text + validators: ignore_missing scheming_multiple_text resource_fields: @@ -114,3 +115,27 @@ resource_fields: - field_name: format label: Format preset: resource_format_autocomplete + +- field_name: rights + label: Rights + form_snippet: markdown.html + form_placeholder: Some statement about the rights associated with the resource + +- field_name: language + label: Language + preset: multiple_text + +- field_name: access_services + label: Access services + repeating_label: Access service + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: title + label: Title + + - field_name: endpoint_url + label: Endpoint URL + preset: multiple_text diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 3769f361..aa49551a 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -55,7 +55,24 @@ def test_e2e_ckan_to_dcat(self): {"name": "Contact 1", "email": "contact1@example.org"}, {"name": "Contact 2", "email": "contact2@example.org"}, ], - # TODO: resources + "resources": [ + { + "name": "Resource 1", + "url": "https://example.com/data.csv", + "format": "CSV", + "rights": "Some stament about rights", + "language": ["en", "ca", "es"], + "access_services": [ + { + "title": "Access Service 1", + "endpoint_url": [ + 
"https://example.org/access_service/1", + "https://example.org/access_service/2", + ], + } + ], + } + ], } dataset = call_action("package_create", **dataset_dict) @@ -64,7 +81,7 @@ def test_e2e_ckan_to_dcat(self): assert dataset["conforms_to"][0] == "Standard 1" assert dataset["contact"][0]["name"] == "Contact 1" - s = RDFSerializer(profiles=["euro_dcat_ap"]) + s = RDFSerializer(profiles=["euro_dcat_ap_2"]) g = s.g dataset_ref = s.graph_from_dataset(dataset) @@ -81,17 +98,68 @@ def test_e2e_ckan_to_dcat(self): # List fields # TODO helper function - conforms = [t for t in g.triples((dataset_ref, DCT.conformsTo, None))] - assert len(conforms) == len(dataset["conforms_to"]) - for index, item in enumerate(conforms): - assert str(item[2]) == dataset["conforms_to"][index] + conforms_to = [ + str(t[2]) for t in g.triples((dataset_ref, DCT.conformsTo, None)) + ] + assert conforms_to == dataset["conforms_to"] # Repeating subfields contact_details = [t for t in g.triples((dataset_ref, DCAT.contactPoint, None))] assert len(contact_details) == len(dataset["contact"]) - self._triple(g, contact_details[0][2], VCARD.fn, dataset_dict["contact"][0]["name"]) - self._triple(g, contact_details[0][2], VCARD.hasEmail, dataset_dict["contact"][0]["email"]) - self._triple(g, contact_details[1][2], VCARD.fn, dataset_dict["contact"][1]["name"]) - self._triple(g, contact_details[1][2], VCARD.hasEmail, dataset_dict["contact"][1]["email"]) + self._triple( + g, contact_details[0][2], VCARD.fn, dataset_dict["contact"][0]["name"] + ) + self._triple( + g, + contact_details[0][2], + VCARD.hasEmail, + dataset_dict["contact"][0]["email"], + ) + self._triple( + g, contact_details[1][2], VCARD.fn, dataset_dict["contact"][1]["name"] + ) + self._triple( + g, + contact_details[1][2], + VCARD.hasEmail, + dataset_dict["contact"][1]["email"], + ) + + distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + + # Resources: standard fields + + assert self._triple( + g, distribution_ref, 
DCT.rights, dataset_dict["resources"][0]["rights"] + ) + + # Resources: list fields + + language = [ + str(t[2]) for t in g.triples((distribution_ref, DCT.language, None)) + ] + assert language == dataset_dict["resources"][0]["language"] + + # Resource: repeating subfields + access_services = [ + t for t in g.triples((distribution_ref, DCAT.accessService, None)) + ] + + assert len(access_services) == len(dataset["resources"][0]["access_services"]) + self._triple( + g, + access_services[0][2], + DCT.title, + dataset_dict["resources"][0]["access_services"][0]["title"], + ) + + endpoint_urls = [ + str(t[2]) + for t in g.triples((access_services[0][2], DCAT.endpointURL, None)) + ] + assert ( + endpoint_urls + == dataset_dict["resources"][0]["access_services"][0]["endpoint_url"] + ) From 2d8d969906886b8714cb3fb636698c24c8428fea Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 14 May 2024 14:44:25 +0200 Subject: [PATCH 10/52] [#56] [#56] Serialize repeating subfields --- ckanext/dcat/profiles.py | 26 +++++++++++++++++++++ ckanext/dcat/tests/test_scheming_support.py | 5 +++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index e5abab45..26001285 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -1336,6 +1336,32 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): _type=URIRef, value_modifier=self._add_mailto ) + # TODO: this will go into a separate profile + contact = dataset_dict.get("contact") + if isinstance(contact, list) and len(contact): + for item in contact: + contact_uri = item.get('uri') + if contact_uri: + contact_details = CleanedURIRef(contact_uri) + else: + contact_details = BNode() + + g.add((contact_details, RDF.type, VCARD.Organization)) + g.add((dataset_ref, DCAT.contactPoint, contact_details)) + + self._add_triple_from_dict( + item, contact_details, + VCARD.fn, 'name' + ) + # Add mail address as URIRef, and ensure it has a mailto: prefix + 
self._add_triple_from_dict( + item, contact_details, + VCARD.hasEmail, 'email', + _type=URIRef, value_modifier=self._add_mailto + ) + + + # Publisher if any([ self._get_dataset_value(dataset_dict, 'publisher_uri'), diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index cbba044e..3769f361 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -90,5 +90,8 @@ def test_e2e_ckan_to_dcat(self): contact_details = [t for t in g.triples((dataset_ref, DCAT.contactPoint, None))] - # TODO this will fail assert len(contact_details) == len(dataset["contact"]) + self._triple(g, contact_details[0][2], VCARD.fn, dataset_dict["contact"][0]["name"]) + self._triple(g, contact_details[0][2], VCARD.hasEmail, dataset_dict["contact"][0]["email"]) + self._triple(g, contact_details[1][2], VCARD.fn, dataset_dict["contact"][1]["name"]) + self._triple(g, contact_details[1][2], VCARD.hasEmail, dataset_dict["contact"][1]["email"]) From c5865fb03f9812e8be8f311b7524420074942131 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 15 May 2024 16:02:09 +0200 Subject: [PATCH 11/52] [#56] [#56] Add sample of resource fields --- ckanext/dcat/profiles.py | 86 ++++++++++---------- ckanext/dcat/schemas/dcat_ap_2.1.yaml | 25 ++++++ ckanext/dcat/tests/test_scheming_support.py | 88 ++++++++++++++++++--- 3 files changed, 149 insertions(+), 50 deletions(-) diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index 26001285..02bd395c 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -1752,47 +1752,53 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ] self._add_list_triples_from_dict(resource_dict, distribution, items) - try: - access_service_list = json.loads(resource_dict.get('access_services', '[]')) - # Access service - for access_service_dict in access_service_list: - - access_service_uri = access_service_dict.get('uri') - if access_service_uri: - 
access_service_node = CleanedURIRef(access_service_uri) - else: - access_service_node = BNode() - # Remember the (internal) access service reference for referencing in - # further profiles - access_service_dict['access_service_ref'] = str(access_service_node) - - self.g.add((distribution, DCAT.accessService, access_service_node)) - - self.g.add((access_service_node, RDF.type, DCAT.DataService)) - - # Simple values - items = [ - ('availability', DCATAP.availability, None, URIRefOrLiteral), - ('license', DCT.license, None, URIRefOrLiteral), - ('access_rights', DCT.accessRights, None, URIRefOrLiteral), - ('title', DCT.title, None, Literal), - ('endpoint_description', DCAT.endpointDescription, None, Literal), - ('description', DCT.description, None, Literal), - ] - - self._add_triples_from_dict(access_service_dict, access_service_node, items) + # TODO: this will go into a separate profile - # Lists - items = [ - ('endpoint_url', DCAT.endpointURL, None, URIRefOrLiteral), - ('serves_dataset', DCAT.servesDataset, None, URIRefOrLiteral), - ] - self._add_list_triples_from_dict(access_service_dict, access_service_node, items) - - if access_service_list: - resource_dict['access_services'] = json.dumps(access_service_list) - except ValueError: - pass + access_service_list = resource_dict.get('access_services', []) + if isinstance(access_service_list, str): + try: + access_service_list = json.loads(access_service_list) + except ValueError: + access_service_list = [] + + # Access service + for access_service_dict in access_service_list: + + access_service_uri = access_service_dict.get('uri') + if access_service_uri: + access_service_node = CleanedURIRef(access_service_uri) + else: + access_service_node = BNode() + # Remember the (internal) access service reference for referencing in + # further profiles + access_service_dict['access_service_ref'] = str(access_service_node) + + self.g.add((distribution, DCAT.accessService, access_service_node)) + + 
self.g.add((access_service_node, RDF.type, DCAT.DataService)) + + # Simple values + items = [ + ('availability', DCATAP.availability, None, URIRefOrLiteral), + ('license', DCT.license, None, URIRefOrLiteral), + ('access_rights', DCT.accessRights, None, URIRefOrLiteral), + ('title', DCT.title, None, Literal), + ('endpoint_description', DCAT.endpointDescription, None, Literal), + ('description', DCT.description, None, Literal), + ] + + self._add_triples_from_dict(access_service_dict, access_service_node, items) + + # Lists + items = [ + ('endpoint_url', DCAT.endpointURL, None, URIRefOrLiteral), + ('serves_dataset', DCAT.servesDataset, None, URIRefOrLiteral), + ] + self._add_list_triples_from_dict(access_service_dict, access_service_node, items) + + # TODO: re-enable when separating into a profile + # if access_service_list: + # resource_dict['access_services'] = json.dumps(access_service_list) def graph_from_catalog(self, catalog_dict, catalog_ref): diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index aff9ecce..20edc599 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -94,6 +94,7 @@ dataset_fields: - field_name: conforms_to label: Conforms to preset: multiple_text + validators: ignore_missing scheming_multiple_text resource_fields: @@ -114,3 +115,27 @@ resource_fields: - field_name: format label: Format preset: resource_format_autocomplete + +- field_name: rights + label: Rights + form_snippet: markdown.html + form_placeholder: Some statement about the rights associated with the resource + +- field_name: language + label: Language + preset: multiple_text + +- field_name: access_services + label: Access services + repeating_label: Access service + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: title + label: Title + + - field_name: endpoint_url + label: Endpoint URL + preset: multiple_text diff --git a/ckanext/dcat/tests/test_scheming_support.py 
b/ckanext/dcat/tests/test_scheming_support.py index 3769f361..aa49551a 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -55,7 +55,24 @@ def test_e2e_ckan_to_dcat(self): {"name": "Contact 1", "email": "contact1@example.org"}, {"name": "Contact 2", "email": "contact2@example.org"}, ], - # TODO: resources + "resources": [ + { + "name": "Resource 1", + "url": "https://example.com/data.csv", + "format": "CSV", + "rights": "Some stament about rights", + "language": ["en", "ca", "es"], + "access_services": [ + { + "title": "Access Service 1", + "endpoint_url": [ + "https://example.org/access_service/1", + "https://example.org/access_service/2", + ], + } + ], + } + ], } dataset = call_action("package_create", **dataset_dict) @@ -64,7 +81,7 @@ def test_e2e_ckan_to_dcat(self): assert dataset["conforms_to"][0] == "Standard 1" assert dataset["contact"][0]["name"] == "Contact 1" - s = RDFSerializer(profiles=["euro_dcat_ap"]) + s = RDFSerializer(profiles=["euro_dcat_ap_2"]) g = s.g dataset_ref = s.graph_from_dataset(dataset) @@ -81,17 +98,68 @@ def test_e2e_ckan_to_dcat(self): # List fields # TODO helper function - conforms = [t for t in g.triples((dataset_ref, DCT.conformsTo, None))] - assert len(conforms) == len(dataset["conforms_to"]) - for index, item in enumerate(conforms): - assert str(item[2]) == dataset["conforms_to"][index] + conforms_to = [ + str(t[2]) for t in g.triples((dataset_ref, DCT.conformsTo, None)) + ] + assert conforms_to == dataset["conforms_to"] # Repeating subfields contact_details = [t for t in g.triples((dataset_ref, DCAT.contactPoint, None))] assert len(contact_details) == len(dataset["contact"]) - self._triple(g, contact_details[0][2], VCARD.fn, dataset_dict["contact"][0]["name"]) - self._triple(g, contact_details[0][2], VCARD.hasEmail, dataset_dict["contact"][0]["email"]) - self._triple(g, contact_details[1][2], VCARD.fn, dataset_dict["contact"][1]["name"]) - self._triple(g, 
contact_details[1][2], VCARD.hasEmail, dataset_dict["contact"][1]["email"]) + self._triple( + g, contact_details[0][2], VCARD.fn, dataset_dict["contact"][0]["name"] + ) + self._triple( + g, + contact_details[0][2], + VCARD.hasEmail, + dataset_dict["contact"][0]["email"], + ) + self._triple( + g, contact_details[1][2], VCARD.fn, dataset_dict["contact"][1]["name"] + ) + self._triple( + g, + contact_details[1][2], + VCARD.hasEmail, + dataset_dict["contact"][1]["email"], + ) + + distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + + # Resources: standard fields + + assert self._triple( + g, distribution_ref, DCT.rights, dataset_dict["resources"][0]["rights"] + ) + + # Resources: list fields + + language = [ + str(t[2]) for t in g.triples((distribution_ref, DCT.language, None)) + ] + assert language == dataset_dict["resources"][0]["language"] + + # Resource: repeating subfields + access_services = [ + t for t in g.triples((distribution_ref, DCAT.accessService, None)) + ] + + assert len(access_services) == len(dataset["resources"][0]["access_services"]) + self._triple( + g, + access_services[0][2], + DCT.title, + dataset_dict["resources"][0]["access_services"][0]["title"], + ) + + endpoint_urls = [ + str(t[2]) + for t in g.triples((access_services[0][2], DCAT.endpointURL, None)) + ] + assert ( + endpoint_urls + == dataset_dict["resources"][0]["access_services"][0]["endpoint_url"] + ) From a77d5c28a54019e81731133bc87f15ccf9cd8c3f Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 20 May 2024 12:34:08 +0200 Subject: [PATCH 12/52] [#56] Use profiles from config in CLI --- ckanext/dcat/cli.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/ckanext/dcat/cli.py b/ckanext/dcat/cli.py index 83d8b539..c0343174 100644 --- a/ckanext/dcat/cli.py +++ b/ckanext/dcat/cli.py @@ -6,7 +6,12 @@ import ckan.plugins.toolkit as tk import ckanext.dcat.utils as utils -from ckanext.dcat.processors import 
RDFParser, RDFSerializer, DEFAULT_RDF_PROFILES +from ckanext.dcat.processors import ( + RDFParser, + RDFSerializer, + DEFAULT_RDF_PROFILES, + RDF_PROFILES_CONFIG_OPTION, +) @click.group() @@ -24,6 +29,17 @@ def generate_static(output): utils.generate_static_json(output) +def _get_profiles(profiles): + if profiles: + profiles = profiles.split() + elif tk.config.get(RDF_PROFILES_CONFIG_OPTION): + profiles = tk.aslist(tk.config[RDF_PROFILES_CONFIG_OPTION]) + else: + profiles = None + + return profiles + + @dcat.command(context_settings={"show_default": True}) @click.argument("input", type=click.File(mode="r")) @click.option( @@ -40,8 +56,8 @@ def generate_static(output): @click.option( "-p", "--profiles", - default=" ".join(DEFAULT_RDF_PROFILES), - help="RDF profiles to use", + help=f"RDF profiles to use. If not provided will be read from config, " + "if not present there, the default will be used: {DEFAULT_RDF_PROFILES}", ) @click.option( "-P", "--pretty", is_flag=True, help="Make the output more human readable" @@ -63,8 +79,8 @@ def consume(input, output, format, profiles, pretty, compat_mode): """ contents = input.read() - if profiles: - profiles = profiles.split() + profiles = _get_profiles(profiles) + parser = RDFParser(profiles=profiles, compatibility_mode=compat_mode) parser.parse(contents, _format=format) @@ -92,8 +108,8 @@ def consume(input, output, format, profiles, pretty, compat_mode): @click.option( "-p", "--profiles", - default=" ".join(DEFAULT_RDF_PROFILES), - help="RDF profiles to use", + help=f"RDF profiles to use. 
If not provided will be read from config, " + "if not present there, the default will be used: {DEFAULT_RDF_PROFILES}", ) @click.option( "-m", "--compat_mode", is_flag=True, help="Compatibility mode (deprecated)" @@ -112,12 +128,9 @@ def produce(input, output, format, profiles, compat_mode): """ contents = input.read() - if profiles: - profiles = profiles.split() - serializer = RDFSerializer( - profiles=profiles, - compatibility_mode=compat_mode - ) + profiles = _get_profiles(profiles) + + serializer = RDFSerializer(profiles=profiles, compatibility_mode=compat_mode) dataset = json.loads(contents) if isinstance(dataset, list): From 35657efecf0f4d1d3a03339ec11377f62186b58f Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 20 May 2024 13:30:18 +0200 Subject: [PATCH 13/52] [#56] Separate scheming compat profile, parsing --- ckanext/dcat/profiles.py | 146 ++++++++++++++++++++++++++++++--------- examples/dataset.rdf | 15 +++- setup.py | 1 + 3 files changed, 127 insertions(+), 35 deletions(-) diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index 02bd395c..857a53a3 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -731,6 +731,18 @@ def _schema_field(self, key): if field['field_name'] == key: return field + def _schema_resource_field(self, key): + ''' + Returns the schema field information if the provided key exists as a field in + the resources fields of the dataset schema (if one was provided) + ''' + if not self._dataset_schema: + return None + + for field in self._dataset_schema['resource_fields']: + if field['field_name'] == key: + return field + def _set_dataset_value(self, dataset_dict, key, value): ''' Sets the value for a given key in a CKAN dataset dict @@ -758,6 +770,15 @@ def _set_list_dataset_value(self, dataset_dict, key, value): else: return self._set_dataset_value(dataset_dict, key, json.dumps(value)) + def _set_list_resource_value(self, resource_dict, key, value): + schema_field = self._schema_resource_field(key) + 
if schema_field and 'scheming_multiple_text' in schema_field['validators']: + resource_dict[key] = value + else: + resource_dict[key] = json.dumps(value) + + return resource_dict + def _get_dataset_value(self, dataset_dict, key, default=None): ''' Returns the value for the given key on a CKAN dict @@ -1084,7 +1105,7 @@ def parse_dataset(self, dataset_dict, dataset_ref): ): value = self._object_value(dataset_ref, predicate) if value: - self._set_dataset_value(dataset_dict, key, value) + dataset_dict['extras'].append({'key': key, 'value': value}) # Lists for key, predicate, in ( @@ -1101,7 +1122,8 @@ def parse_dataset(self, dataset_dict, dataset_ref): ): values = self._object_value_list(dataset_ref, predicate) if values: - self._set_list_dataset_value(dataset_dict, key, values) + dataset_dict['extras'].append({'key': key, + 'value': json.dumps(values)}) # Contact details contact = self._contact_details(dataset_ref, DCAT.contactPoint) @@ -1110,7 +1132,7 @@ def parse_dataset(self, dataset_dict, dataset_ref): contact = self._contact_details(dataset_ref, ADMS.contactPoint) if contact: - for key in ('uri', 'name', 'email'): + for key in ('uri', 'name', 'email'): if contact.get(key): dataset_dict['extras'].append( {'key': 'contact_{0}'.format(key), @@ -1336,32 +1358,6 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): _type=URIRef, value_modifier=self._add_mailto ) - # TODO: this will go into a separate profile - contact = dataset_dict.get("contact") - if isinstance(contact, list) and len(contact): - for item in contact: - contact_uri = item.get('uri') - if contact_uri: - contact_details = CleanedURIRef(contact_uri) - else: - contact_details = BNode() - - g.add((contact_details, RDF.type, VCARD.Organization)) - g.add((dataset_ref, DCAT.contactPoint, contact_details)) - - self._add_triple_from_dict( - item, contact_details, - VCARD.fn, 'name' - ) - # Add mail address as URIRef, and ensure it has a mailto: prefix - self._add_triple_from_dict( - item, 
contact_details, - VCARD.hasEmail, 'email', - _type=URIRef, value_modifier=self._add_mailto - ) - - - # Publisher if any([ self._get_dataset_value(dataset_dict, 'publisher_uri'), @@ -1752,8 +1748,6 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ] self._add_list_triples_from_dict(resource_dict, distribution, items) - # TODO: this will go into a separate profile - access_service_list = resource_dict.get('access_services', []) if isinstance(access_service_list, str): try: @@ -1796,9 +1790,8 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ] self._add_list_triples_from_dict(access_service_dict, access_service_node, items) - # TODO: re-enable when separating into a profile - # if access_service_list: - # resource_dict['access_services'] = json.dumps(access_service_list) + if access_service_list: + resource_dict['access_services'] = json.dumps(access_service_list) def graph_from_catalog(self, catalog_dict, catalog_ref): @@ -2097,3 +2090,88 @@ def _distribution_url_graph(self, distribution, resource_dict): def _distribution_numbers_graph(self, distribution, resource_dict): if resource_dict.get('size'): self.g.add((distribution, SCHEMA.contentSize, Literal(resource_dict['size']))) + + +# TODO: split all these classes in different files +class EuropeanDCATAPSchemingProfile(RDFProfile): + ''' + This is a compatibilty profile meant to add support for ckanext-scheming to the existing + `euro_dcat_ap` and `euro_dcat_ap_2` profiles. 
+ + It does not add or remove any properties from these profiles, it just transforms the + resulting dataset_dict so it is compatible with a ckanext-scheming schema + + TODO: summarize changes and link to docs + ''' + + def parse_dataset(self, dataset_dict, dataset_ref): + + if not self._dataset_schema: + # Not using scheming + return dataset_dict + + # Move extras to root + + extras_to_remove = [] + extras = dataset_dict.get('extras', []) + for extra in extras: + if self._schema_field(extra['key']): + # This is a field defined in the dataset schema + dataset_dict[extra['key']] = extra['value'] + extras_to_remove.append(extra['key']) + + dataset_dict['extras'] = [e for e in extras if e['key'] not in extras_to_remove] + + + # Parse lists + def _parse_list_value(data_dict, field_name): + schema_field = self._schema_field(field_name) or self._schema_resource_field(field_name) + + if schema_field and 'scheming_multiple_text' in schema_field.get('validators', []): + if isinstance(data_dict[field_name], str): + try: + data_dict[field_name] = json.loads(data_dict[field_name]) + except ValueError: + pass + + for field_name in dataset_dict.keys(): + _parse_list_value(dataset_dict, field_name) + + for resource_dict in dataset_dict.get('resources', []): + for field_name in resource_dict.keys(): + _parse_list_value(resource_dict, field_name) + + + # Repeating subfields + for schema_field in self._dataset_schema['dataset_fields']: + if 'repeating_subfields' in schema_field: + # Check if existing extras need to be migrated + field_name = schema_field['field_name'] + new_extras = [] + new_dict = {} + for extra in dataset_dict.get('extras', []): + if extra['key'].startswith(f'{field_name}_'): + subfield = extra['key'][extra['key'].index('_') + 1:] + if subfield in [f['field_name'] for f in schema_field['repeating_subfields']]: + new_dict[subfield] = extra['value'] + else: + new_extras.append(extra) + else: + new_extras.append(extra) + if new_dict: + dataset_dict[field_name] = 
[new_dict] + dataset_dict['extras'] = new_extras + + for schema_field in self._dataset_schema['resource_fields']: + if 'repeating_subfields' in schema_field: + # Check if value needs to be load from JSON + field_name = schema_field['field_name'] + for resource_dict in dataset_dict.get('resources', []): + if resource_dict.get(field_name) and isinstance(resource_dict[field_name], str): + try: + # TODO: load only subfields in schema? + resource_dict[field_name] = json.loads(resource_dict[field_name]) + except ValueError: + pass + + return dataset_dict diff --git a/examples/dataset.rdf b/examples/dataset.rdf index fed71cc9..6b445dff 100644 --- a/examples/dataset.rdf +++ b/examples/dataset.rdf @@ -3,6 +3,7 @@ xmlns:time="http://www.w3.org/2006/time#" xmlns:dct="http://purl.org/dc/terms/" xmlns:dcat="http://www.w3.org/ns/dcat#" + xmlns:dcatap="http://data.europa.eu/r5r/" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:schema="http://schema.org/" @@ -96,7 +97,19 @@ - + + + + Sparql-end Point + + This SPARQL end point allow to directly query the EU Whoiswho content (organization / membership / person) + SPARQL url description + + + + + + diff --git a/setup.py b/setup.py index fda14619..78fb19fa 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,7 @@ [ckan.rdf.profiles] euro_dcat_ap=ckanext.dcat.profiles:EuropeanDCATAPProfile euro_dcat_ap_2=ckanext.dcat.profiles:EuropeanDCATAP2Profile + euro_dcat_ap_scheming=ckanext.dcat.profiles:EuropeanDCATAPSchemingProfile schemaorg=ckanext.dcat.profiles:SchemaOrgProfile [babel.extractors] From e0f15f5a83c614b54f4d7e625236720e14073664 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 21 May 2024 11:56:52 +0200 Subject: [PATCH 14/52] [#56] e2e test DCAT -> CKAN --- ckanext/dcat/tests/test_scheming_support.py | 82 ++++++++++++++++++++- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 
aa49551a..88d73e3f 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -5,7 +5,7 @@ from ckan.tests.helpers import call_action from ckanext.dcat import utils -from ckanext.dcat.processors import RDFSerializer +from ckanext.dcat.processors import RDFSerializer, RDFParser from ckanext.dcat.profiles import ( DCAT, DCT, @@ -22,7 +22,7 @@ GEOJSON_IMT, DISTRIBUTION_LICENSE_FALLBACK_CONFIG, ) -from ckanext.dcat.tests.utils import BaseSerializeTest +from ckanext.dcat.tests.utils import BaseSerializeTest, BaseParseTest @pytest.mark.usefixtures("with_plugins", "clean_db") @@ -31,7 +31,10 @@ "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" ) @pytest.mark.ckan_config("scheming.presets", "ckanext.scheming:presets.json") -class TestSchemingSupport(BaseSerializeTest): +@pytest.mark.ckan_config( + "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" +) +class TestSchemingSerializeSupport(BaseSerializeTest): def test_e2e_ckan_to_dcat(self): """ Create a dataset using the scheming schema, check that fields @@ -81,7 +84,7 @@ def test_e2e_ckan_to_dcat(self): assert dataset["conforms_to"][0] == "Standard 1" assert dataset["contact"][0]["name"] == "Contact 1" - s = RDFSerializer(profiles=["euro_dcat_ap_2"]) + s = RDFSerializer() g = s.g dataset_ref = s.graph_from_dataset(dataset) @@ -163,3 +166,74 @@ def test_e2e_ckan_to_dcat(self): endpoint_urls == dataset_dict["resources"][0]["access_services"][0]["endpoint_url"] ) + + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" +) +@pytest.mark.ckan_config("scheming.presets", "ckanext.scheming:presets.json") +@pytest.mark.ckan_config( + "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" +) +class TestSchemingParseSupport(BaseParseTest): + def test_e2e_dcat_to_ckan(self): 
+ """ + Parse a DCAT RDF graph into a CKAN dataset dict, create a dataset with package_create + and check that all expected fields are there + """ + contents = self._get_file_contents("dataset.rdf") + + p = RDFParser() + + p.parse(contents) + + datasets = [d for d in p.datasets()] + + assert len(datasets) == 1 + + dataset_dict = datasets[0] + + dataset_dict["name"] = "test-dcat-1" + dataset = call_action("package_create", **dataset_dict) + + # Core fields + + assert dataset["title"] == "Zimbabwe Regional Geochemical Survey." + assert ( + dataset["notes"] + == "During the period 1982-86 a team of geologists from the British Geological Survey ..." + ) + assert dataset["url"] == "http://dataset.info.org" + assert dataset["version"] == "2.3" + assert dataset["license_id"] == "cc-nc" + assert sorted([t["name"] for t in dataset["tags"]]) == [ + "exploration", + "geochemistry", + "geology", + ] + + # Standard fields + assert dataset["version_notes"] == "New schema added" + + # List fields + assert dataset["conforms_to"] == ["Standard 1", "Standard 2"] + + # Repeating subfields + + assert dataset["contact"][0]["name"] == "Point of Contact" + assert dataset["contact"][0]["email"] == "contact@some.org" + + resource = dataset["resources"][0] + # Resources: standard fields + assert resource["rights"] == "Some statement about rights" + + # Resources: list fields + assert sorted(resource["language"]) == ["ca", "en", "es"] + + # Resources: repeating subfields + assert resource["access_services"][0]["title"] == "Sparql-end Point" + assert resource["access_services"][0]["endpoint_url"] == [ + "http://publications.europa.eu/webapi/rdf/sparql" + ] From 0b6a8dd32470d2ff423eea65bbe57e63bae25156 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 21 May 2024 13:00:53 +0200 Subject: [PATCH 15/52] [#56] Scheming compatibility profile, serialization --- ckanext/dcat/plugins/__init__.py | 6 ++- ckanext/dcat/profiles.py | 73 ++++++++++++++++++-------------- 2 files changed, 46 insertions(+), 
33 deletions(-) diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index 6ff59f14..a6788106 100644 --- a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -134,12 +134,14 @@ def set_titles(object_dict): def before_dataset_index(self, dataset_dict): schema = None - schema_show = p.toolkit.get_action("scheming_dataset_schema_show") - if schema_show: + try: + schema_show = p.toolkit.get_action("scheming_dataset_schema_show") try: schema = schema_show({}, {"type": dataset_dict["type"]}) except p.toolkit.ObjectNotFound: pass + except KeyError: + pass if schema: for field in schema['dataset_fields']: diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index 2af94192..1c929998 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -145,8 +145,8 @@ def __init__(self, graph, dataset_schema='dataset', compatibility_mode=False): # _license(). self._licenceregister_cache = None - schema_show = toolkit.get_action("scheming_dataset_schema_show") - if schema_show: + try: + schema_show = toolkit.get_action("scheming_dataset_schema_show") try: schema = schema_show({}, {"type": dataset_schema}) except toolkit.ObjectNotFound: @@ -154,6 +154,9 @@ def __init__(self, graph, dataset_schema='dataset', compatibility_mode=False): self._dataset_schema = schema + except KeyError: + pass + def _datasets(self): ''' Generator that returns all DCAT datasets on the graph @@ -1132,7 +1135,7 @@ def parse_dataset(self, dataset_dict, dataset_ref): contact = self._contact_details(dataset_ref, ADMS.contactPoint) if contact: - for key in ('uri', 'name', 'email'): + for key in ('uri', 'name', 'email'): if contact.get(key): dataset_dict['extras'].append( {'key': 'contact_{0}'.format(key), @@ -1358,32 +1361,6 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): _type=URIRef, value_modifier=self._add_mailto ) - # TODO: this will go into a separate profile - contact = dataset_dict.get("contact") - if 
isinstance(contact, list) and len(contact): - for item in contact: - contact_uri = item.get('uri') - if contact_uri: - contact_details = CleanedURIRef(contact_uri) - else: - contact_details = BNode() - - g.add((contact_details, RDF.type, VCARD.Organization)) - g.add((dataset_ref, DCAT.contactPoint, contact_details)) - - self._add_triple_from_dict( - item, contact_details, - VCARD.fn, 'name' - ) - # Add mail address as URIRef, and ensure it has a mailto: prefix - self._add_triple_from_dict( - item, contact_details, - VCARD.hasEmail, 'email', - _type=URIRef, value_modifier=self._add_mailto - ) - - - # Publisher if any([ self._get_dataset_value(dataset_dict, 'publisher_uri'), @@ -1774,6 +1751,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ] self._add_list_triples_from_dict(resource_dict, distribution, items) + # Access services access_service_list = resource_dict.get('access_services', []) if isinstance(access_service_list, str): try: @@ -1781,7 +1759,6 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): except ValueError: access_service_list = [] - # Access service for access_service_dict in access_service_list: access_service_uri = access_service_dict.get('uri') @@ -1817,7 +1794,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): self._add_list_triples_from_dict(access_service_dict, access_service_node, items) if access_service_list: - resource_dict['access_services'] = json.dumps(access_service_list) + resource_dict['access_services'] = json.dumps(access_service_list) def graph_from_catalog(self, catalog_dict, catalog_ref): @@ -2201,3 +2178,37 @@ def _parse_list_value(data_dict, field_name): pass return dataset_dict + + def graph_from_dataset(self, dataset_dict, dataset_ref): + + contact = dataset_dict.get("contact") + if isinstance(contact, list) and len(contact): + for item in contact: + contact_uri = item.get('uri') + if contact_uri: + contact_details = CleanedURIRef(contact_uri) + else: + contact_details = BNode() + + 
self.g.add((contact_details, RDF.type, VCARD.Organization)) + self.g.add((dataset_ref, DCAT.contactPoint, contact_details)) + + self._add_triple_from_dict( + item, contact_details, + VCARD.fn, 'name' + ) + # Add mail address as URIRef, and ensure it has a mailto: prefix + self._add_triple_from_dict( + item, contact_details, + VCARD.hasEmail, 'email', + _type=URIRef, value_modifier=self._add_mailto + ) + + resources = dataset_dict.get('resources', []) + for resource in resources: + if resource.get('access_services'): + if isinstance(resource['access_services'], str): + try: + resource['access_services'] = json.loads(resource['access_services']) + except ValueError: + pass From 20ac269c28b4fcf48b7a6fa18e14812dfb55f46a Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 21 May 2024 13:55:53 +0200 Subject: [PATCH 16/52] [#56] Install scheming in github actions --- .github/workflows/test.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c4312706..188eb9b2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -54,11 +54,13 @@ jobs: pip install -e . 
# Replace default path to CKAN core config file with the one on the container sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini - - name: Setup harvest extension + - name: Setup other extension run: | git clone https://github.com/ckan/ckanext-harvest pip install -e ckanext-harvest - pip install -r ckanext-harvest/pip-requirements.txt + pip install -r ckanext-harvest/requirements.txt + git clone https://github.com/ckan/ckanext-scheming + pip install -e ckanext-scheming - name: Setup extension run: | ckan -c test.ini db init From 5375232b392b83226802dd17705b17ff8d43d200 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 21 May 2024 14:25:32 +0200 Subject: [PATCH 17/52] [#56] Add CKAN<2.10 before index hook variant --- ckanext/dcat/plugins/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index a6788106..27d02df8 100644 --- a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -107,6 +107,9 @@ def get_auth_functions(self): def after_show(self, context, data_dict): return self.after_dataset_show(context, data_dict) + def before_index(self, dataset_dict): + return self.before_dataset_index(dataset_dict) + # CKAN >= 2.10 hooks def after_dataset_show(self, context, data_dict): From 9b0abce388ae6e65b7922a1186818aa8a2296145 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 23 May 2024 13:44:22 +0200 Subject: [PATCH 18/52] [#56] dataset_schema -> dataset_type --- ckanext/dcat/processors.py | 9 ++++++--- ckanext/dcat/profiles.py | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index c45c5d1c..92b15c4a 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py @@ -33,12 +33,15 @@ class RDFProcessor(object): - def __init__(self, profiles=None, dataset_schema='dataset', compatibility_mode=False): + def __init__(self, profiles=None, 
dataset_type='dataset', compatibility_mode=False): ''' Creates a parser or serializer instance You can optionally pass a list of profiles to be used. + A scheming dataset type can be provided, in which case the scheming schema + will be loaded by the base profile so it can be used by other profiles. + In compatibility mode, some fields are modified to maintain compatibility with previous versions of the ckanext-dcat parsers (eg adding the `dcat_` prefix or storing comma separated lists instead @@ -56,7 +59,7 @@ def __init__(self, profiles=None, dataset_schema='dataset', compatibility_mode=F raise RDFProfileException( 'No suitable RDF profiles could be loaded') - self.dataset_schema = dataset_schema + self.dataset_type = dataset_type if not compatibility_mode: compatibility_mode = p.toolkit.asbool( @@ -181,7 +184,7 @@ def datasets(self): for profile_class in self._profiles: profile = profile_class( self.g, - dataset_schema=self.dataset_schema, + dataset_type=self.dataset_type, compatibility_mode=self.compatibility_mode ) profile.parse_dataset(dataset_dict, dataset_ref) diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index 1c929998..fdd3680c 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -126,11 +126,14 @@ class RDFProfile(object): _dataset_schema = None - def __init__(self, graph, dataset_schema='dataset', compatibility_mode=False): + def __init__(self, graph, dataset_type='dataset', compatibility_mode=False): '''Class constructor Graph is an rdflib.Graph instance. + A scheming dataset type can be provided, in which case the scheming schema + will be loaded so it can be used by profiles. 
+ In compatibility mode, some fields are modified to maintain compatibility with previous versions of the ckanext-dcat parsers (eg adding the `dcat_` prefix or storing comma separated lists instead @@ -148,9 +151,9 @@ def __init__(self, graph, dataset_schema='dataset', compatibility_mode=False): try: schema_show = toolkit.get_action("scheming_dataset_schema_show") try: - schema = schema_show({}, {"type": dataset_schema}) + schema = schema_show({}, {"type": dataset_type}) except toolkit.ObjectNotFound: - raise toolkit.ObjectNotFound(f"Unknown dataset schema: {dataset_schema}") + raise toolkit.ObjectNotFound(f"Unknown dataset schema: {dataset_type}") self._dataset_schema = schema From e1b5f324b11f2b7a94068c5d64839164aa9720ac Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 27 May 2024 14:37:23 +0200 Subject: [PATCH 19/52] [#56] Add most DCAT AP 1.1 standard and list fields --- ckanext/dcat/schemas/dcat_ap_2.1.yaml | 139 ++++++++++++++----- ckanext/dcat/tests/test_scheming_support.py | 144 +++++++++++++++++++- ckanext/dcat/tests/utils.py | 3 + 3 files changed, 246 insertions(+), 40 deletions(-) diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index 20edc599..a3ddf67a 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -23,8 +23,23 @@ dataset_fields: form_snippet: markdown.html form_placeholder: eg. Some useful notes about the data +- field_name: contact + label: Contact points + repeating_label: Contact point + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + - field_name: tag_string - label: Tags + label: Keywords preset: tag_string_autocomplete form_placeholder: eg. 
economy, mental health, government @@ -48,55 +63,70 @@ dataset_fields: validators: ignore_missing unicode_safe package_version_validator form_placeholder: '1.0' -- field_name: author - label: Author - form_placeholder: Joe Bloggs - display_property: dc:creator - -- field_name: author_email - label: Author Email - form_placeholder: joe@example.com - display_property: dc:creator - display_snippet: email.html - display_email_name_field: author - -- field_name: maintainer - label: Maintainer - form_placeholder: Joe Bloggs - display_property: dc:contributor - -- field_name: maintainer_email - label: Maintainer Email - form_placeholder: joe@example.com - display_property: dc:contributor - display_snippet: email.html - display_email_name_field: maintainer +# Note: this will fall back to metadata_created if not present +- field_name: issued + label: Release date + # TODO: dcat_date preset -- field_name: contact - label: Contact points - repeating_label: Contact point - repeating_subfields: +# Note: this will fall back to metadata_modified if not present +- field_name: modified + label: Modification date + # TODO: dcat_date preset - - field_name: uri - label: URI +- field_name: identifier + label: Identifier - - field_name: name - label: Name +- field_name: frequency + label: Frequency - - field_name: email - label: Email +- field_name: provenance + label: Provenance + +- field_name: dcat_type + label: Type + # TODO: controlled vocabulary? 
+ # +- field_name: access_rights + label: Access rights + validators: ignore_missing unicode_safe + form_snippet: markdown.html - field_name: version_notes label: Version notes validators: ignore_missing unicode_safe form_snippet: markdown.html +- field_name: alternate_identifier + label: Alternate identifier + preset: multiple_text + validators: ignore_missing scheming_multiple_text + +- field_name: theme + label: Theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + # TODO: language form snippet / validator / graph + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + - field_name: conforms_to label: Conforms to preset: multiple_text validators: ignore_missing scheming_multiple_text +# Note: if not provided, this will be autogenerated +- field_name: uri + label: URI +# TODO: relation-based properties are not yet included (e.g. 
is_version_of, source, sample, etc) resource_fields: - field_name: url @@ -116,15 +146,51 @@ resource_fields: label: Format preset: resource_format_autocomplete +- field_name: size + label: Size + # TODO: number validator / snippet + - field_name: rights label: Rights form_snippet: markdown.html form_placeholder: Some statement about the rights associated with the resource +- field_name: status + label: Status + +- field_name: license + label: License + +# Note: this falls back to the standard resource url field +- field_name: access_url + label: Access URL + +# Note: this falls back to the standard resource url field +- field_name: download_url + label: Download URL + +- field_name: issued + label: Release date + # TODO: dcat_date preset + +- field_name: modified + label: Modification date + # TODO: dcat_date preset + - field_name: language label: Language preset: multiple_text +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + - field_name: access_services label: Access services repeating_label: Access service @@ -139,3 +205,8 @@ resource_fields: - field_name: endpoint_url label: Endpoint URL preset: multiple_text + +# Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 88d73e3f..3d12c506 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -1,6 +1,7 @@ import pytest from rdflib.namespace import RDF +from rdflib.term import URIRef from ckan.tests.helpers import call_action @@ -50,8 +51,23 @@ def test_e2e_ckan_to_dcat(self): "version": "1.0b", "tags": [{"name": "Tag 1"}, {"name": "Tag 2"}], # Standard fields + "issued": "2024-05-01", + "modified": "2024-05-05", + 
"identifier": "xx-some-dataset-id-yy", + "frequency": "monthly", + "provenance": "Statement about provenance", + "dcat_type": "test-type", "version_notes": "Some version notes", + "access_rights": "Statement about access rights", # List fields (lists) + "alternate_identifier": ["alt-id-1", "alt-id-2"], + "theme": [ + "https://example.org/uri/theme1", + "https://example.org/uri/theme2", + "https://example.org/uri/theme3", + ], + "language": ["en", "ca", "es"], + "documentation": ["https://example.org/some-doc.html"], "conforms_to": ["Standard 1", "Standard 2"], # Repeating subfields "contact": [ @@ -63,6 +79,12 @@ def test_e2e_ckan_to_dcat(self): "name": "Resource 1", "url": "https://example.com/data.csv", "format": "CSV", + "status": "published", + "access_url": "https://example.com/data.csv", + "download_url": "https://example.com/data.csv", + "issued": "2024-05-01T01:20:33", + "modified": "2024-05-05T09:33:20", + "license": "http://creativecommons.org/licenses/by/3.0/", "rights": "Some stament about rights", "language": ["en", "ca", "es"], "access_services": [ @@ -95,16 +117,53 @@ def test_e2e_ckan_to_dcat(self): assert self._triple(g, dataset_ref, RDF.type, DCAT.Dataset) assert self._triple(g, dataset_ref, DCT.title, dataset["title"]) assert self._triple(g, dataset_ref, DCT.description, dataset["notes"]) + assert self._triple(g, dataset_ref, OWL.versionInfo, dataset["version"]) # Standard fields + assert self._triple(g, dataset_ref, DCT.identifier, dataset["identifier"]) + assert self._triple( + g, dataset_ref, DCT.accrualPeriodicity, dataset["frequency"] + ) + assert self._triple(g, dataset_ref, DCT.provenance, dataset["provenance"]) + assert self._triple(g, dataset_ref, DCT.type, dataset["dcat_type"]) assert self._triple(g, dataset_ref, ADMS.versionNotes, dataset["version_notes"]) + assert self._triple(g, dataset_ref, DCT.accessRights, dataset["access_rights"]) + + # Dates + assert self._triple( + g, + dataset_ref, + DCT.issued, + dataset["issued"] + 
"T00:00:00", + data_type=XSD.dateTime, + ) + assert self._triple( + g, + dataset_ref, + DCT.modified, + dataset["modified"] + "T00:00:00", + data_type=XSD.dateTime, + ) # List fields - # TODO helper function - conforms_to = [ - str(t[2]) for t in g.triples((dataset_ref, DCT.conformsTo, None)) - ] - assert conforms_to == dataset["conforms_to"] + + assert ( + self._triples_list_values(g, dataset_ref, DCT.conformsTo) + == dataset["conforms_to"] + ) + assert ( + self._triples_list_values(g, dataset_ref, ADMS.identifier) + == dataset["alternate_identifier"] + ) + assert self._triples_list_values(g, dataset_ref, DCAT.theme) == dataset["theme"] + assert ( + self._triples_list_values(g, dataset_ref, DCT.language) + == dataset["language"] + ) + assert ( + self._triples_list_values(g, dataset_ref, FOAF.page) + == dataset["documentation"] + ) # Repeating subfields @@ -137,6 +196,37 @@ def test_e2e_ckan_to_dcat(self): assert self._triple( g, distribution_ref, DCT.rights, dataset_dict["resources"][0]["rights"] ) + assert self._triple( + g, distribution_ref, DCT.status, dataset_dict["resources"][0]["status"] + ) + assert self._triple( + g, + distribution_ref, + DCAT.accessURL, + dataset_dict["resources"][0]["access_url"], + ) + assert self._triple( + g, + distribution_ref, + DCAT.downloadURL, + dataset_dict["resources"][0]["download_url"], + ) + + # Resources: dates + assert self._triple( + g, + distribution_ref, + DCT.issued, + dataset["resources"][0]["issued"], + data_type=XSD.dateTime, + ) + assert self._triple( + g, + distribution_ref, + DCT.modified, + dataset["resources"][0]["modified"], + data_type=XSD.dateTime, + ) # Resources: list fields @@ -216,9 +306,31 @@ def test_e2e_dcat_to_ckan(self): # Standard fields assert dataset["version_notes"] == "New schema added" + assert dataset["identifier"] == u"9df8df51-63db-37a8-e044-0003ba9b0d98" + assert dataset["frequency"] == "http://purl.org/cld/freq/daily" + assert dataset["access_rights"] == "public" + assert 
dataset["provenance"] == "Some statement about provenance" + assert dataset["dcat_type"] == "test-type" + + assert dataset["issued"] == u"2012-05-10" + assert dataset["modified"] == u"2012-05-10T21:04:00" # List fields - assert dataset["conforms_to"] == ["Standard 1", "Standard 2"] + assert sorted(dataset["conforms_to"]) == ["Standard 1", "Standard 2"] + assert sorted(dataset["language"]) == ["ca", "en", "es"] + assert sorted(dataset["theme"]) == [ + "Earth Sciences", + "http://eurovoc.europa.eu/100142", + "http://eurovoc.europa.eu/209065", + ] + assert sorted(dataset["alternate_identifier"]) == [ + "alternate-identifier-1", + "alternate-identifier-2", + ] + assert sorted(dataset["documentation"]) == [ + "http://dataset.info.org/doc1", + "http://dataset.info.org/doc2", + ] # Repeating subfields @@ -226,11 +338,31 @@ def test_e2e_dcat_to_ckan(self): assert dataset["contact"][0]["email"] == "contact@some.org" resource = dataset["resources"][0] + + # Resources: core fields + assert resource["url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html" + # Resources: standard fields + assert resource["license"] == "http://creativecommons.org/licenses/by-nc/2.0/" assert resource["rights"] == "Some statement about rights" + assert resource["issued"] == "2012-05-11" + assert resource["modified"] == "2012-05-01T00:04:06" + assert resource["status"] == "http://purl.org/adms/status/Completed" + assert resource["size"] == 12323 + + # assert resource['hash'] == u'4304cf2e751e6053c90b1804c89c0ebb758f395a' + # assert resource['hash_algorithm'] == u'http://spdx.org/rdf/terms#checksumAlgorithm_sha1' + + assert resource["access_url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html" + assert "download_url" not in resource # Resources: list fields assert sorted(resource["language"]) == ["ca", "en", "es"] + assert sorted(resource["documentation"]) == [ + "http://dataset.info.org/distribution1/doc1", + "http://dataset.info.org/distribution1/doc2", + ] + assert 
sorted(resource["conforms_to"]) == ["Standard 1", "Standard 2"] # Resources: repeating subfields assert resource["access_services"][0]["title"] == "Sparql-end Point" diff --git a/ckanext/dcat/tests/utils.py b/ckanext/dcat/tests/utils.py index c62d9338..8c0e8a18 100644 --- a/ckanext/dcat/tests/utils.py +++ b/ckanext/dcat/tests/utils.py @@ -41,6 +41,9 @@ def _triple(self, graph, subject, predicate, _object, data_type=None): triples = self._triples(graph, subject, predicate, _object, data_type) return triples[0] if triples else None + def _triples_list_values(self, graph, subject, predicate): + return [str(t[2]) for t in graph.triples((subject, predicate, None))] + def _get_typed_list(self, list, datatype): """ returns the list with the given rdf type """ return [datatype(x) for x in list] From 2e4b4bc4b12191635fbfd86506d51545bf217a97 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 28 May 2024 16:01:49 +0200 Subject: [PATCH 20/52] [#56] Test fixes --- ckanext/dcat/tests/test_scheming_support.py | 35 +++++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 3d12c506..33a507d5 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -77,6 +77,7 @@ def test_e2e_ckan_to_dcat(self): "resources": [ { "name": "Resource 1", + "description": "Some description", "url": "https://example.com/data.csv", "format": "CSV", "status": "published", @@ -170,19 +171,19 @@ def test_e2e_ckan_to_dcat(self): contact_details = [t for t in g.triples((dataset_ref, DCAT.contactPoint, None))] assert len(contact_details) == len(dataset["contact"]) - self._triple( + assert self._triple( g, contact_details[0][2], VCARD.fn, dataset_dict["contact"][0]["name"] ) - self._triple( + assert self._triple( g, contact_details[0][2], VCARD.hasEmail, - dataset_dict["contact"][0]["email"], + URIRef("mailto:" + 
dataset_dict["contact"][0]["email"]), ) - self._triple( + assert self._triple( g, contact_details[1][2], VCARD.fn, dataset_dict["contact"][1]["name"] ) - self._triple( + assert self._triple( g, contact_details[1][2], VCARD.hasEmail, @@ -191,25 +192,37 @@ def test_e2e_ckan_to_dcat(self): distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + # Resources: core fields + + assert self._triple( + g, distribution_ref, DCT.title, dataset_dict["resources"][0]["name"] + ) + assert self._triple( + g, + distribution_ref, + DCT.description, + dataset_dict["resources"][0]["description"], + ) + # Resources: standard fields assert self._triple( g, distribution_ref, DCT.rights, dataset_dict["resources"][0]["rights"] ) assert self._triple( - g, distribution_ref, DCT.status, dataset_dict["resources"][0]["status"] + g, distribution_ref, ADMS.status, dataset_dict["resources"][0]["status"] ) assert self._triple( g, distribution_ref, DCAT.accessURL, - dataset_dict["resources"][0]["access_url"], + URIRef(dataset_dict["resources"][0]["access_url"]), ) assert self._triple( g, distribution_ref, DCAT.downloadURL, - dataset_dict["resources"][0]["download_url"], + URIRef(dataset_dict["resources"][0]["download_url"]), ) # Resources: dates @@ -241,7 +254,7 @@ def test_e2e_ckan_to_dcat(self): ] assert len(access_services) == len(dataset["resources"][0]["access_services"]) - self._triple( + assert self._triple( g, access_services[0][2], DCT.title, @@ -353,7 +366,9 @@ def test_e2e_dcat_to_ckan(self): # assert resource['hash'] == u'4304cf2e751e6053c90b1804c89c0ebb758f395a' # assert resource['hash_algorithm'] == u'http://spdx.org/rdf/terms#checksumAlgorithm_sha1' - assert resource["access_url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html" + assert ( + resource["access_url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html" + ) assert "download_url" not in resource # Resources: list fields From f9467d4b4c41a4ddb18ad3cba3e5b16eb811557a Mon Sep 17 00:00:00 2001 From: 
amercader Date: Wed, 29 May 2024 09:57:21 +0200 Subject: [PATCH 21/52] [#56] Consolidate and simplify publisher handling Values for dct:publisher are obtained from the first one of these that are present: 1. A scheming `publisher` field (assuming the `euro_dcat_ap_scheming` profile is loaded) 2. The legacy `publisher_*` extras 3. The dataset's own organization For the last case, a sample schema for organizations has been added that implements all the publisher properties supported by the processors. --- ckanext/dcat/profiles.py | 112 ++++++++++++------ ckanext/dcat/schemas/dcat_ap_2.1.yaml | 33 +++++- .../dcat/schemas/publisher_organization.yaml | 35 ++++++ .../form_snippets/repeating_subfields.html | 8 ++ .../test_euro_dcatap_profile_serialize.py | 11 +- ckanext/dcat/tests/test_scheming_support.py | 100 +++++++++++++++- 6 files changed, 257 insertions(+), 42 deletions(-) create mode 100644 ckanext/dcat/schemas/publisher_organization.yaml create mode 100644 ckanext/dcat/templates/scheming/form_snippets/repeating_subfields.html diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index fdd3680c..b16f15d9 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -126,6 +126,13 @@ class RDFProfile(object): _dataset_schema = None + # Cache for mappings of licenses URL/title to ID built when needed in + # _license(). + _licenceregister_cache = None + + # Cache for organization_show details (used for publisher fallback) + _org_cache: dict = {} + def __init__(self, graph, dataset_type='dataset', compatibility_mode=False): '''Class constructor @@ -144,10 +151,6 @@ def __init__(self, graph, dataset_type='dataset', compatibility_mode=False): self.compatibility_mode = compatibility_mode - # Cache for mappings of licenses URL/title to ID built when needed in - # _license(). 
- self._licenceregister_cache = None - try: schema_show = toolkit.get_action("scheming_dataset_schema_show") try: @@ -1365,45 +1368,61 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ) # Publisher - if any([ + publisher_ref = None + + if dataset_dict.get('publisher'): + # Scheming publisher field: will be handled in a separate profile + pass + elif any([ self._get_dataset_value(dataset_dict, 'publisher_uri'), self._get_dataset_value(dataset_dict, 'publisher_name'), - dataset_dict.get('organization'), ]): - + # Legacy publisher_* extras publisher_uri = self._get_dataset_value(dataset_dict, 'publisher_uri') - publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict) publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name') if publisher_uri: - publisher_details = CleanedURIRef(publisher_uri) - elif not publisher_name and publisher_uri_fallback: - # neither URI nor name are available, use organization as fallback - publisher_details = CleanedURIRef(publisher_uri_fallback) + publisher_ref = CleanedURIRef(publisher_uri) else: # No publisher_uri - publisher_details = BNode() - - g.add((publisher_details, RDF.type, FOAF.Organization)) - g.add((dataset_ref, DCT.publisher, publisher_details)) - - # In case no name and URI are available, again fall back to organization. - # If no name but an URI is available, the name literal remains empty to - # avoid mixing organization and dataset values. 
- if not publisher_name and not publisher_uri and dataset_dict.get('organization'): - publisher_name = dataset_dict['organization']['title'] - - g.add((publisher_details, FOAF.name, Literal(publisher_name))) - # TODO: It would make sense to fallback these to organization - # fields but they are not in the default schema and the - # `organization` object in the dataset_dict does not include - # custom fields + publisher_ref = BNode() + publisher_details = { + 'name': publisher_name, + 'email': self._get_dataset_value(dataset_dict, 'publisher_email'), + 'url': self._get_dataset_value(dataset_dict, 'publisher_url'), + 'type': self._get_dataset_value(dataset_dict, 'publisher_type'), + } + elif dataset_dict.get('organization'): + # Fall back to dataset org + org_id = dataset_dict['organization']['id'] + org_dict = None + if org_id in self._org_cache: + org_dict = self._org_cache[org_id] + else: + try: + org_dict = toolkit.get_action('organization_show')( + {'ignore_auth': True}, {'id': org_id}) + self._org_cache[org_id] = org_dict + except toolkit.ObjectNotFound: + pass + if org_dict: + publisher_ref = CleanedURIRef(publisher_uri_organization_fallback(dataset_dict)) + publisher_details = { + 'name': org_dict.get('title'), + 'email': org_dict.get('email'), + 'url': org_dict.get('url'), + 'type': org_dict.get('dcat_type'), + } + # Add to graph + if publisher_ref: + g.add((publisher_ref, RDF.type, FOAF.Organization)) + g.add((dataset_ref, DCT.publisher, publisher_ref)) items = [ - ('publisher_email', FOAF.mbox, None, Literal), - ('publisher_url', FOAF.homepage, None, URIRef), - ('publisher_type', DCT.type, None, URIRefOrLiteral), + ('name', FOAF.name, None, Literal), + ('email', FOAF.mbox, None, Literal), + ('url', FOAF.homepage, None, URIRef), + ('type', DCT.type, None, URIRefOrLiteral), ] - - self._add_triples_from_dict(dataset_dict, publisher_details, items) + self._add_triples_from_dict(publisher_details, publisher_ref, items) # Temporal start = 
self._get_dataset_value(dataset_dict, 'temporal_start') @@ -2207,6 +2226,33 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): _type=URIRef, value_modifier=self._add_mailto ) + publisher = dataset_dict.get("publisher") + if isinstance(publisher, list) and len(publisher): + publisher = publisher[0] + publisher_uri = publisher.get('uri') + if publisher_uri: + publisher_ref = CleanedURIRef(publisher_uri) + else: + publisher_ref = BNode() + + self.g.add((publisher_ref, RDF.type, FOAF.Organization)) + self.g.add((dataset_ref, DCT.publisher, publisher_ref)) + + self._add_triple_from_dict( + publisher, publisher_ref, FOAF.name, 'name' + ) + self._add_triple_from_dict( + publisher, publisher_ref, FOAF.homepage, 'url', URIRef + ) + self._add_triple_from_dict( + publisher, publisher_ref, DCT.type, 'type', URIRefOrLiteral + ) + self._add_triple_from_dict( + publisher, publisher_ref, + VCARD.hasEmail, 'email', + _type=URIRef, value_modifier=self._add_mailto + ) + resources = dataset_dict.get('resources', []) for resource in resources: if resource.get('access_services'): diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index a3ddf67a..f5373c3a 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -3,7 +3,6 @@ dataset_type: dataset about: A reimplementation of the default CKAN dataset schema about_url: http://github.com/ckan/ckanext-dcat - dataset_fields: - field_name: title @@ -23,6 +22,11 @@ dataset_fields: form_snippet: markdown.html form_placeholder: eg. Some useful notes about the data +- field_name: tag_string + label: Keywords + preset: tag_string_autocomplete + form_placeholder: eg. economy, mental health, government + - field_name: contact label: Contact points repeating_label: Contact point @@ -38,10 +42,28 @@ dataset_fields: label: Email display_snippet: email.html -- field_name: tag_string - label: Keywords - preset: tag_string_autocomplete - form_placeholder: eg. 
economy, mental health, government +- field_name: publisher + label: Publisher + repeating_label: Publisher + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type - field_name: license_id label: License @@ -209,4 +231,3 @@ resource_fields: # Note: if not provided, this will be autogenerated - field_name: uri label: URI - diff --git a/ckanext/dcat/schemas/publisher_organization.yaml b/ckanext/dcat/schemas/publisher_organization.yaml new file mode 100644 index 00000000..3d1f7d3b --- /dev/null +++ b/ckanext/dcat/schemas/publisher_organization.yaml @@ -0,0 +1,35 @@ +scheming_version: 2 +about_url: http://github.com/ckan/ckanext-dcat +description: > + An organization schema that implements the properties supported + by default in the dct:publisher property of a dcat:Dataset + +fields: + +- field_name: title + label: Name + validators: ignore_missing unicode_safe + form_snippet: large_text.html + form_attrs: {data-module: slug-preview-target} + +- field_name: name + label: URL + validators: not_empty unicode_safe name_validator group_name_validator + form_snippet: slug.html + form_placeholder: my-theme + +- field_name: notes + label: Description + form_snippet: markdown.html + form_placeholder: A little information about this organization. 
+ +- field_name: email + label: Email + display_snippet: email.html + +- field_name: url + label: URL + display_snippet: link.html + +- field_name: dcat_type + label: Type diff --git a/ckanext/dcat/templates/scheming/form_snippets/repeating_subfields.html b/ckanext/dcat/templates/scheming/form_snippets/repeating_subfields.html new file mode 100644 index 00000000..dec11f45 --- /dev/null +++ b/ckanext/dcat/templates/scheming/form_snippets/repeating_subfields.html @@ -0,0 +1,8 @@ +{% ckan_extends %} + +{% block add_button %} + {# Hide the Add button if we only want one set of subfields #} + {% if not field.repeating_once %} + {{ super() }} + {% endif %} +{% endblock %} diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py index a389acfd..fd167736 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py +++ b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py @@ -1,6 +1,7 @@ from builtins import str from builtins import object import json +import uuid import pytest @@ -17,7 +18,7 @@ from ckanext.dcat import utils from ckanext.dcat.processors import RDFSerializer, HYDRA from ckanext.dcat.profiles import (DCAT, DCT, ADMS, XSD, VCARD, FOAF, SCHEMA, - SKOS, LOCN, GSP, OWL, SPDX, GEOJSON_IMT, + SKOS, LOCN, GSP, OWL, SPDX, GEOJSON_IMT, DISTRIBUTION_LICENSE_FALLBACK_CONFIG) from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS from ckanext.dcat.tests.utils import BaseSerializeTest @@ -398,11 +399,17 @@ def test_publisher_extras(self): assert self._triple(g, publisher, DCT.type, URIRef(extras['publisher_type'])) def test_publisher_org(self): + org_id = str(uuid.uuid4()) + factories.Organization( + id=org_id, + name='publisher1', + title='Example Publisher from Org' + ) dataset = { 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', 'name': 'test-dataset', 'organization': { - 'id': '', + 'id': org_id, 'name': 'publisher1', 'title': 'Example Publisher from Org', } diff --git 
a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 33a507d5..ffa682b7 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -3,6 +3,7 @@ from rdflib.namespace import RDF from rdflib.term import URIRef +from ckan.tests import factories from ckan.tests.helpers import call_action from ckanext.dcat import utils @@ -74,6 +75,14 @@ def test_e2e_ckan_to_dcat(self): {"name": "Contact 1", "email": "contact1@example.org"}, {"name": "Contact 2", "email": "contact2@example.org"}, ], + "publisher": [ + { + "name": "Test Publisher", + "email": "publisher@example.org", + "url": "https://example.org", + "type": "public_body", + }, + ], "resources": [ { "name": "Resource 1", @@ -187,7 +196,32 @@ def test_e2e_ckan_to_dcat(self): g, contact_details[1][2], VCARD.hasEmail, - dataset_dict["contact"][1]["email"], + URIRef("mailto:" + dataset_dict["contact"][1]["email"]), + ) + + publisher = [t for t in g.triples((dataset_ref, DCT.publisher, None))] + + assert len(publisher) == 1 + assert self._triple( + g, publisher[0][2], FOAF.name, dataset_dict["publisher"][0]["name"] + ) + assert self._triple( + g, + publisher[0][2], + VCARD.hasEmail, + URIRef("mailto:" + dataset_dict["publisher"][0]["email"]), + ) + assert self._triple( + g, + publisher[0][2], + FOAF.homepage, + dataset_dict["publisher"][0]["url"], + ) + assert self._triple( + g, + publisher[0][2], + DCT.type, + dataset_dict["publisher"][0]["type"], ) distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] @@ -270,6 +304,60 @@ def test_e2e_ckan_to_dcat(self): == dataset_dict["resources"][0]["access_services"][0]["endpoint_url"] ) + def test_publisher_fallback_org(self): + + org = factories.Organization( + title="Some publisher org", + ) + dataset_dict = { + "name": "test-dataset-2", + "title": "Test DCAT dataset 2", + "notes": "Lorem ipsum", + "owner_org": org["id"], + } + + dataset = 
call_action("package_create", **dataset_dict) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + publisher = [t for t in g.triples((dataset_ref, DCT.publisher, None))] + + assert len(publisher) == 1 + assert self._triple(g, publisher[0][2], FOAF.name, org["title"]) + + def test_publisher_fallback_org_ignored_if_publisher_field_present(self): + + org = factories.Organization() + dataset_dict = { + "name": "test-dataset-2", + "title": "Test DCAT dataset 2", + "notes": "Lorem ipsum", + "publisher": [ + { + "name": "Test Publisher", + "email": "publisher@example.org", + "url": "https://example.org", + "type": "public_body", + }, + ], + "owner_org": org["id"], + } + + dataset = call_action("package_create", **dataset_dict) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + publisher = [t for t in g.triples((dataset_ref, DCT.publisher, None))] + + assert len(publisher) == 1 + assert self._triple( + g, publisher[0][2], FOAF.name, dataset_dict["publisher"][0]["name"] + ) + @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @@ -350,6 +438,16 @@ def test_e2e_dcat_to_ckan(self): assert dataset["contact"][0]["name"] == "Point of Contact" assert dataset["contact"][0]["email"] == "contact@some.org" + assert ( + dataset["publisher"][0]["name"] == "Publishing Organization for dataset 1" + ) + assert dataset["publisher"][0]["email"] == "contact@some.org" + assert dataset["publisher"][0]["url"] == "http://some.org" + assert ( + dataset["publisher"][0]["type"] + == "http://purl.org/adms/publishertype/NonProfitOrganisation" + ) + resource = dataset["resources"][0] # Resources: core fields From 1bce8344793ff92bb172d17f4ab4cc7383e18d11 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 29 May 2024 11:10:06 +0200 Subject: [PATCH 22/52] Fix merge errors --- ckanext/dcat/profiles/__init__.py | 1 + ckanext/dcat/profiles/base.py | 93 ++++++++- 
ckanext/dcat/profiles/euro_dcat_ap.py | 83 ++++---- ckanext/dcat/profiles/euro_dcat_ap_2.py | 188 +++++++++--------- .../dcat/profiles/euro_dcat_ap_scheming.py | 166 ++++++++++++++++ ckanext/dcat/tests/test_scheming_support.py | 3 - 6 files changed, 388 insertions(+), 146 deletions(-) create mode 100644 ckanext/dcat/profiles/euro_dcat_ap_scheming.py diff --git a/ckanext/dcat/profiles/__init__.py b/ckanext/dcat/profiles/__init__.py index 92266c72..a80a48c6 100644 --- a/ckanext/dcat/profiles/__init__.py +++ b/ckanext/dcat/profiles/__init__.py @@ -20,4 +20,5 @@ from .euro_dcat_ap import EuropeanDCATAPProfile from .euro_dcat_ap_2 import EuropeanDCATAP2Profile +from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile from .schemaorg import SchemaOrgProfile diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 4b652b91..c91a1e8e 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -7,7 +7,7 @@ from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS from geomet import wkt, InvalidGeoJSONException -from ckantoolkit import config, url_for, asbool, get_action +from ckantoolkit import config, url_for, asbool, get_action, ObjectNotFound from ckan.model.license import LicenseRegister from ckan.lib.helpers import resource_formats from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS @@ -41,7 +41,7 @@ "spdx": SPDX, } -PREFIX_MAILTO = u"mailto:" +PREFIX_MAILTO = "mailto:" GEOJSON_IMT = "https://www.iana.org/assignments/media-types/application/vnd.geo+json" @@ -105,11 +105,20 @@ class RDFProfile(object): custom profiles """ - def __init__(self, graph, compatibility_mode=False): - """Class constructor + _dataset_schema = None - Graph is an rdflib.Graph instance. + # Cache for mappings of licenses URL/title to ID built when needed in + # _license(). 
+ _licenceregister_cache = None + # Cache for organization_show details (used for publisher fallback) + _org_cache: dict = {} + + def __init__(self, graph, dataset_type="dataset", compatibility_mode=False): + """Class constructor + Graph is an rdflib.Graph instance. + A scheming dataset type can be provided, in which case the scheming schema + will be loaded so it can be used by profiles. In compatibility mode, some fields are modified to maintain compatibility with previous versions of the ckanext-dcat parsers (eg adding the `dcat_` prefix or storing comma separated lists instead @@ -120,9 +129,17 @@ def __init__(self, graph, compatibility_mode=False): self.compatibility_mode = compatibility_mode - # Cache for mappings of licenses URL/title to ID built when needed in - # _license(). - self._licenceregister_cache = None + try: + schema_show = get_action("scheming_dataset_schema_show") + try: + schema = schema_show({}, {"type": dataset_type}) + except ObjectNotFound: + raise ObjectNotFound(f"Unknown dataset schema: {dataset_type}") + + self._dataset_schema = schema + + except KeyError: + pass def _datasets(self): """ @@ -707,6 +724,64 @@ def _add_spatial_to_dict(self, dataset_dict, key, spatial): } ) + def _schema_field(self, key): + """ + Returns the schema field information if the provided key exists as a field in + the dataset schema (if one was provided) + """ + if not self._dataset_schema: + return None + + for field in self._dataset_schema["dataset_fields"]: + if field["field_name"] == key: + return field + + def _schema_resource_field(self, key): + """ + Returns the schema field information if the provided key exists as a field in + the resources fields of the dataset schema (if one was provided) + """ + if not self._dataset_schema: + return None + + for field in self._dataset_schema["resource_fields"]: + if field["field_name"] == key: + return field + + def _set_dataset_value(self, dataset_dict, key, value): + """ + Sets the value for a given key in a CKAN 
dataset dict + If a dataset schema was provided, the schema will be checked to see if + a custom field is present for the key. If so the key will be stored at + the dict root level, otherwise it will be stored as an extra. + Standard CKAN fields (defined in ROOT_DATASET_FIELDS) are always stored + at the root level. + """ + if self._schema_field(key) or key in ROOT_DATASET_FIELDS: + dataset_dict[key] = value + else: + if not dataset_dict.get("extras"): + dataset_dict["extras"] = [] + dataset_dict["extras"].append({"key": key, "value": value}) + + return dataset_dict + + def _set_list_dataset_value(self, dataset_dict, key, value): + schema_field = self._schema_field(key) + if schema_field and "scheming_multiple_text" in schema_field["validators"]: + return self._set_dataset_value(dataset_dict, key, value) + else: + return self._set_dataset_value(dataset_dict, key, json.dumps(value)) + + def _set_list_resource_value(self, resource_dict, key, value): + schema_field = self._schema_resource_field(key) + if schema_field and "scheming_multiple_text" in schema_field["validators"]: + resource_dict[key] = value + else: + resource_dict[key] = json.dumps(value) + + return resource_dict + def _get_dataset_value(self, dataset_dict, key, default=None): """ Returns the value for the given key on a CKAN dict @@ -880,7 +955,7 @@ def _without_mailto(self, mail_addr): Ensures that the mail address string has no mailto: prefix. 
""" if mail_addr: - return str(mail_addr).replace(PREFIX_MAILTO, u"") + return str(mail_addr).replace(PREFIX_MAILTO, "") else: return mail_addr diff --git a/ckanext/dcat/profiles/euro_dcat_ap.py b/ckanext/dcat/profiles/euro_dcat_ap.py index 9a4c853b..b7e4cae4 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_dcat_ap.py @@ -20,11 +20,9 @@ DCAT, DCT, ADMS, - XSD, VCARD, FOAF, SCHEMA, - SKOS, LOCN, GSP, OWL, @@ -354,51 +352,66 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ) # Publisher - if any( + publisher_ref = None + + if dataset_dict.get("publisher"): + # Scheming publisher field: will be handled in a separate profile + pass + elif any( [ self._get_dataset_value(dataset_dict, "publisher_uri"), self._get_dataset_value(dataset_dict, "publisher_name"), - dataset_dict.get("organization"), ] ): - + # Legacy publisher_* extras publisher_uri = self._get_dataset_value(dataset_dict, "publisher_uri") - publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict) publisher_name = self._get_dataset_value(dataset_dict, "publisher_name") if publisher_uri: - publisher_details = CleanedURIRef(publisher_uri) - elif not publisher_name and publisher_uri_fallback: - # neither URI nor name are available, use organization as fallback - publisher_details = CleanedURIRef(publisher_uri_fallback) + publisher_ref = CleanedURIRef(publisher_uri) else: # No publisher_uri - publisher_details = BNode() - - g.add((publisher_details, RDF.type, FOAF.Organization)) - g.add((dataset_ref, DCT.publisher, publisher_details)) - - # In case no name and URI are available, again fall back to organization. - # If no name but an URI is available, the name literal remains empty to - # avoid mixing organization and dataset values. 
- if ( - not publisher_name - and not publisher_uri - and dataset_dict.get("organization") - ): - publisher_name = dataset_dict["organization"]["title"] - - g.add((publisher_details, FOAF.name, Literal(publisher_name))) - # TODO: It would make sense to fallback these to organization - # fields but they are not in the default schema and the - # `organization` object in the dataset_dict does not include - # custom fields + publisher_ref = BNode() + publisher_details = { + "name": publisher_name, + "email": self._get_dataset_value(dataset_dict, "publisher_email"), + "url": self._get_dataset_value(dataset_dict, "publisher_url"), + "type": self._get_dataset_value(dataset_dict, "publisher_type"), + } + elif dataset_dict.get("organization"): + # Fall back to dataset org + org_id = dataset_dict["organization"]["id"] + org_dict = None + if org_id in self._org_cache: + org_dict = self._org_cache[org_id] + else: + try: + org_dict = toolkit.get_action("organization_show")( + {"ignore_auth": True}, {"id": org_id} + ) + self._org_cache[org_id] = org_dict + except toolkit.ObjectNotFound: + pass + if org_dict: + publisher_ref = CleanedURIRef( + publisher_uri_organization_fallback(dataset_dict) + ) + publisher_details = { + "name": org_dict.get("title"), + "email": org_dict.get("email"), + "url": org_dict.get("url"), + "type": org_dict.get("dcat_type"), + } + # Add to graph + if publisher_ref: + g.add((publisher_ref, RDF.type, FOAF.Organization)) + g.add((dataset_ref, DCT.publisher, publisher_ref)) items = [ - ("publisher_email", FOAF.mbox, None, Literal), - ("publisher_url", FOAF.homepage, None, URIRef), - ("publisher_type", DCT.type, None, URIRefOrLiteral), + ("name", FOAF.name, None, Literal), + ("email", FOAF.mbox, None, Literal), + ("url", FOAF.homepage, None, URIRef), + ("type", DCT.type, None, URIRefOrLiteral), ] - - self._add_triples_from_dict(dataset_dict, publisher_details, items) + self._add_triples_from_dict(publisher_details, publisher_ref, items) # Temporal start = 
self._get_dataset_value(dataset_dict, "temporal_start") diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index 6f40e3ab..3db77d61 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -91,56 +91,52 @@ def parse_dataset(self, dataset_dict, dataset_ref): if values: resource_dict[key] = json.dumps(values) - # Access services - access_service_list = [] + # Access services + access_service_list = [] - for access_service in self.g.objects( - distribution, DCAT.accessService + for access_service in self.g.objects( + distribution, DCAT.accessService + ): + access_service_dict = {} + + # Simple values + for key, predicate in ( + ("availability", DCATAP.availability), + ("title", DCT.title), + ("endpoint_description", DCAT.endpointDescription), + ("license", DCT.license), + ("access_rights", DCT.accessRights), + ("description", DCT.description), + ): + value = self._object_value(access_service, predicate) + if value: + access_service_dict[key] = value + # List + for key, predicate in ( + ("endpoint_url", DCAT.endpointURL), + ("serves_dataset", DCAT.servesDataset), ): - access_service_dict = {} - - # Simple values - for key, predicate in ( - ("availability", DCATAP.availability), - ("title", DCT.title), - ("endpoint_description", DCAT.endpointDescription), - ("license", DCT.license), - ("access_rights", DCT.accessRights), - ("description", DCT.description), - ): - value = self._object_value(access_service, predicate) - if value: - access_service_dict[key] = value - # List - for key, predicate in ( - ("endpoint_url", DCAT.endpointURL), - ("serves_dataset", DCAT.servesDataset), - ): - values = self._object_value_list( - access_service, predicate - ) - if values: - access_service_dict[key] = values - - # Access service URI (explicitly show the missing ones) - access_service_dict["uri"] = ( - str(access_service) - if isinstance(access_service, URIRef) - else "" - ) - - # Remember the 
(internal) access service reference for referencing in - # further profiles, e.g. for adding more properties - access_service_dict["access_service_ref"] = str( - access_service - ) - - access_service_list.append(access_service_dict) - - if access_service_list: - resource_dict["access_services"] = json.dumps( - access_service_list - ) + values = self._object_value_list(access_service, predicate) + if values: + access_service_dict[key] = values + + # Access service URI (explicitly show the missing ones) + access_service_dict["uri"] = ( + str(access_service) + if isinstance(access_service, URIRef) + else "" + ) + + # Remember the (internal) access service reference for referencing in + # further profiles, e.g. for adding more properties + access_service_dict["access_service_ref"] = str(access_service) + + access_service_list.append(access_service_dict) + + if access_service_list: + resource_dict["access_services"] = json.dumps( + access_service_list + ) return dataset_dict @@ -253,60 +249,54 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ] self._add_list_triples_from_dict(resource_dict, distribution, items) - try: - access_service_list = json.loads( - resource_dict.get("access_services", "[]") + # Access services + access_service_list = resource_dict.get("access_services", []) + if isinstance(access_service_list, str): + try: + access_service_list = json.loads(access_service_list) + except ValueError: + access_service_list = [] + + for access_service_dict in access_service_list: + + access_service_uri = access_service_dict.get("uri") + if access_service_uri: + access_service_node = CleanedURIRef(access_service_uri) + else: + access_service_node = BNode() + # Remember the (internal) access service reference for referencing in + # further profiles + access_service_dict["access_service_ref"] = str(access_service_node) + + self.g.add((distribution, DCAT.accessService, access_service_node)) + + self.g.add((access_service_node, RDF.type, DCAT.DataService)) + + 
# Simple values + items = [ + ("availability", DCATAP.availability, None, URIRefOrLiteral), + ("license", DCT.license, None, URIRefOrLiteral), + ("access_rights", DCT.accessRights, None, URIRefOrLiteral), + ("title", DCT.title, None, Literal), + ("endpoint_description", DCAT.endpointDescription, None, Literal), + ("description", DCT.description, None, Literal), + ] + + self._add_triples_from_dict( + access_service_dict, access_service_node, items ) - # Access service - for access_service_dict in access_service_list: - - access_service_uri = access_service_dict.get("uri") - if access_service_uri: - access_service_node = CleanedURIRef(access_service_uri) - else: - access_service_node = BNode() - # Remember the (internal) access service reference for referencing in - # further profiles - access_service_dict["access_service_ref"] = str( - access_service_node - ) - - self.g.add((distribution, DCAT.accessService, access_service_node)) - - self.g.add((access_service_node, RDF.type, DCAT.DataService)) - - # Simple values - items = [ - ("availability", DCATAP.availability, None, URIRefOrLiteral), - ("license", DCT.license, None, URIRefOrLiteral), - ("access_rights", DCT.accessRights, None, URIRefOrLiteral), - ("title", DCT.title, None, Literal), - ( - "endpoint_description", - DCAT.endpointDescription, - None, - Literal, - ), - ("description", DCT.description, None, Literal), - ] - - self._add_triples_from_dict( - access_service_dict, access_service_node, items - ) - # Lists - items = [ - ("endpoint_url", DCAT.endpointURL, None, URIRefOrLiteral), - ("serves_dataset", DCAT.servesDataset, None, URIRefOrLiteral), - ] - self._add_list_triples_from_dict( - access_service_dict, access_service_node, items - ) + # Lists + items = [ + ("endpoint_url", DCAT.endpointURL, None, URIRefOrLiteral), + ("serves_dataset", DCAT.servesDataset, None, URIRefOrLiteral), + ] + self._add_list_triples_from_dict( + access_service_dict, access_service_node, items + ) - if access_service_list: - 
resource_dict["access_services"] = json.dumps(access_service_list) - except ValueError: - pass + if access_service_list: + resource_dict["access_services"] = json.dumps(access_service_list) def graph_from_catalog(self, catalog_dict, catalog_ref): diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py new file mode 100644 index 00000000..6bd570a9 --- /dev/null +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -0,0 +1,166 @@ +import json + +from rdflib import URIRef, BNode +from .base import RDFProfile, CleanedURIRef, URIRefOrLiteral +from .base import ( + RDF, + XSD, + DCAT, + DCT, + VCARD, + FOAF, +) + + +class EuropeanDCATAPSchemingProfile(RDFProfile): + """ + This is a compatibilty profile meant to add support for ckanext-scheming to the existing + `euro_dcat_ap` and `euro_dcat_ap_2` profiles. + It does not add or remove any properties from these profiles, it just transforms the + resulting dataset_dict so it is compatible with a ckanext-scheming schema + TODO: summarize changes and link to docs + """ + + def parse_dataset(self, dataset_dict, dataset_ref): + + if not self._dataset_schema: + # Not using scheming + return dataset_dict + + # Move extras to root + + extras_to_remove = [] + extras = dataset_dict.get("extras", []) + for extra in extras: + if self._schema_field(extra["key"]): + # This is a field defined in the dataset schema + dataset_dict[extra["key"]] = extra["value"] + extras_to_remove.append(extra["key"]) + + dataset_dict["extras"] = [e for e in extras if e["key"] not in extras_to_remove] + + # Parse lists + def _parse_list_value(data_dict, field_name): + schema_field = self._schema_field( + field_name + ) or self._schema_resource_field(field_name) + + if schema_field and "scheming_multiple_text" in schema_field.get( + "validators", [] + ): + if isinstance(data_dict[field_name], str): + try: + data_dict[field_name] = json.loads(data_dict[field_name]) + except ValueError: + pass + + for 
field_name in dataset_dict.keys(): + _parse_list_value(dataset_dict, field_name) + + for resource_dict in dataset_dict.get("resources", []): + for field_name in resource_dict.keys(): + _parse_list_value(resource_dict, field_name) + + # Repeating subfields + for schema_field in self._dataset_schema["dataset_fields"]: + if "repeating_subfields" in schema_field: + # Check if existing extras need to be migrated + field_name = schema_field["field_name"] + new_extras = [] + new_dict = {} + for extra in dataset_dict.get("extras", []): + if extra["key"].startswith(f"{field_name}_"): + subfield = extra["key"][extra["key"].index("_") + 1 :] + if subfield in [ + f["field_name"] for f in schema_field["repeating_subfields"] + ]: + new_dict[subfield] = extra["value"] + else: + new_extras.append(extra) + else: + new_extras.append(extra) + if new_dict: + dataset_dict[field_name] = [new_dict] + dataset_dict["extras"] = new_extras + + for schema_field in self._dataset_schema["resource_fields"]: + if "repeating_subfields" in schema_field: + # Check if value needs to be load from JSON + field_name = schema_field["field_name"] + for resource_dict in dataset_dict.get("resources", []): + if resource_dict.get(field_name) and isinstance( + resource_dict[field_name], str + ): + try: + # TODO: load only subfields in schema? 
+ resource_dict[field_name] = json.loads( + resource_dict[field_name] + ) + except ValueError: + pass + + return dataset_dict + + def graph_from_dataset(self, dataset_dict, dataset_ref): + + contact = dataset_dict.get("contact") + if isinstance(contact, list) and len(contact): + for item in contact: + contact_uri = item.get("uri") + if contact_uri: + contact_details = CleanedURIRef(contact_uri) + else: + contact_details = BNode() + + self.g.add((contact_details, RDF.type, VCARD.Organization)) + self.g.add((dataset_ref, DCAT.contactPoint, contact_details)) + + self._add_triple_from_dict(item, contact_details, VCARD.fn, "name") + # Add mail address as URIRef, and ensure it has a mailto: prefix + self._add_triple_from_dict( + item, + contact_details, + VCARD.hasEmail, + "email", + _type=URIRef, + value_modifier=self._add_mailto, + ) + + publisher = dataset_dict.get("publisher") + if isinstance(publisher, list) and len(publisher): + publisher = publisher[0] + publisher_uri = publisher.get("uri") + if publisher_uri: + publisher_ref = CleanedURIRef(publisher_uri) + else: + publisher_ref = BNode() + + self.g.add((publisher_ref, RDF.type, FOAF.Organization)) + self.g.add((dataset_ref, DCT.publisher, publisher_ref)) + + self._add_triple_from_dict(publisher, publisher_ref, FOAF.name, "name") + self._add_triple_from_dict( + publisher, publisher_ref, FOAF.homepage, "url", URIRef + ) + self._add_triple_from_dict( + publisher, publisher_ref, DCT.type, "type", URIRefOrLiteral + ) + self._add_triple_from_dict( + publisher, + publisher_ref, + VCARD.hasEmail, + "email", + _type=URIRef, + value_modifier=self._add_mailto, + ) + + resources = dataset_dict.get("resources", []) + for resource in resources: + if resource.get("access_services"): + if isinstance(resource["access_services"], str): + try: + resource["access_services"] = json.loads( + resource["access_services"] + ) + except ValueError: + pass diff --git a/ckanext/dcat/tests/test_scheming_support.py 
b/ckanext/dcat/tests/test_scheming_support.py index ffa682b7..e2508e0f 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -20,9 +20,6 @@ LOCN, GSP, OWL, - SPDX, - GEOJSON_IMT, - DISTRIBUTION_LICENSE_FALLBACK_CONFIG, ) from ckanext.dcat.tests.utils import BaseSerializeTest, BaseParseTest From cd1d3f0822208e1e8291cdec787710cbeb4ba038 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 30 May 2024 10:36:54 +0200 Subject: [PATCH 23/52] [#56] Add temporal extent --- .../dcat/profiles/euro_dcat_ap_scheming.py | 21 +++++- ckanext/dcat/schemas/dcat_ap_2.1.yaml | 14 +++- ckanext/dcat/tests/test_scheming_support.py | 68 +++++++++++++++++++ 3 files changed, 101 insertions(+), 2 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index 6bd570a9..4353d2a7 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -9,6 +9,9 @@ DCT, VCARD, FOAF, + SCHEMA, + SKOS, + LOCN, ) @@ -62,14 +65,18 @@ def _parse_list_value(data_dict, field_name): _parse_list_value(resource_dict, field_name) # Repeating subfields + new_fields_mapping = { + "temporal_coverage": "temporal" + } for schema_field in self._dataset_schema["dataset_fields"]: if "repeating_subfields" in schema_field: # Check if existing extras need to be migrated field_name = schema_field["field_name"] new_extras = [] new_dict = {} + check_name = new_fields_mappings.get(field_name, field_name) for extra in dataset_dict.get("extras", []): - if extra["key"].startswith(f"{field_name}_"): + if extra["key"].startswith(f"{check_name}_"): subfield = extra["key"][extra["key"].index("_") + 1 :] if subfield in [ f["field_name"] for f in schema_field["repeating_subfields"] @@ -83,6 +90,7 @@ def _parse_list_value(data_dict, field_name): dataset_dict[field_name] = [new_dict] dataset_dict["extras"] = new_extras + # Repeating subfields: resources for schema_field 
in self._dataset_schema["resource_fields"]: if "repeating_subfields" in schema_field: # Check if value needs to be load from JSON @@ -154,6 +162,17 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): value_modifier=self._add_mailto, ) + temporal = dataset_dict.get("temporal_coverage") + if isinstance(temporal, list) and len(temporal): + for item in temporal: + temporal_ref = BNode() + self.g.add((temporal_ref, RDF.type, DCT.PeriodOfTime)) + if item.get("start"): + self._add_date_triple(temporal_ref, SCHEMA.startDate, item["start"]) + if item.get("end"): + self._add_date_triple(temporal_ref, SCHEMA.endDate, item["end"]) + self.g.add((dataset_ref, DCT.temporal, temporal_ref)) + resources = dataset_dict.get("resources", []) for resource in resources: if resource.get("access_services"): diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index f5373c3a..3a848751 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -107,7 +107,19 @@ dataset_fields: - field_name: dcat_type label: Type # TODO: controlled vocabulary? 
- # + +- field_name: temporal_coverage + label: Temporal coverage + repeating_subfields: + + - field_name: start + label: Start + # TODO: dcat_date preset + + - field_name: end + label: End + # TODO: dcat_date preset + - field_name: access_rights label: Access rights validators: ignore_missing unicode_safe diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index e2508e0f..f37d57f0 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -80,6 +80,10 @@ def test_e2e_ckan_to_dcat(self): "type": "public_body", }, ], + "temporal_coverage": [ + {"start": "1905-03-01", "end": "2013-01-05"}, + {"start": "2024-04-10", "end": "2024-05-29"}, + ], "resources": [ { "name": "Resource 1", @@ -221,6 +225,38 @@ def test_e2e_ckan_to_dcat(self): dataset_dict["publisher"][0]["type"], ) + temporal = [t for t in g.triples((dataset_ref, DCT.temporal, None))] + + assert len(temporal) == len(dataset["temporal_coverage"]) + assert self._triple( + g, + temporal[0][2], + SCHEMA.startDate, + dataset_dict["temporal_coverage"][0]["start"] + "T00:00:00", + data_type=XSD.dateTime, + ) + assert self._triple( + g, + temporal[0][2], + SCHEMA.endDate, + dataset_dict["temporal_coverage"][0]["end"] + "T00:00:00", + data_type=XSD.dateTime, + ) + assert self._triple( + g, + temporal[1][2], + SCHEMA.startDate, + dataset_dict["temporal_coverage"][1]["start"] + "T00:00:00", + data_type=XSD.dateTime, + ) + assert self._triple( + g, + temporal[1][2], + SCHEMA.endDate, + dataset_dict["temporal_coverage"][1]["end"] + "T00:00:00", + data_type=XSD.dateTime, + ) + distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] # Resources: core fields @@ -355,6 +391,36 @@ def test_publisher_fallback_org_ignored_if_publisher_field_present(self): g, publisher[0][2], FOAF.name, dataset_dict["publisher"][0]["name"] ) + def test_legacy_fields(self): + + dataset_dict = { + "name": "test-dataset-2", + 
"title": "Test DCAT dataset 2", + "notes": "Lorem ipsum", + "extras": [ + {"key": "contact_name", "value": "Test Contact"}, + {"key": "contact_email", "value": "contact@example.org"}, + {"key": "publisher_name", "value": "Test Publisher"}, + {"key": "publisher_email", "value": "publisher@example.org"}, + {"key": "publisher_url", "value": "https://example.org"}, + {"key": "publisher_type", "value": "public_body"}, + ], + } + + dataset = call_action("package_create", **dataset_dict) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + contact_details = [t for t in g.triples((dataset_ref, DCAT.contactPoint, None))] + assert len(contact_details) == 1 + assert self._triple(g, contact_details[0][2], VCARD.fn, "Test Contact") + + publisher = [t for t in g.triples((dataset_ref, DCT.publisher, None))] + assert len(publisher) == 1 + assert self._triple(g, publisher[0][2], FOAF.name, "Test Publisher") + @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @@ -444,6 +510,8 @@ def test_e2e_dcat_to_ckan(self): dataset["publisher"][0]["type"] == "http://purl.org/adms/publishertype/NonProfitOrganisation" ) + assert dataset["temporal_coverage"][0]["start"] == "1905-03-01" + assert dataset["temporal_coverage"][0]["end"] == "2013-01-05" resource = dataset["resources"][0] From 103aa08eb791d1181315cbf131cfeb42008eba76 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 30 May 2024 12:18:22 +0200 Subject: [PATCH 24/52] [#56] Add support for spatial_coverage New repeating subfield, supporting all properties for the location class: uri, text, geom, bbox and centroid. Used spatial_coverage as name to not interfere with the `spatial` field expected by ckanext-scheming, in a future commit we will extract the relevant value to index it as a geometry. 
--- ckanext/dcat/plugins/__init__.py | 10 +-- ckanext/dcat/profiles/base.py | 6 +- .../dcat/profiles/euro_dcat_ap_scheming.py | 27 +++++++- ckanext/dcat/schemas/dcat_ap_2.1.yaml | 19 ++++++ ckanext/dcat/tests/test_scheming_support.py | 61 ++++++++++++++++++- 5 files changed, 114 insertions(+), 9 deletions(-) diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index 27d02df8..291b7663 100644 --- a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -147,14 +147,16 @@ def before_dataset_index(self, dataset_dict): pass if schema: + # TODO: https://github.com/ckan/ckanext-dcat/pull/281#discussion_r1610549936 for field in schema['dataset_fields']: if field['field_name'] in dataset_dict and 'repeating_subfields' in field: for index, item in enumerate(dataset_dict[field['field_name']]): for key in item: - # Index a flattened version - new_key = f'{field["field_name"]}_{index}_{key}' - - dataset_dict[new_key] = dataset_dict[field['field_name']][index][key] + value = dataset_dict[field['field_name']][index][key] + if not isinstance(value, dict): + # Index a flattened version + new_key = f'{field["field_name"]}_{index}_{key}' + dataset_dict[new_key] = value dataset_dict.pop(field['field_name'], None) return dataset_dict diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index c91a1e8e..8299b718 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -702,17 +702,19 @@ def _add_spatial_value_to_graph(self, spatial_ref, predicate, value): self.g.add((spatial_ref, predicate, Literal(value, datatype=GEOJSON_IMT))) # WKT, because GeoDCAT-AP says so try: + if isinstance(value, str): + value = json.loads(value) self.g.add( ( spatial_ref, predicate, Literal( - wkt.dumps(json.loads(value), decimals=4), + wkt.dumps(value, decimals=4), datatype=GSP.wktLiteral, ), ) ) - except (TypeError, ValueError, InvalidGeoJSONException): + except (TypeError, ValueError, InvalidGeoJSONException) as 
e: pass def _add_spatial_to_dict(self, dataset_dict, key, spatial): diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index 4353d2a7..6ff50a39 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -1,6 +1,6 @@ import json -from rdflib import URIRef, BNode +from rdflib import URIRef, BNode, Literal from .base import RDFProfile, CleanedURIRef, URIRefOrLiteral from .base import ( RDF, @@ -74,7 +74,7 @@ def _parse_list_value(data_dict, field_name): field_name = schema_field["field_name"] new_extras = [] new_dict = {} - check_name = new_fields_mappings.get(field_name, field_name) + check_name = new_fields_mapping.get(field_name, field_name) for extra in dataset_dict.get("extras", []): if extra["key"].startswith(f"{check_name}_"): subfield = extra["key"][extra["key"].index("_") + 1 :] @@ -173,6 +173,29 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): self._add_date_triple(temporal_ref, SCHEMA.endDate, item["end"]) self.g.add((dataset_ref, DCT.temporal, temporal_ref)) + spatial = dataset_dict.get("spatial_coverage") + if isinstance(spatial, list) and len(spatial): + for item in spatial: + if item.get("uri"): + spatial_ref = CleanedURIRef(item["uri"]) + else: + spatial_ref = BNode() + self.g.add((spatial_ref, RDF.type, DCT.Location)) + self.g.add((dataset_ref, DCT.spatial, spatial_ref)) + + if item.get("text"): + self.g.add((spatial_ref, SKOS.prefLabel, Literal(item["text"]))) + + for field in [ + ("geom", LOCN.geometry), + ("bbox", DCAT.bbox), + ("centroid", DCAT.centroid), + ]: + if item.get(field[0]): + self._add_spatial_value_to_graph( + spatial_ref, field[1], item[field[0]] + ) + resources = dataset_dict.get("resources", []) for resource in resources: if resource.get("access_services"): diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index 3a848751..3c4b7232 100644 --- 
a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -120,6 +120,25 @@ dataset_fields: label: End # TODO: dcat_date preset +- field_name: spatial_coverage + label: Spatial coverage + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: text + label: Label + + - field_name: geom + label: Geometry + + - field_name: bbox + label: Bounding Box + + - field_name: centroid + label: Centroid + - field_name: access_rights label: Access rights validators: ignore_missing unicode_safe diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index f37d57f0..ae21f7b9 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -1,7 +1,7 @@ import pytest - from rdflib.namespace import RDF from rdflib.term import URIRef +from geomet import wkt from ckan.tests import factories from ckan.tests.helpers import call_action @@ -20,10 +20,15 @@ LOCN, GSP, OWL, + GEOJSON_IMT, ) from ckanext.dcat.tests.utils import BaseSerializeTest, BaseParseTest +# TODO: tests for spatial coverage +# TODO: index "spatial" extra + + @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config( @@ -84,6 +89,37 @@ def test_e2e_ckan_to_dcat(self): {"start": "1905-03-01", "end": "2013-01-05"}, {"start": "2024-04-10", "end": "2024-05-29"}, ], + "spatial_coverage": [ + { + "geom": { + "type": "Polygon", + "coordinates": [ + [ + [11.9936, 54.0486], + [11.9936, 54.2466], + [12.3045, 54.2466], + [12.3045, 54.0486], + [11.9936, 54.0486], + ] + ], + }, + "text": "Tarragona", + "uri": "https://sws.geonames.org/6361390/", + "bbox": { + "type": "Polygon", + "coordinates": [ + [ + [-2.1604, 42.7611], + [-2.0938, 42.7611], + [-2.0938, 42.7931], + [-2.1604, 42.7931], + [-2.1604, 42.7611], + ] + ], + }, + "centroid": {"type": "Point", "coordinates": [1.26639, 41.12386]}, + } + ], "resources": 
[ { "name": "Resource 1", @@ -257,6 +293,29 @@ def test_e2e_ckan_to_dcat(self): data_type=XSD.dateTime, ) + spatial = [t for t in g.triples((dataset_ref, DCT.spatial, None))] + assert len(spatial) == len(dataset["spatial_coverage"]) + assert str(spatial[0][2]) == dataset["spatial_coverage"][0]["uri"] + assert self._triple(g, spatial[0][2], RDF.type, DCT.Location) + assert self._triple( + g, spatial[0][2], SKOS.prefLabel, dataset["spatial_coverage"][0]["text"] + ) + + assert len([t for t in g.triples((spatial[0][2], LOCN.geometry, None))]) == 2 + # Geometry in GeoJSON + assert self._triple( + g, + spatial[0][2], + LOCN.geometry, + dataset["spatial_coverage"][0]["geom"], + GEOJSON_IMT, + ) + # Geometry in WKT + wkt_geom = wkt.dumps( + dataset["spatial_coverage"][0]["geom"], decimals=4 + ) + assert self._triple(g, spatial[0][2], LOCN.geometry, wkt_geom, GSP.wktLiteral) + distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] # Resources: core fields From a862d77766c6b0c25e1581af3e72a91697357877 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 30 May 2024 12:24:13 +0200 Subject: [PATCH 25/52] [#56] Add missing var --- ckanext/dcat/profiles/base.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 8299b718..23136496 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -45,6 +45,19 @@ GEOJSON_IMT = "https://www.iana.org/assignments/media-types/application/vnd.geo+json" +ROOT_DATASET_FIELDS = [ + 'name', + 'title', + 'url', + 'version', + 'tags', + 'license_id', + 'maintainer', + 'maintainer_email', + 'author', + 'author_email', +] + class URIRefOrLiteral(object): """Helper which creates an URIRef if the value appears to be an http URL, From aa23a706af3559cb39aed0f6d6c058344f3c67d2 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 30 May 2024 12:50:31 +0200 Subject: [PATCH 26/52] [#56] Update repeating subfields indexing logic The 
previous field names based on indexes didn't allow to retrieve results easily. We are now flattening all values for the same subfield to at least get a text hit. See https://github.com/ckan/ckanext-dcat/pull/281#discussion_r1610549936 --- ckanext/dcat/plugins/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index 291b7663..fef6aa5d 100644 --- a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -147,16 +147,18 @@ def before_dataset_index(self, dataset_dict): pass if schema: - # TODO: https://github.com/ckan/ckanext-dcat/pull/281#discussion_r1610549936 for field in schema['dataset_fields']: if field['field_name'] in dataset_dict and 'repeating_subfields' in field: - for index, item in enumerate(dataset_dict[field['field_name']]): + for item in dataset_dict[field['field_name']]: for key in item: - value = dataset_dict[field['field_name']][index][key] + value = item[key] if not isinstance(value, dict): # Index a flattened version - new_key = f'{field["field_name"]}_{index}_{key}' - dataset_dict[new_key] = value + new_key = f'{field["field_name"]}__{key}' + if not dataset_dict.get(new_key): + dataset_dict[new_key] = "" + dataset_dict[new_key] += " " + value + dataset_dict.pop(field['field_name'], None) return dataset_dict From 4256e73a03ecf58a355121014a39bb439a548a4d Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 30 May 2024 14:24:15 +0200 Subject: [PATCH 27/52] [#56] Store geometry in spatial field for indexing If the `spatial_coverage` field is present, store the first geometry found so ckanext-spatial can pick it up for spatial indexing. 
Added indexing tests --- ckanext/dcat/plugins/__init__.py | 36 ++++++++- ckanext/dcat/tests/test_scheming_support.py | 82 ++++++++++++++++++++- 2 files changed, 111 insertions(+), 7 deletions(-) diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index fef6aa5d..e424059b 100644 --- a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -2,6 +2,7 @@ from builtins import object import os +import json from ckantoolkit import config @@ -146,6 +147,7 @@ def before_dataset_index(self, dataset_dict): except KeyError: pass + spatial = None if schema: for field in schema['dataset_fields']: if field['field_name'] in dataset_dict and 'repeating_subfields' in field: @@ -156,10 +158,36 @@ def before_dataset_index(self, dataset_dict): # Index a flattened version new_key = f'{field["field_name"]}__{key}' if not dataset_dict.get(new_key): - dataset_dict[new_key] = "" - dataset_dict[new_key] += " " + value - - dataset_dict.pop(field['field_name'], None) + dataset_dict[new_key] = value + else: + dataset_dict[new_key] += ' ' + value + + subfields = dataset_dict.pop(field['field_name'], None) + if field['field_name'] == 'spatial_coverage': + spatial = subfields + + # Store the first geometry found so ckanext-spatial can pick it up for indexing + def _check_for_a_geom(spatial_dict): + value = None + + for field in ('geom', 'bbox', 'centroid'): + if spatial_dict.get(field): + value = spatial_dict[field] + if isinstance(value, dict): + try: + value = json.dumps(value) + break + except ValueError: + pass + return value + + if spatial and not dataset_dict.get('spatial'): + for item in spatial: + value = _check_for_a_geom(item) + if value: + dataset_dict['spatial'] = value + dataset_dict['extras_spatial'] = value + break return dataset_dict diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index ae21f7b9..978f25c2 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ 
b/ckanext/dcat/tests/test_scheming_support.py @@ -1,3 +1,6 @@ +from unittest import mock +import json + import pytest from rdflib.namespace import RDF from rdflib.term import URIRef @@ -311,9 +314,7 @@ def test_e2e_ckan_to_dcat(self): GEOJSON_IMT, ) # Geometry in WKT - wkt_geom = wkt.dumps( - dataset["spatial_coverage"][0]["geom"], decimals=4 - ) + wkt_geom = wkt.dumps(dataset["spatial_coverage"][0]["geom"], decimals=4) assert self._triple(g, spatial[0][2], LOCN.geometry, wkt_geom, GSP.wktLiteral) distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] @@ -606,3 +607,78 @@ def test_e2e_dcat_to_ckan(self): assert resource["access_services"][0]["endpoint_url"] == [ "http://publications.europa.eu/webapi/rdf/sparql" ] + + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" +) +@pytest.mark.ckan_config("scheming.presets", "ckanext.scheming:presets.json") +@pytest.mark.ckan_config( + "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" +) +class TestSchemingIndexFields: + def test_repeating_subfields_index(self): + + dataset_dict = { + # Core fields + "name": "test-dataset", + "title": "Test DCAT dataset", + "notes": "Some notes", + # Repeating subfields + "contact": [ + {"name": "Contact 1", "email": "contact1@example.org"}, + {"name": "Contact 2", "email": "contact2@example.org"}, + ], + } + + with mock.patch("ckan.lib.search.index.make_connection") as m: + call_action("package_create", **dataset_dict) + + # Dict sent to Solr + search_dict = m.mock_calls[1].kwargs["docs"][0] + assert search_dict["contact__name"] == "Contact 1 Contact 2" + assert ( + search_dict["contact__email"] + == "contact1@example.org contact2@example.org" + ) + + def test_spatial_field(self): + + dataset_dict = { + # Core fields + "name": "test-dataset", + "title": "Test DCAT dataset", + "notes": 
"Some notes", + "spatial_coverage": [ + { + "uri": "https://sws.geonames.org/6361390/", + "centroid": {"type": "Point", "coordinates": [1.26639, 41.12386]}, + }, + { + "geom": { + "type": "Polygon", + "coordinates": [ + [ + [11.9936, 54.0486], + [11.9936, 54.2466], + [12.3045, 54.2466], + [12.3045, 54.0486], + [11.9936, 54.0486], + ] + ], + }, + "text": "Tarragona", + }, + ], + } + + with mock.patch("ckan.lib.search.index.make_connection") as m: + call_action("package_create", **dataset_dict) + + # Dict sent to Solr + search_dict = m.mock_calls[1].kwargs["docs"][0] + assert search_dict["spatial"] == json.dumps( + dataset_dict["spatial_coverage"][0]["centroid"] + ) From afb74d1ee105334b922092a8389bb5539c9bd228 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 3 Jun 2024 15:47:19 +0200 Subject: [PATCH 28/52] [#56] Add rest of DCAT-AP 1 and 2.1 fields At least the ones supported by the current processors. TODO: * spatial_resolution in meters: needs a new multiple_text_decimal validator * hvd_category: will be done as part of the wider HVD work --- ckanext/dcat/schemas/dcat_ap_2.1.yaml | 56 +++++++++ ckanext/dcat/tests/test_scheming_support.py | 131 ++++++++++++++++---- examples/dataset.rdf | 11 ++ 3 files changed, 176 insertions(+), 22 deletions(-) diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index 3c4b7232..96fa2cb4 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -120,6 +120,11 @@ dataset_fields: label: End # TODO: dcat_date preset +- field_name: temporal_resolution + label: Temporal resolution + preset: multiple_text + validators: ignore_missing scheming_multiple_text + - field_name: spatial_coverage label: Spatial coverage repeating_subfields: @@ -139,6 +144,12 @@ dataset_fields: - field_name: centroid label: Centroid +#- field_name: spatial_resolution_in_meters +# label: Spatial resolution in meters +# preset: multiple_text +# validators: ignore_missing 
scheming_multiple_text +# TODO: scheming_multiple_decimal + - field_name: access_rights label: Access rights validators: ignore_missing unicode_safe @@ -175,6 +186,23 @@ dataset_fields: preset: multiple_text validators: ignore_missing scheming_multiple_text +- field_name: is_referenced_by + label: Is referenced by + preset: multiple_text + validators: ignore_missing scheming_multiple_text + +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + +#- field_name: hvd_category +# label: HVD Category +# preset: multiple_text +# validators: ignore_missing scheming_multiple_text +# TODO: implement separately as part of wider HVD support + + # Note: if not provided, this will be autogenerated - field_name: uri label: URI @@ -199,15 +227,37 @@ resource_fields: label: Format preset: resource_format_autocomplete +- field_name: mimetype + label: Media type + # TODO: get from format + +- field_name: compress_format + label: Compress format + # TODO: media type validator + +- field_name: package_format + label: Package format + # TODO: media type validator + - field_name: size label: Size # TODO: number validator / snippet +- field_name: hash + label: Hash + # TODO: generate for uploads? 
+ +- field_name: hash_algorithm + label: Hash Algorithm + - field_name: rights label: Rights form_snippet: markdown.html form_placeholder: Some statement about the rights associated with the resource +- field_name: availability + label: Availability + - field_name: status label: Status @@ -233,6 +283,7 @@ resource_fields: - field_name: language label: Language preset: multiple_text + validators: ignore_missing scheming_multiple_text - field_name: documentation label: Documentation @@ -244,6 +295,11 @@ resource_fields: preset: multiple_text validators: ignore_missing scheming_multiple_text +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + - field_name: access_services label: Access services repeating_label: Access service diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 978f25c2..bce24665 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -13,6 +13,7 @@ from ckanext.dcat.processors import RDFSerializer, RDFParser from ckanext.dcat.profiles import ( DCAT, + DCATAP, DCT, ADMS, XSD, @@ -24,6 +25,7 @@ GSP, OWL, GEOJSON_IMT, + SPDX, ) from ckanext.dcat.tests.utils import BaseSerializeTest, BaseParseTest @@ -75,6 +77,14 @@ def test_e2e_ckan_to_dcat(self): "language": ["en", "ca", "es"], "documentation": ["https://example.org/some-doc.html"], "conforms_to": ["Standard 1", "Standard 2"], + "is_referenced_by": [ + "https://doi.org/10.1038/sdata.2018.22", + "test_isreferencedby", + ], + "applicable_legislation": [ + "http://data.europa.eu/eli/reg_impl/2023/138/oj", + "http://data.europa.eu/eli/reg_impl/2023/138/oj_alt", + ], # Repeating subfields "contact": [ {"name": "Contact 1", "email": "contact1@example.org"}, @@ -92,6 +102,7 @@ def test_e2e_ckan_to_dcat(self): {"start": "1905-03-01", "end": "2013-01-05"}, {"start": "2024-04-10", "end": "2024-05-29"}, ], + 
"temporal_resolution": ["PT15M", "P1D"], "spatial_coverage": [ { "geom": { @@ -123,12 +134,19 @@ def test_e2e_ckan_to_dcat(self): "centroid": {"type": "Point", "coordinates": [1.26639, 41.12386]}, } ], + "spatial_resolution_in_meters": [1.5, 2.0], "resources": [ { "name": "Resource 1", "description": "Some description", "url": "https://example.com/data.csv", "format": "CSV", + "availability": "http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL", + "compress_format": "http://www.iana.org/assignments/media-types/application/gzip", + "package_format": "http://publications.europa.eu/resource/authority/file-type/TAR", + "size": 12323, + "hash": "4304cf2e751e6053c90b1804c89c0ebb758f395a", + "hash_algorithm": "http://spdx.org/rdf/terms#checksumAlgorithm_sha1", "status": "published", "access_url": "https://example.com/data.csv", "download_url": "https://example.com/data.csv", @@ -214,6 +232,24 @@ def test_e2e_ckan_to_dcat(self): self._triples_list_values(g, dataset_ref, FOAF.page) == dataset["documentation"] ) + assert ( + self._triples_list_values(g, dataset_ref, DCAT.temporalResolution) + == dataset["temporal_resolution"] + ) + assert ( + self._triples_list_values(g, dataset_ref, DCT.isReferencedBy) + == dataset["is_referenced_by"] + ) + assert ( + self._triples_list_values(g, dataset_ref, DCATAP.applicableLegislation) + == dataset["applicable_legislation"] + ) + + # TODO: enable after validator + # assert ( + # self._triples_list_values(g, dataset_ref, DCAT.spatialResolutionInMeters) + # == dataset["spatial_resolution_in_meters"] + # ) # Repeating subfields @@ -318,38 +354,67 @@ def test_e2e_ckan_to_dcat(self): assert self._triple(g, spatial[0][2], LOCN.geometry, wkt_geom, GSP.wktLiteral) distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + resource = dataset_dict["resources"][0] # Resources: core fields - assert self._triple( - g, distribution_ref, DCT.title, dataset_dict["resources"][0]["name"] - ) + assert 
self._triple(g, distribution_ref, DCT.title, resource["name"]) assert self._triple( g, distribution_ref, DCT.description, - dataset_dict["resources"][0]["description"], + resource["description"], ) # Resources: standard fields + assert self._triple(g, distribution_ref, DCT.rights, resource["rights"]) + assert self._triple(g, distribution_ref, ADMS.status, resource["status"]) assert self._triple( - g, distribution_ref, DCT.rights, dataset_dict["resources"][0]["rights"] + g, + distribution_ref, + DCAT.accessURL, + URIRef(resource["access_url"]), ) assert self._triple( - g, distribution_ref, ADMS.status, dataset_dict["resources"][0]["status"] + g, + distribution_ref, + DCATAP.availability, + URIRef(resource["availability"]), ) assert self._triple( g, distribution_ref, - DCAT.accessURL, - URIRef(dataset_dict["resources"][0]["access_url"]), + DCAT.compressFormat, + URIRef(resource["compress_format"]), + ) + assert self._triple( + g, + distribution_ref, + DCAT.packageFormat, + URIRef(resource["package_format"]), ) assert self._triple( g, distribution_ref, DCAT.downloadURL, - URIRef(dataset_dict["resources"][0]["download_url"]), + URIRef(resource["download_url"]), + ) + + assert self._triple(g, distribution_ref, DCAT.byteSize, float(resource['size']), XSD.decimal) + # Checksum + checksum = self._triple(g, distribution_ref, SPDX.checksum, None)[2] + assert checksum + assert self._triple(g, checksum, RDF.type, SPDX.Checksum) + assert self._triple( + g, + checksum, + SPDX.checksumValue, + resource["hash"], + data_type="http://www.w3.org/2001/XMLSchema#hexBinary", + ) + assert self._triple( + g, checksum, SPDX.algorithm, URIRef(resource["hash_algorithm"]) ) # Resources: dates @@ -369,11 +434,10 @@ def test_e2e_ckan_to_dcat(self): ) # Resources: list fields - - language = [ - str(t[2]) for t in g.triples((distribution_ref, DCT.language, None)) - ] - assert language == dataset_dict["resources"][0]["language"] + assert ( + self._triples_list_values(g, distribution_ref, 
DCT.language) + == resource["language"] + ) # Resource: repeating subfields access_services = [ @@ -385,17 +449,14 @@ def test_e2e_ckan_to_dcat(self): g, access_services[0][2], DCT.title, - dataset_dict["resources"][0]["access_services"][0]["title"], + resource["access_services"][0]["title"], ) endpoint_urls = [ str(t[2]) for t in g.triples((access_services[0][2], DCAT.endpointURL, None)) ] - assert ( - endpoint_urls - == dataset_dict["resources"][0]["access_services"][0]["endpoint_url"] - ) + assert endpoint_urls == resource["access_services"][0]["endpoint_url"] def test_publisher_fallback_org(self): @@ -555,7 +616,18 @@ def test_e2e_dcat_to_ckan(self): "http://dataset.info.org/doc1", "http://dataset.info.org/doc2", ] - + assert sorted(dataset["temporal_resolution"]) == [ + "P1D", + "PT15M", + ] + assert sorted(dataset["is_referenced_by"]) == [ + "https://doi.org/10.1038/sdata.2018.22", + "test_isreferencedby", + ] + assert sorted(dataset["applicable_legislation"]) == [ + "http://data.europa.eu/eli/reg_impl/2023/138/oj", + "http://data.europa.eu/eli/reg_impl/2023/138/oj_alt", + ] # Repeating subfields assert dataset["contact"][0]["name"] == "Point of Contact" @@ -585,9 +657,24 @@ def test_e2e_dcat_to_ckan(self): assert resource["modified"] == "2012-05-01T00:04:06" assert resource["status"] == "http://purl.org/adms/status/Completed" assert resource["size"] == 12323 + assert ( + resource["availability"] + == "http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL" + ) + assert ( + resource["compress_format"] + == "http://www.iana.org/assignments/media-types/application/gzip" + ) + assert ( + resource["package_format"] + == "http://publications.europa.eu/resource/authority/file-type/TAR" + ) - # assert resource['hash'] == u'4304cf2e751e6053c90b1804c89c0ebb758f395a' - # assert resource['hash_algorithm'] == u'http://spdx.org/rdf/terms#checksumAlgorithm_sha1' + assert resource["hash"] == "4304cf2e751e6053c90b1804c89c0ebb758f395a" + assert ( + 
resource["hash_algorithm"] + == "http://spdx.org/rdf/terms#checksumAlgorithm_sha1" + ) assert ( resource["access_url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html" diff --git a/examples/dataset.rdf b/examples/dataset.rdf index 6b445dff..f7db02db 100644 --- a/examples/dataset.rdf +++ b/examples/dataset.rdf @@ -37,6 +37,8 @@ Standard 2 + 1.5 + 2.0 public @@ -50,6 +52,10 @@ + https://doi.org/10.1038/sdata.2018.22 + test_isreferencedby + + @@ -57,6 +63,8 @@ 2013-01-05 + PT15M + P1D Point of Contact @@ -80,9 +88,12 @@ Some statement about rights + http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL http://www.bgs.ac.uk/gbase/geochemcd/home.html HTML text/html + http://www.iana.org/assignments/media-types/application/gzip + http://publications.europa.eu/resource/authority/file-type/TAR 12323 From c6fc970319255045d6dc823ba780aca8d80ddcfb Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 4 Jun 2024 13:37:32 +0200 Subject: [PATCH 29/52] [#56] Add spatial_resolution_in_meters This required a new scheming_multiple_number validator, adapted from scheming_multiple_text --- ckanext/dcat/plugins/__init__.py | 6 ++ ckanext/dcat/schemas/dcat_ap_2.1.yaml | 9 ++- ckanext/dcat/tests/test_scheming_support.py | 4 -- ckanext/dcat/tests/test_validators.py | 62 +++++++++++++++++ ckanext/dcat/validators.py | 74 +++++++++++++++++++++ 5 files changed, 146 insertions(+), 9 deletions(-) create mode 100644 ckanext/dcat/tests/test_validators.py create mode 100644 ckanext/dcat/validators.py diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index e424059b..23e1424d 100644 --- a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -20,6 +20,7 @@ dcat_auth, ) from ckanext.dcat import utils +from ckanext.dcat.validators import dcat_validators CUSTOM_ENDPOINT_CONFIG = 'ckanext.dcat.catalog_endpoint' @@ -39,6 +40,7 @@ class DCATPlugin(p.SingletonPlugin, DefaultTranslation): p.implements(p.ITranslation, 
inherit=True) p.implements(p.IClick) p.implements(p.IBlueprint) + p.implements(p.IValidators) # IClick @@ -102,6 +104,10 @@ def get_auth_functions(self): 'dcat_catalog_search': dcat_auth, } + # IValidators + def get_validators(self): + return dcat_validators + # IPackageController # CKAN < 2.10 hooks diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index 96fa2cb4..2e19dc2d 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -144,11 +144,10 @@ dataset_fields: - field_name: centroid label: Centroid -#- field_name: spatial_resolution_in_meters -# label: Spatial resolution in meters -# preset: multiple_text -# validators: ignore_missing scheming_multiple_text -# TODO: scheming_multiple_decimal +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + preset: multiple_text + validators: ignore_missing scheming_multiple_number - field_name: access_rights label: Access rights diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index bce24665..8b2da04f 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -30,10 +30,6 @@ from ckanext.dcat.tests.utils import BaseSerializeTest, BaseParseTest -# TODO: tests for spatial coverage -# TODO: index "spatial" extra - - @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config( diff --git a/ckanext/dcat/tests/test_validators.py b/ckanext/dcat/tests/test_validators.py new file mode 100644 index 00000000..97edf17c --- /dev/null +++ b/ckanext/dcat/tests/test_validators.py @@ -0,0 +1,62 @@ +import json +import pytest + +from ckantoolkit import StopOnError +from ckanext.dcat.validators import scheming_multiple_number + + +def test_scheming_multiple_number(): + + expected_value = [1.5, 2.0, 0.345] + + key = ("some_number_field",) + 
errors = {key: []} + + values = [ + expected_value, + [1.5, 2, 0.345], + ["1.5", "2", ".345"], + ] + for value in values: + data = {key: value} + scheming_multiple_number({}, {})(key, data, errors, {}) + + assert data[key] == json.dumps(expected_value) + + +def test_scheming_multiple_number_single_value(): + + expected_value = [1.5] + + key = ("some_number_field",) + errors = {key: []} + + values = [ + expected_value, + 1.5, + "1.5", + ] + for value in values: + data = {key: value} + scheming_multiple_number({}, {})(key, data, errors, {}) + + assert data[key] == json.dumps(expected_value) + + +def test_scheming_multiple_number_wrong_value(): + + key = ("some_number_field",) + errors = {key: []} + + values = [ + ["a", 2, 0.345], + ["1..5", "2", ".345"], + ] + for value in values: + with pytest.raises(StopOnError): + data = {key: value} + scheming_multiple_number({}, {})(key, data, errors, {}) + + assert errors[key][0].startswith("invalid type for repeating number") + + errors = {key: []} diff --git a/ckanext/dcat/validators.py b/ckanext/dcat/validators.py new file mode 100644 index 00000000..9e3e110a --- /dev/null +++ b/ckanext/dcat/validators.py @@ -0,0 +1,74 @@ +import numbers +import json + +from ckantoolkit import ( + missing, + StopOnError, + _, +) +from ckanext.scheming.validation import scheming_validator + + +@scheming_validator +def scheming_multiple_number(field, schema): + """ + Accept repeating numbers input in the following forms and convert to a + json list of decimal values for storage. Also act like scheming_required + to check for at least one non-empty string when required is true: + + 1. a list of numbers, eg. + + [22, 1.3] + + 2. 
a single number value to allow single text fields to be + migrated to repeating numbers + + 33.4 + + """ + + def _scheming_multiple_number(key, data, errors, context): + # just in case there was an error before our validator, + # bail out here because our errors won't be useful + if errors[key]: + return + + value = data[key] + # 1. list of strings or 2. single string + if value is not missing: + if not isinstance(value, list): + try: + value = [float(value)] + except ValueError: + errors[key].append(_("expecting list of numbers")) + raise StopOnError + + out = [] + for element in value: + if not element: + continue + try: + element = float(element) + except ValueError: + errors[key].append( + _("invalid type for repeating number: %r") % element + ) + continue + + out.append(element) + + if errors[key]: + raise StopOnError + + data[key] = json.dumps(out) + + if (data[key] is missing or data[key] == "[]") and field.get("required"): + errors[key].append(_("Missing value")) + raise StopOnError + + return _scheming_multiple_number + + +dcat_validators = { + "scheming_multiple_number": scheming_multiple_number, +} From 99b4c893c49e65f1987f08b1e0cda017d45b8de1 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 4 Jun 2024 13:39:14 +0200 Subject: [PATCH 30/52] [#56] Review validators for resource fields --- ckanext/dcat/schemas/dcat_ap_2.1.yaml | 8 ++--- .../scheming/form_snippets/number.html | 16 +++++++++ ckanext/dcat/tests/test_scheming_support.py | 33 ++++++++++++++++++- 3 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 ckanext/dcat/templates/scheming/form_snippets/number.html diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index 2e19dc2d..e00426a1 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -228,23 +228,21 @@ resource_fields: - field_name: mimetype label: Media type - # TODO: get from format + validators: if_empty_guess_format ignore_missing 
unicode_safe - field_name: compress_format label: Compress format - # TODO: media type validator - field_name: package_format label: Package format - # TODO: media type validator - field_name: size label: Size - # TODO: number validator / snippet + validators: ignore_missing int_validator + form_snippet: number.html - field_name: hash label: Hash - # TODO: generate for uploads? - field_name: hash_algorithm label: Hash Algorithm diff --git a/ckanext/dcat/templates/scheming/form_snippets/number.html b/ckanext/dcat/templates/scheming/form_snippets/number.html new file mode 100644 index 00000000..476de3b7 --- /dev/null +++ b/ckanext/dcat/templates/scheming/form_snippets/number.html @@ -0,0 +1,16 @@ +{% import 'macros/form.html' as form %} +{% call form.input( + field.field_name, + id='field-' + field.field_name, + label=h.scheming_language_text(field.label), + placeholder=h.scheming_language_text(field.form_placeholder), + type='number', + value=(data.get(field.field_name) or '').split()[0], + error=errors[field.field_name], + classes=field.classes if 'classes' in field else ['control-medium'], + attrs=dict({"class": "form-control"}, **(field.get('form_attrs', {}))), + is_required=h.scheming_field_required(field) + ) +%} + {%- snippet 'scheming/form_snippets/help_text.html', field=field -%} +{% endcall %} diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 8b2da04f..2d66d716 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -397,7 +397,9 @@ def test_e2e_ckan_to_dcat(self): URIRef(resource["download_url"]), ) - assert self._triple(g, distribution_ref, DCAT.byteSize, float(resource['size']), XSD.decimal) + assert self._triple( + g, distribution_ref, DCAT.byteSize, float(resource["size"]), XSD.decimal + ) # Checksum checksum = self._triple(g, distribution_ref, SPDX.checksum, None)[2] assert checksum @@ -539,6 +541,35 @@ def test_legacy_fields(self): assert 
self._triple(g, publisher[0][2], FOAF.name, "Test Publisher") +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" +) +@pytest.mark.ckan_config("scheming.presets", "ckanext.scheming:presets.json") +@pytest.mark.ckan_config( + "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" +) +class TestSchemingValidators: + def test_mimetype_is_guessed(self): + dataset_dict = { + "name": "test-dataset-2", + "title": "Test DCAT dataset 2", + "notes": "Lorem ipsum", + "resources": [ + {"url": "https://example.org/data.csv"}, + {"url": "https://example.org/report.pdf"}, + ], + } + + dataset = call_action("package_create", **dataset_dict) + + assert sorted([r["mimetype"] for r in dataset["resources"]]) == [ + "application/pdf", + "text/csv", + ] + + @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config( From 1790404d81bdbadd5168b6407f09d107abbc859c Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 4 Jun 2024 15:44:48 +0200 Subject: [PATCH 31/52] [#56] Fix spatial_resolution validators --- ckanext/dcat/tests/test_scheming_support.py | 15 ++++++---- ckanext/dcat/tests/utils.py | 32 ++++++++++++--------- ckanext/dcat/validators.py | 20 ++++++++----- 3 files changed, 42 insertions(+), 25 deletions(-) diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 2d66d716..d9ec2efc 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -241,11 +241,12 @@ def test_e2e_ckan_to_dcat(self): == dataset["applicable_legislation"] ) - # TODO: enable after validator - # assert ( - # self._triples_list_values(g, dataset_ref, DCAT.spatialResolutionInMeters) - # == dataset["spatial_resolution_in_meters"] - # ) + assert ( + 
self._triples_list_python_values( + g, dataset_ref, DCAT.spatialResolutionInMeters + ) + == dataset["spatial_resolution_in_meters"] + ) # Repeating subfields @@ -647,6 +648,10 @@ def test_e2e_dcat_to_ckan(self): "P1D", "PT15M", ] + assert sorted(dataset["spatial_resolution_in_meters"]) == [ + 1.5, + 2.0, + ] assert sorted(dataset["is_referenced_by"]) == [ "https://doi.org/10.1038/sdata.2018.22", "test_isreferencedby", diff --git a/ckanext/dcat/tests/utils.py b/ckanext/dcat/tests/utils.py index 8c0e8a18..53618366 100644 --- a/ckanext/dcat/tests/utils.py +++ b/ckanext/dcat/tests/utils.py @@ -4,32 +4,32 @@ class BaseParseTest(object): - def _extras(self, dataset): extras = {} - for extra in dataset.get('extras'): - extras[extra['key']] = extra['value'] + for extra in dataset.get("extras"): + extras[extra["key"]] = extra["value"] return extras def _get_file_contents(self, file_name): - path = os.path.join(os.path.dirname(__file__), - '..', '..', '..', 'examples', - file_name) - with open(path, 'r') as f: + path = os.path.join( + os.path.dirname(__file__), "..", "..", "..", "examples", file_name + ) + with open(path, "r") as f: return f.read() class BaseSerializeTest(object): - def _extras(self, dataset): extras = {} - for extra in dataset.get('extras'): - extras[extra['key']] = extra['value'] + for extra in dataset.get("extras"): + extras[extra["key"]] = extra["value"] return extras def _triples(self, graph, subject, predicate, _object, data_type=None): - if not (isinstance(_object, URIRef) or isinstance(_object, BNode) or _object is None): + if not ( + isinstance(_object, URIRef) or isinstance(_object, BNode) or _object is None + ): if data_type: _object = Literal(_object, datatype=data_type) else: @@ -42,7 +42,13 @@ def _triple(self, graph, subject, predicate, _object, data_type=None): return triples[0] if triples else None def _triples_list_values(self, graph, subject, predicate): - return [str(t[2]) for t in graph.triples((subject, predicate, None))] + return 
[str(t[2]) for t in graph.triples((subject, predicate, None))] + + def _triples_list_python_values(self, graph, subject, predicate): + return [ + t[2].value if isinstance(t[2], Literal) else str(t[2]) + for t in graph.triples((subject, predicate, None)) + ] def _get_typed_list(self, list, datatype): """ returns the list with the given rdf type """ @@ -51,6 +57,6 @@ def _get_typed_list(self, list, datatype): def _get_dict_from_list(self, dict_list, key, value): """ returns the dict with the given key-value """ for dict in dict_list: - if(dict.get(key) == value): + if dict.get(key) == value: return dict return None diff --git a/ckanext/dcat/validators.py b/ckanext/dcat/validators.py index 9e3e110a..4db20cdb 100644 --- a/ckanext/dcat/validators.py +++ b/ckanext/dcat/validators.py @@ -1,4 +1,3 @@ -import numbers import json from ckantoolkit import ( @@ -34,14 +33,21 @@ def _scheming_multiple_number(key, data, errors, context): return value = data[key] - # 1. list of strings or 2. single string if value is not missing: + if not isinstance(value, list): - try: - value = [float(value)] - except ValueError: - errors[key].append(_("expecting list of numbers")) - raise StopOnError + if isinstance(value, str) and value.startswith("["): + try: + value = json.loads(value) + except ValueError: + errors[key].append(_("Could not parse value")) + raise StopOnError + else: + try: + value = [float(value)] + except ValueError: + errors[key].append(_("expecting list of numbers")) + raise StopOnError out = [] for element in value: From 73523d6abca139d21b518076f525eea3304e8157 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 6 Jun 2024 12:14:27 +0200 Subject: [PATCH 32/52] [#56] Don't mess with field keys if using scheming --- ckanext/dcat/plugins/__init__.py | 33 +++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index 23e1424d..087eb2f7 100644 --- 
a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -30,6 +30,19 @@ I18N_DIR = os.path.join(HERE, u"../i18n") +def _get_dataset_schema(dataset_type="dataset"): + schema = None + try: + schema_show = p.toolkit.get_action("scheming_dataset_schema_show") + try: + schema = schema_show({}, {"type": dataset_type}) + except p.toolkit.ObjectNotFound: + pass + except KeyError: + pass + return schema + + class DCATPlugin(p.SingletonPlugin, DefaultTranslation): p.implements(p.IConfigurer, inherit=True) @@ -120,8 +133,15 @@ def before_index(self, dataset_dict): # CKAN >= 2.10 hooks def after_dataset_show(self, context, data_dict): + schema = _get_dataset_schema(data_dict["type"]) # check if config is enabled to translate keys (default: True) - if not p.toolkit.asbool(config.get(TRANSLATE_KEYS_CONFIG, True)): + # skip if scheming is enabled, as this will be handled there + translate_keys = ( + p.toolkit.asbool(config.get(TRANSLATE_KEYS_CONFIG, True)) + and not schema + ) + + if not translate_keys: return data_dict if context.get('for_view'): @@ -143,16 +163,7 @@ def set_titles(object_dict): return data_dict def before_dataset_index(self, dataset_dict): - schema = None - try: - schema_show = p.toolkit.get_action("scheming_dataset_schema_show") - try: - schema = schema_show({}, {"type": dataset_dict["type"]}) - except p.toolkit.ObjectNotFound: - pass - except KeyError: - pass - + schema = _get_dataset_schema(dataset_dict["type"]) spatial = None if schema: for field in schema['dataset_fields']: From d456c00d1e669505870f43c91aa754dc6f4dc3d9 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 6 Jun 2024 12:16:20 +0200 Subject: [PATCH 33/52] [#56] Display snippets for file size, markdown --- ckanext/dcat/schemas/dcat_ap_2.1.yaml | 4 ++++ .../dcat/templates/scheming/display_snippets/file_size.html | 1 + 2 files changed, 5 insertions(+) create mode 100644 ckanext/dcat/templates/scheming/display_snippets/file_size.html diff --git 
a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index e00426a1..5ff95fab 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -153,11 +153,13 @@ dataset_fields: label: Access rights validators: ignore_missing unicode_safe form_snippet: markdown.html + display_snippet: markdown.html - field_name: version_notes label: Version notes validators: ignore_missing unicode_safe form_snippet: markdown.html + display_snippet: markdown.html - field_name: alternate_identifier label: Alternate identifier @@ -240,6 +242,7 @@ resource_fields: label: Size validators: ignore_missing int_validator form_snippet: number.html + display_snippet: file_size.html - field_name: hash label: Hash @@ -250,6 +253,7 @@ resource_fields: - field_name: rights label: Rights form_snippet: markdown.html + display_snippet: markdown.html form_placeholder: Some statement about the rights associated with the resource - field_name: availability diff --git a/ckanext/dcat/templates/scheming/display_snippets/file_size.html b/ckanext/dcat/templates/scheming/display_snippets/file_size.html new file mode 100644 index 00000000..ca7e5057 --- /dev/null +++ b/ckanext/dcat/templates/scheming/display_snippets/file_size.html @@ -0,0 +1 @@ +{{ h.localised_filesize(data[field.field_name]) }} From b1e17183d40d93c3cb35ea5cc5fc82e5529c12b2 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 6 Jun 2024 12:32:26 +0200 Subject: [PATCH 34/52] [#56] Common preset for DCAT date-based fields Support at the validator level for year, year-month, date and datetime values, which are correctly typed in the RDF serialization. At the UI level a date input is used by default as it was difficult to provide one that supported all inputs. 
--- ckanext/dcat/profiles/base.py | 30 +++++--- ckanext/dcat/schemas/dcat_ap_2.1.yaml | 13 ++-- ckanext/dcat/schemas/presets.yaml | 12 ++++ .../scheming/display_snippets/dcat_date.html | 4 ++ .../test_euro_dcatap_profile_serialize.py | 24 +++++++ ckanext/dcat/tests/test_scheming_support.py | 72 +++++++++++++++++-- ckanext/dcat/validators.py | 54 ++++++++++++++ 7 files changed, 189 insertions(+), 20 deletions(-) create mode 100644 ckanext/dcat/schemas/presets.yaml create mode 100644 ckanext/dcat/templates/scheming/display_snippets/dcat_date.html diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 36b44c00..a2eddd71 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -11,6 +11,7 @@ from ckan.model.license import LicenseRegister from ckan.lib.helpers import resource_formats from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS +from ckanext.dcat.validators import is_year, is_year_month, is_date DCT = Namespace("http://purl.org/dc/terms/") DCAT = Namespace("http://www.w3.org/ns/dcat#") @@ -934,22 +935,31 @@ def _add_date_triple(self, subject, predicate, value, _type=Literal): """ Adds a new triple with a date object - Dates are parsed using dateutil, and if the date obtained is correct, - added to the graph as an XSD.dateTime value. + If the value is one of xsd:gYear, xsd:gYearMonth or xsd:date it is typed accordingly. If not, + the value will be parsed using dateutil, and if the date obtained is correct, added to the graph as an xsd:dateTime value. If there are parsing errors, the literal string value is added. 
""" if not value: return - try: - default_datetime = datetime.datetime(1, 1, 1, 0, 0, 0) - _date = parse_date(value, default=default_datetime) - self.g.add( - (subject, predicate, _type(_date.isoformat(), datatype=XSD.dateTime)) - ) - except ValueError: - self.g.add((subject, predicate, _type(value))) + if is_year(value): + self.g.add((subject, predicate, _type(value, datatype=XSD.gYear))) + elif is_year_month(value): + self.g.add((subject, predicate, _type(value, datatype=XSD.gYearMonth))) + elif is_date(value): + self.g.add((subject, predicate, _type(value, datatype=XSD.date))) + else: + try: + default_datetime = datetime.datetime(1, 1, 1, 0, 0, 0) + _date = parse_date(value, default=default_datetime) + + self.g.add( + (subject, predicate, _type(_date.isoformat(), datatype=XSD.dateTime)) + ) + except ValueError: + self.g.add((subject, predicate, _type(value))) def _last_catalog_modification(self): """ diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index 5ff95fab..350e359c 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -88,12 +88,13 @@ dataset_fields: # Note: this will fall back to metadata_created if not present - field_name: issued label: Release date - # TODO: dcat_date preset + preset: dcat_date + # Note: this will fall back to metadata_modified if not present - field_name: modified label: Modification date - # TODO: dcat_date preset + preset: dcat_date - field_name: identifier label: Identifier @@ -114,11 +115,11 @@ dataset_fields: - field_name: start label: Start - # TODO: dcat_date preset + preset: dcat_date - field_name: end label: End - # TODO: dcat_date preset + preset: dcat_date - field_name: temporal_resolution label: Temporal resolution @@ -275,11 +276,11 @@ resource_fields: - field_name: issued label: Release date - # TODO: dcat_date preset + preset: dcat_date - field_name: modified label: Modification date - # TODO: dcat_date preset + preset: dcat_date - 
field_name: language label: Language diff --git a/ckanext/dcat/schemas/presets.yaml b/ckanext/dcat/schemas/presets.yaml new file mode 100644 index 00000000..88be7b0c --- /dev/null +++ b/ckanext/dcat/schemas/presets.yaml @@ -0,0 +1,12 @@ +scheming_presets_version: 1 +about: Presets for the ckanext-dcat extension +about_url: "http://github.com/ckan/ckanext-dcat" + +presets: + +- preset_name: dcat_date + values: + # Note: use datetime.html or datetime_tz.html if you want to include an input for time + form_snippet: date.html + display_snippet: dcat_date.html + validators: ignore_missing dcat_date convert_to_json_if_datetime diff --git a/ckanext/dcat/templates/scheming/display_snippets/dcat_date.html b/ckanext/dcat/templates/scheming/display_snippets/dcat_date.html new file mode 100644 index 00000000..3e7f7ec6 --- /dev/null +++ b/ckanext/dcat/templates/scheming/display_snippets/dcat_date.html @@ -0,0 +1,4 @@ +{{ h.render_datetime(data[field.field_name]) }} + +{# Use the following if you want to include the time as well #} +{# h.render_datetime(data[field.field_name], with_hours=True) #} diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py index 1a389df4..bfed0160 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py +++ b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py @@ -1128,6 +1128,30 @@ def test_hash_algorithm_not_uri(self): assert self._triple(g, checksum, SPDX.checksumValue, resource['hash'], data_type='http://www.w3.org/2001/XMLSchema#hexBinary') assert self._triple(g, checksum, SPDX.algorithm, resource['hash_algorithm']) + @pytest.mark.parametrize("value,data_type", [ + ("2024", XSD.gYear), + ("2024-05", XSD.gYearMonth), + ("2024-05-31", XSD.date), + ("2024-05-31T00:00:00", XSD.dateTime), + ("2024-05-31T12:30:01", XSD.dateTime), + ("2024-05-31T12:30:01.451243", XSD.dateTime), + ]) + def test_dates_data_types(self, value, data_type): + dataset = { + 'id': 
'4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'issued': value, + } + + s = RDFSerializer(profiles=['euro_dcat_ap']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + assert str(self._triple(g, dataset_ref, DCT.issued, None)[2]) == value + assert self._triple(g, dataset_ref, DCT.issued, None)[2].datatype == data_type + class TestEuroDCATAPProfileSerializeCatalog(BaseSerializeTest): diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index d9ec2efc..38e4da4d 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -35,7 +35,10 @@ @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" ) -@pytest.mark.ckan_config("scheming.presets", "ckanext.scheming:presets.json") +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", +) @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) @@ -547,7 +550,10 @@ def test_legacy_fields(self): @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" ) -@pytest.mark.ckan_config("scheming.presets", "ckanext.scheming:presets.json") +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", +) @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) @@ -576,7 +582,10 @@ def test_mimetype_is_guessed(self): @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" ) -@pytest.mark.ckan_config("scheming.presets", "ckanext.scheming:presets.json") +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", +) @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) @@ -733,7 
+742,10 @@ def test_e2e_dcat_to_ckan(self): @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" ) -@pytest.mark.ckan_config("scheming.presets", "ckanext.scheming:presets.json") +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", +) @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) @@ -801,3 +813,55 @@ def test_spatial_field(self): assert search_dict["spatial"] == json.dumps( dataset_dict["spatial_coverage"][0]["centroid"] ) + + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" +) +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", +) +@pytest.mark.ckan_config( + "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" +) +class TestSchemingPresets: + def test_dcat_date(self): + dataset_dict = { + # Core fields + "name": "test-dataset", + "title": "Test DCAT dataset", + "notes": "Some notes", + "issued": "2024", + "modified": "2024-10", + "temporal_coverage": [ + {"start": "1905-03-01T10:07:31.182680", "end": "2013-01-05"}, + {"start": "2024-04-10T10:07:31", "end": "2024-05-29"}, + ], + } + + dataset = call_action("package_create", **dataset_dict) + + # Year + assert dataset["issued"] == dataset_dict["issued"] + + # Year-month + assert dataset["modified"] == dataset_dict["modified"] + + # Date + assert ( + dataset["temporal_coverage"][0]["end"] + == dataset_dict["temporal_coverage"][0]["end"] + ) + + # Datetime + assert ( + dataset["temporal_coverage"][0]["start"] + == dataset_dict["temporal_coverage"][0]["start"] + ) + assert ( + dataset["temporal_coverage"][1]["start"] + == dataset_dict["temporal_coverage"][1]["start"] + ) diff --git a/ckanext/dcat/validators.py 
b/ckanext/dcat/validators.py index 4db20cdb..6dae17af 100644 --- a/ckanext/dcat/validators.py +++ b/ckanext/dcat/validators.py @@ -1,12 +1,65 @@ +import datetime import json +import re from ckantoolkit import ( missing, StopOnError, + get_validator, + Invalid, _, ) from ckanext.scheming.validation import scheming_validator +# https://www.w3.org/TR/xmlschema11-2/#gYear +regexp_xsd_year = re.compile( + "-?([1-9][0-9]{3,}|0[0-9]{3})(Z|(\+|-)((0[0-9]|1[0-3]):[0-5][0-9]|14:00))?" +) + +# https://www.w3.org/TR/xmlschema11-2/#gYearMonth +regexp_xsd_year_month = re.compile( + "-?([1-9][0-9]{3,}|0[0-9]{3})-(0[1-9]|1[0-2])(Z|(\+|-)((0[0-9]|1[0-3]):[0-5][0-9]|14:00))?" +) + +regexp_xsd_date = re.compile( + "-?([1-9][0-9]{3,}|0[0-9]{3})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])(Z|(\+|-)((0[0-9]|1[0-3]):[0-5][0-9]|14:00))?" +) + + +def is_year(value): + return regexp_xsd_year.fullmatch(value) + + +def is_year_month(value): + return regexp_xsd_year_month.fullmatch(value) + + +def is_date(value): + return regexp_xsd_date.fullmatch(value) + + +def dcat_date(key, data, errors, context): + value = data[key] + + scheming_isodatetime = get_validator("scheming_isodatetime") + + if isinstance(value, datetime.datetime): + return + + if is_year(value) or is_year_month(value) or is_date(value): + return + + try: + scheming_isodatetime({}, {})(key, data, errors, context) + except Invalid: + raise Invalid( + _( + "Date format incorrect. 
Supported formats are YYYY, YYYY-MM, YYYY-MM-DD and YYYY-MM-DDTHH:MM:SS" + ) + ) + + return value + @scheming_validator def scheming_multiple_number(field, schema): @@ -77,4 +130,5 @@ def _scheming_multiple_number(key, data, errors, context): dcat_validators = { "scheming_multiple_number": scheming_multiple_number, + "dcat_date": dcat_date, } From 209fda56206b245ced3eee149dd843eb556166b7 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 6 Jun 2024 12:56:59 +0200 Subject: [PATCH 35/52] [#56] Fix dates tests --- .../test_euro_dcatap_2_profile_serialize.py | 4 ++-- .../test_euro_dcatap_profile_serialize.py | 4 ++-- ckanext/dcat/tests/test_scheming_support.py | 24 +++++++++---------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py index 114dc602..abf80363 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py +++ b/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py @@ -298,13 +298,13 @@ def test_temporal(self): for predicate in [SCHEMA.startDate, DCAT.startDate]: triples = [] for temporal_obj in temporal_obj_list: - triples.extend(self._triples(g, temporal_obj, predicate, parse_date(extras['temporal_start']).isoformat(), XSD.dateTime)) + triples.extend(self._triples(g, temporal_obj, predicate, extras['temporal_start'], XSD.dateTime)) assert len(triples) == 1 for predicate in [SCHEMA.endDate, DCAT.endDate]: triples = [] for temporal_obj in temporal_obj_list: - triples.extend(self._triples(g, temporal_obj, predicate, parse_date(extras['temporal_end']).isoformat(), XSD.dateTime)) + triples.extend(self._triples(g, temporal_obj, predicate, extras['temporal_end'], XSD.date)) assert len(triples) == 1 def test_high_value_datasets(self): diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py index bfed0160..edec0c5a 100644 --- 
a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py +++ b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py @@ -503,8 +503,8 @@ def test_temporal(self): assert temporal assert self._triple(g, temporal, RDF.type, DCT.PeriodOfTime) - assert self._triple(g, temporal, SCHEMA.startDate, parse_date(extras['temporal_start']).isoformat(), XSD.dateTime) - assert self._triple(g, temporal, SCHEMA.endDate, parse_date(extras['temporal_end']).isoformat(), XSD.dateTime) + assert self._triple(g, temporal, SCHEMA.startDate, extras['temporal_start'], XSD.dateTime) + assert self._triple(g, temporal, SCHEMA.endDate, extras['temporal_end'], XSD.date) def test_spatial(self): dataset = { diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 38e4da4d..9fd124a5 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -201,15 +201,15 @@ def test_e2e_ckan_to_dcat(self): g, dataset_ref, DCT.issued, - dataset["issued"] + "T00:00:00", - data_type=XSD.dateTime, + dataset["issued"], + data_type=XSD.date, ) assert self._triple( g, dataset_ref, DCT.modified, - dataset["modified"] + "T00:00:00", - data_type=XSD.dateTime, + dataset["modified"], + data_type=XSD.date, ) # List fields @@ -307,29 +307,29 @@ def test_e2e_ckan_to_dcat(self): g, temporal[0][2], SCHEMA.startDate, - dataset_dict["temporal_coverage"][0]["start"] + "T00:00:00", - data_type=XSD.dateTime, + dataset_dict["temporal_coverage"][0]["start"], + data_type=XSD.date, ) assert self._triple( g, temporal[0][2], SCHEMA.endDate, - dataset_dict["temporal_coverage"][0]["end"] + "T00:00:00", - data_type=XSD.dateTime, + dataset_dict["temporal_coverage"][0]["end"], + data_type=XSD.date, ) assert self._triple( g, temporal[1][2], SCHEMA.startDate, - dataset_dict["temporal_coverage"][1]["start"] + "T00:00:00", - data_type=XSD.dateTime, + dataset_dict["temporal_coverage"][1]["start"], + data_type=XSD.date, ) assert self._triple( g, 
temporal[1][2], SCHEMA.endDate, - dataset_dict["temporal_coverage"][1]["end"] + "T00:00:00", - data_type=XSD.dateTime, + dataset_dict["temporal_coverage"][1]["end"], + data_type=XSD.date, ) spatial = [t for t in g.triples((dataset_ref, DCT.spatial, None))] From 634ff52819d09096601010ada578ba96a7390b96 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 6 Jun 2024 13:12:54 +0200 Subject: [PATCH 36/52] [#56] Fix number form snippet --- ckanext/dcat/templates/scheming/form_snippets/number.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcat/templates/scheming/form_snippets/number.html b/ckanext/dcat/templates/scheming/form_snippets/number.html index 476de3b7..bed99336 100644 --- a/ckanext/dcat/templates/scheming/form_snippets/number.html +++ b/ckanext/dcat/templates/scheming/form_snippets/number.html @@ -5,7 +5,7 @@ label=h.scheming_language_text(field.label), placeholder=h.scheming_language_text(field.form_placeholder), type='number', - value=(data.get(field.field_name) or '').split()[0], + value=data.get(field.field_name), error=errors[field.field_name], classes=field.classes if 'classes' in field else ['control-medium'], attrs=dict({"class": "form-control"}, **(field.get('form_attrs', {}))), From 8b78139dc2f004ffa72590e6d39846b336daf687 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 6 Jun 2024 14:58:13 +0200 Subject: [PATCH 37/52] [#56] Help texts for all fields in the schema Mostly taken from the DCAT-AP 2.1 spec doc, adapted for CKAN --- ckanext/dcat/schemas/dcat_ap_2.1.yaml | 100 +++++++++++++++++++------- 1 file changed, 73 insertions(+), 27 deletions(-) diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index 350e359c..d11b95d9 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -9,7 +9,7 @@ dataset_fields: label: Title preset: title required: true - form_placeholder: eg. A descriptive title + help_text: A descriptive title for the dataset. 
- field_name: name label: URL @@ -20,12 +20,13 @@ dataset_fields: label: Description required: true form_snippet: markdown.html - form_placeholder: eg. Some useful notes about the data + help_text: A free-text account of the dataset. - field_name: tag_string label: Keywords preset: tag_string_autocomplete form_placeholder: eg. economy, mental health, government + help_text: Keywords or tags describing the dataset. Use commas to separate multiple values. - field_name: contact label: Contact points @@ -41,6 +42,7 @@ dataset_fields: - field_name: email label: Email display_snippet: email.html + help_text: Contact information for enquiries about the dataset. - field_name: publisher label: Publisher @@ -64,49 +66,64 @@ dataset_fields: - field_name: type label: Type + help_text: Entity responsible for making the dataset available. - field_name: license_id label: License form_snippet: license.html - help_text: License definitions and additional information can be found at http://opendefinition.org/ + help_text: License definitions and additional information can be found at http://opendefinition.org/. - field_name: owner_org label: Organization preset: dataset_organization + help_text: The CKAN organization the dataset belongs to. - field_name: url - label: Source + label: Landing page form_placeholder: http://example.com/dataset.json - display_property: foaf:homepage display_snippet: link.html + help_text: Web page that can be navigated to gain access to the dataset, its distributions and/or additional information. -- field_name: version - label: Version - validators: ignore_missing unicode_safe package_version_validator - form_placeholder: '1.0' - -# Note: this will fall back to metadata_created if not present + # Note: this will fall back to metadata_created if not present - field_name: issued label: Release date preset: dcat_date + help_text: Date of publication of the dataset. 
- -# Note: this will fall back to metadata_modified if not present + # Note: this will fall back to metadata_modified if not present - field_name: modified label: Modification date preset: dcat_date + help_text: Most recent date on which the dataset was changed, updated or modified. + +- field_name: version + label: Version + validators: ignore_missing unicode_safe package_version_validator + help_text: Version number or other version designation of the dataset. +- field_name: version_notes + label: Version notes + validators: ignore_missing unicode_safe + form_snippet: markdown.html + display_snippet: markdown.html + help_text: A description of the differences between this version and a previous version of the dataset. + + # Note: CKAN will generate a unique identifier for each dataset - field_name: identifier label: Identifier + help_text: A unique identifier of the dataset. - field_name: frequency label: Frequency + help_text: The frequency at which dataset is published. - field_name: provenance label: Provenance + help_text: A statement about the lineage of the dataset. - field_name: dcat_type label: Type + help_text: The type of the dataset. # TODO: controlled vocabulary? - field_name: temporal_coverage @@ -120,11 +137,13 @@ dataset_fields: - field_name: end label: End preset: dcat_date + help_text: The temporal period or periods the dataset covers. - field_name: temporal_resolution label: Temporal resolution preset: multiple_text validators: ignore_missing scheming_multiple_text + help_text: Minimum time period resolvable in the dataset. - field_name: spatial_coverage label: Spatial coverage @@ -144,59 +163,63 @@ dataset_fields: - field_name: centroid label: Centroid + help_text: A geographic region that is covered by the dataset. 
- field_name: spatial_resolution_in_meters label: Spatial resolution in meters preset: multiple_text validators: ignore_missing scheming_multiple_number + help_text: Minimum spatial separation resolvable in a dataset, measured in meters. - field_name: access_rights label: Access rights validators: ignore_missing unicode_safe form_snippet: markdown.html display_snippet: markdown.html - -- field_name: version_notes - label: Version notes - validators: ignore_missing unicode_safe - form_snippet: markdown.html - display_snippet: markdown.html + help_text: Information that indicates whether the dataset is Open Data, has access restrictions or is not public. - field_name: alternate_identifier - label: Alternate identifier + label: Other identifier preset: multiple_text validators: ignore_missing scheming_multiple_text + help_text: This property refers to a secondary identifier of the dataset, such as MAST/ADS, DataCite, DOI, etc. - field_name: theme label: Theme preset: multiple_text validators: ignore_missing scheming_multiple_text + help_text: A category of the dataset. A Dataset may be associated with multiple themes. - field_name: language label: Language preset: multiple_text validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the dataset. # TODO: language form snippet / validator / graph - field_name: documentation label: Documentation preset: multiple_text validators: ignore_missing scheming_multiple_text + help_text: A page or document about this dataset. - field_name: conforms_to label: Conforms to preset: multiple_text validators: ignore_missing scheming_multiple_text + help_text: An implementing rule or other specification that the dataset follows. - field_name: is_referenced_by label: Is referenced by preset: multiple_text validators: ignore_missing scheming_multiple_text + help_text: A related resource, such as a publication, that references, cites, or otherwise points to the dataset. 
- field_name: applicable_legislation label: Applicable legislation preset: multiple_text validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the dataset. #- field_name: hvd_category # label: HVD Category @@ -204,12 +227,13 @@ dataset_fields: # validators: ignore_missing scheming_multiple_text # TODO: implement separately as part of wider HVD support - # Note: if not provided, this will be autogenerated - field_name: uri label: URI + help_text: An URI for this dataset (if not provided it will be autogenerated). # TODO: relation-based properties are not yet included (e.g. is_version_of, source, sample, etc) +# resource_fields: - field_name: url @@ -218,89 +242,109 @@ resource_fields: - field_name: name label: Name - form_placeholder: eg. January 2011 Gold Prices + form_placeholder: + help_text: A descriptive title for the resource. - field_name: description label: Description form_snippet: markdown.html - form_placeholder: Some useful notes about the data + help_text: A free-text account of the resource. - field_name: format label: Format preset: resource_format_autocomplete + help_text: File format. If not provided it will be guessed. - field_name: mimetype label: Media type validators: if_empty_guess_format ignore_missing unicode_safe + help_text: Media type for this format. If not provided it will be guessed. - field_name: compress_format label: Compress format + help_text: The format of the file in which the data is contained in a compressed form. - field_name: package_format label: Package format + help_text: The format of the file in which one or more data files are grouped together. - field_name: size label: Size validators: ignore_missing int_validator form_snippet: number.html display_snippet: file_size.html + help_text: File size in bytes - field_name: hash label: Hash + help_text: Checksum of the downloaded file. 
- field_name: hash_algorithm label: Hash Algorithm + help_text: Algorithm used to calculate the checksum. - field_name: rights label: Rights form_snippet: markdown.html display_snippet: markdown.html - form_placeholder: Some statement about the rights associated with the resource + help_text: Some statement about the rights associated with the resource. - field_name: availability label: Availability + help_text: Indicates how long it is planned to keep the resource available. - field_name: status label: Status + help_text: The status of the distribution in the context of maturity lifecycle. + # TODO: choices - field_name: license label: License + help_text: License in which the resource is made available. If not provided will be inherited from the dataset. -# Note: this falls back to the standard resource url field + # Note: this falls back to the standard resource url field - field_name: access_url label: Access URL + help_text: URL that gives access to the dataset (defaults to the standard resource URL). -# Note: this falls back to the standard resource url field + # Note: this falls back to the standard resource url field - field_name: download_url label: Download URL + help_text: URL that provides a direct link to a downloadable file (defaults to the standard resource URL). - field_name: issued label: Release date preset: dcat_date + help_text: Date of publication of the resource. - field_name: modified label: Modification date preset: dcat_date + help_text: Most recent date on which the resource was changed, updated or modified. - field_name: language label: Language preset: multiple_text validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the resource. - field_name: documentation label: Documentation preset: multiple_text validators: ignore_missing scheming_multiple_text + help_text: A page or document about this resource.
- field_name: conforms_to label: Conforms to preset: multiple_text validators: ignore_missing scheming_multiple_text + help_text: An established schema to which the described resource conforms. - field_name: applicable_legislation label: Applicable legislation preset: multiple_text validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the resource. - field_name: access_services label: Access services @@ -316,7 +360,9 @@ resource_fields: - field_name: endpoint_url label: Endpoint URL preset: multiple_text + help_text: A data service that gives access to the resource. -# Note: if not provided, this will be autogenerated + # Note: if not provided, this will be autogenerated - field_name: uri label: URI + help_text: An URI for this resource (if not provided it will be autogenerated). From 15b0cc13c537b2d23c6e6403d94c87a8164fea81 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 6 Jun 2024 17:10:19 +0200 Subject: [PATCH 38/52] [#56] Use choices for resource status --- ckanext/dcat/schemas/dcat_ap_2.1.yaml | 13 +++++++++++-- ckanext/dcat/tests/test_scheming_support.py | 4 ++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index d11b95d9..051890ee 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -295,8 +295,17 @@ resource_fields: - field_name: status label: Status - help_text: The status of the distribution in the context of maturity lifecycle. - # TODO: choices + preset: select + choices: + - value: http://purl.org/adms/status/Completed + label: Completed + - value: http://purl.org/adms/status/UnderDevelopment + label: Under Development + - value: http://purl.org/adms/status/Deprecated + label: Deprecated + - value: http://purl.org/adms/status/Withdrawn + label: Withdrawn + help_text: The status of the resource in the context of maturity lifecycle. 
- field_name: license label: License diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 9fd124a5..c4e7738e 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -146,7 +146,7 @@ def test_e2e_ckan_to_dcat(self): "size": 12323, "hash": "4304cf2e751e6053c90b1804c89c0ebb758f395a", "hash_algorithm": "http://spdx.org/rdf/terms#checksumAlgorithm_sha1", - "status": "published", + "status": "http://purl.org/adms/status/Completed", "access_url": "https://example.com/data.csv", "download_url": "https://example.com/data.csv", "issued": "2024-05-01T01:20:33", @@ -369,7 +369,7 @@ def test_e2e_ckan_to_dcat(self): # Resources: standard fields assert self._triple(g, distribution_ref, DCT.rights, resource["rights"]) - assert self._triple(g, distribution_ref, ADMS.status, resource["status"]) + assert self._triple(g, distribution_ref, ADMS.status, URIRef(resource["status"])) assert self._triple( g, distribution_ref, From 602d505f12cf7b82447a3b7a45bac015cab13c7b Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 10 Jun 2024 17:07:59 +0200 Subject: [PATCH 39/52] [#56] Create a full and a slimmed down schema version --- ...dcat_ap_2.1.yaml => dcat_ap_2.1_full.yaml} | 0 .../dcat/schemas/dcat_ap_2.1_recommended.yaml | 147 ++++++++++++++++++ ckanext/dcat/tests/test_scheming_support.py | 10 +- 3 files changed, 152 insertions(+), 5 deletions(-) rename ckanext/dcat/schemas/{dcat_ap_2.1.yaml => dcat_ap_2.1_full.yaml} (100%) create mode 100644 ckanext/dcat/schemas/dcat_ap_2.1_recommended.yaml diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml similarity index 100% rename from ckanext/dcat/schemas/dcat_ap_2.1.yaml rename to ckanext/dcat/schemas/dcat_ap_2.1_full.yaml diff --git a/ckanext/dcat/schemas/dcat_ap_2.1_recommended.yaml b/ckanext/dcat/schemas/dcat_ap_2.1_recommended.yaml new file mode 100644 index 00000000..a8b5bf1e --- /dev/null 
+++ b/ckanext/dcat/schemas/dcat_ap_2.1_recommended.yaml @@ -0,0 +1,147 @@ +scheming_version: 2 +dataset_type: dataset +about: A reimplementation of the default CKAN dataset schema +about_url: http://github.com/ckan/ckanext-dcat + +dataset_fields: + +- field_name: title + label: Title + preset: title + required: true + help_text: A descriptive title for the dataset. + +- field_name: name + label: URL + preset: dataset_slug + form_placeholder: eg. my-dataset + +- field_name: notes + label: Description + required: true + form_snippet: markdown.html + help_text: A free-text account of the dataset. + +- field_name: tag_string + label: Keywords + preset: tag_string_autocomplete + form_placeholder: eg. economy, mental health, government + help_text: Keywords or tags describing the dataset. Use commas to separate multiple values. + +- field_name: contact + label: Contact points + repeating_label: Contact point + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + help_text: Contact information for enquiries about the dataset. + +- field_name: publisher + label: Publisher + repeating_label: Publisher + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + help_text: Entity responsible for making the dataset available. + +- field_name: license_id + label: License + form_snippet: license.html + help_text: License definitions and additional information can be found at http://opendefinition.org/. + +- field_name: owner_org + label: Organization + preset: dataset_organization + help_text: The CKAN organization the dataset belongs to. 
+ +- field_name: temporal_coverage + label: Temporal coverage + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + help_text: The temporal period or periods the dataset covers. + +- field_name: spatial_coverage + label: Spatial coverage + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: text + label: Label + + - field_name: geom + label: Geometry + + - field_name: bbox + label: Bounding Box + + - field_name: centroid + label: Centroid + help_text: A geographic region that is covered by the dataset. + +- field_name: theme + label: Theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A category of the dataset. A Dataset may be associated with multiple themes. + +resource_fields: + +- field_name: url + label: URL + preset: resource_url_upload + +- field_name: name + label: Name + form_placeholder: + help_text: A descriptive title for the resource. + +- field_name: description + label: Description + form_snippet: markdown.html + help_text: A free-text account of the resource. + +- field_name: format + label: Format + preset: resource_format_autocomplete + help_text: File format. If not provided it will be guessed. + +- field_name: availability + label: Availability + help_text: Indicates how long it is planned to keep the resource available. + +- field_name: license + label: License + help_text: License in which the resource is made available. If not provided will be inherited from the dataset. 
diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index c4e7738e..a77bacb0 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -33,7 +33,7 @@ @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config( - "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1_full.yaml" ) @pytest.mark.ckan_config( "scheming.presets", @@ -548,7 +548,7 @@ def test_legacy_fields(self): @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config( - "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1_full.yaml" ) @pytest.mark.ckan_config( "scheming.presets", @@ -580,7 +580,7 @@ def test_mimetype_is_guessed(self): @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config( - "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1_full.yaml" ) @pytest.mark.ckan_config( "scheming.presets", @@ -740,7 +740,7 @@ def test_e2e_dcat_to_ckan(self): @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config( - "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1.yaml" + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1_full.yaml" ) @pytest.mark.ckan_config( "scheming.presets", @@ -818,7 +818,7 @@ def test_spatial_field(self): @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config( - "scheming.dataset_schemas", 
"ckanext.dcat.schemas:dcat_ap_2.1.yaml" + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1_full.yaml" ) @pytest.mark.ckan_config( "scheming.presets", From 614e23b94c52d6bacbd76a5708db2c5186253a4c Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 10 Jun 2024 22:17:23 +0200 Subject: [PATCH 40/52] [#56] Update README --- README.md | 228 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 202 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index c79fc710..dd7a6d01 100644 --- a/README.md +++ b/README.md @@ -5,51 +5,66 @@ [![Code Coverage](http://codecov.io/github/ckan/ckanext-dcat/coverage.svg?branch=master)](http://codecov.io/github/ckan/ckanext-dcat?branch=master) -This extension provides plugins that allow CKAN to expose and consume metadata from other catalogs using RDF documents serialized using DCAT. The Data Catalog Vocabulary (DCAT) is "an RDF vocabulary designed to facilitate interoperability between data catalogs published on the Web". More information can be found on the following W3C page: +This extension provides plugins that allow CKAN to expose its metadata and consume metadata from other catalogs using RDF documents serialized using DCAT. The Data Catalog Vocabulary (DCAT) is "an RDF vocabulary designed to facilitate interoperability between data catalogs published on the Web". More information can be found on the following W3C page: [http://www.w3.org/TR/vocab-dcat](http://www.w3.org/TR/vocab-dcat) It also offers other features related to Semantic Data like exposing the necessary markup to get your datasets indexed in [Google Dataset Search](https://toolbox.google.com/datasetsearch). +Check the [overview](#overview) section for a summary of the available features. 
+ ## Contents + + - [Overview](#overview) - [Installation](#installation) +- [Schemas](#schemas) + * [Compatibility with existing profiles](#compatibility-with-existing-profiles) - [RDF DCAT endpoints](#rdf-dcat-endpoints) - - [Dataset endpoints](#dataset-endpoints) - - [Catalog endpoint](#catalog-endpoint) - - [URIs](#uris) - - [Content negotiation](#content-negotiation) + * [Dataset endpoints](#dataset-endpoints) + * [Catalog endpoint](#catalog-endpoint) + * [URIs](#uris) + * [Content negotiation](#content-negotiation) - [RDF DCAT harvester](#rdf-dcat-harvester) - - [Maximum file size](#maximum-file-size) - - [Transitive harvesting](#transitive-harvesting) - - [Extending the RDF harvester](#extending-the-rdf-harvester) + * [Maximum file size](#maximum-file-size) + * [Transitive harvesting](#transitive-harvesting) + * [Extending the RDF harvester](#extending-the-rdf-harvester) - [JSON DCAT harvester](#json-dcat-harvester) - [RDF DCAT to CKAN dataset mapping](#rdf-dcat-to-ckan-dataset-mapping) + * [Custom fields](#custom-fields) + * [URIs](#uris-1) + * [Lists](#lists) + * [Contact points and Publisher](#contact-points-and-publisher) + * [Spatial coverage](#spatial-coverage) + * [Licenses](#licenses) - [RDF DCAT Parser](#rdf-dcat-parser) - [RDF DCAT Serializer](#rdf-dcat-serializer) + * [Inherit license from the dataset as fallback in distributions](#inherit-license-from-the-dataset-as-fallback-in-distributions) - [Profiles](#profiles) - - [Writing custom profiles](#writing-custom-profiles) - - [Command line interface](#command-line-interface) - - [Compatibility mode](#compatibility-mode) + * [Writing custom profiles](#writing-custom-profiles) + * [Command line interface](#command-line-interface) + * [Compatibility mode](#compatibility-mode) - [XML DCAT harvester (deprecated)](#xml-dcat-harvester-deprecated) - [Translation of fields](#translation-of-fields) -- [Structured Data and Google Dataset Search indexing](#structured-data-and-google-dataset-search-indexing) 
+- [Structured data and Google Dataset Search indexing](#structured-data-and-google-dataset-search-indexing) - [CLI](#cli) - [Running the Tests](#running-the-tests) - [Releases](#releases) - [Acknowledgements](#acknowledgements) - [Copying and License](#copying-and-license) -## Overview + -With the emergence of Open Data initiatives around the world, the need to share metadata across different catalogs has became more evident. Sites like [data.europa.eu](https://data.europa.eu/en) aggregate datasets from different portals, and there has been a growing demand to provide a clear and standard interface to allow incorporating metadata into them automatically. +## Overview -There is growing consensus around [DCAT](http://www.w3.org/TR/vocab-dcat) being the right way forward, but actual implementations are needed. This extension aims to provide tools and guidance to allow publishers to publish and share DCAT based metadata easily. +[DCAT](http://www.w3.org/TR/vocab-dcat) has become the basis for many metadata sharing standards, like DCAT-AP and DCAT-US for data portals in Europe and the USA respectively. This extension aims to provide tools and guidance to allow publishers to publish and share DCAT based metadata easily. In terms of CKAN features, this extension offers: +* [Pre-built CKAN schemas](#schemas) for common Application Profiles that can be adapted to each site requirement to provide out-of-the -box DCAT support in data portals. + * [RDF DCAT Endpoints](#rdf-dcat-endpoints) that expose the catalog's datasets in different RDF serializations (`dcat` plugin). * An [RDF Harvester](#rdf-dcat-harvester) that allows importing RDF serializations from other catalogs to create CKAN datasets (`dcat_rdf_harvester` plugin). @@ -69,20 +84,66 @@ These are implemented internally using: ## Installation -1. 
Install ckanext-harvest ([https://github.com/ckan/ckanext-harvest#installation](https://github.com/ckan/ckanext-harvest#installation)) (Only if you want to use the RDF harvester) -2. Install the extension on your virtualenv: +1. Install the extension on your virtualenv: (pyenv) $ pip install -e git+https://github.com/ckan/ckanext-dcat.git#egg=ckanext-dcat -3. Install the extension requirements: +2. Install the extension requirements: (pyenv) $ pip install -r ckanext-dcat/requirements.txt -4. Enable the required plugins in your ini file: +3. Enable the required plugins in your ini file: ckan.plugins = dcat dcat_rdf_harvester dcat_json_harvester dcat_json_interface structured_data +4. To use the pre-built schemas, install [ckanext-scheming](https://github.com/ckan/ckanext-scheming): + + pip install -e "git+https://github.com/ckan/ckanext-scheming.git#egg=ckanext-scheming" + +Check the [Schemas](#schemas) section for extra configuration needed. + +Optionally, if you want to use the RDF harvester, install ckanext-harvest as well ([https://github.com/ckan/ckanext-harvest#installation](https://github.com/ckan/ckanext-harvest#installation)). + +## Schemas + +The extension includes ready to use [ckanext-scheming](https://github.com/ckan/ckanext-scheming) schemas that enable DCAT support. These include a schema definition file (located in `ckanext/dcat/schemas`) plus extra validators and other custom logic that integrates the metadata modifications with the RDF DCAT [Parsers](#rdf-dcat-parser) and [Serializers](#rdf-dcat-serializer) and other CKAN features and extensions. + +There are the following schemas currently included with the extension: + +* *dcat_ap_2.1_recommended.yaml*: Includes the recommended properties for `dcat:Dataset` and `dcat:Distribution` according to the [DCAT 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) specification. 
+* *dcat_ap_2.1_full.yaml*: Includes the most of the properties defined for `dcat:Dataset` and `dcat:Distribution` in the [DCAT 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) specification. + +Most sites will want to use these as a base to create their own custom schema to address their own requirements, perhaps alongside a [custom profile](#writing-custom-profiles). Of course site maintainers can add or remove schema fields, as well as change the existing validators. + +In any case, the schema file used should be defined in the configuration file, alongside these configuration options: + + # Make sure to add scheming_datasets after the dcat plugin + ckan.plugins = activity dcat [...] scheming_datasets + + # Point to one of the default or your own version of the schema file + scheming.dataset_schemas = ckanext.dcat.schemas:dcat_ap_2.1_recommended.yaml + + # Include the dcat presets as well as the standard scheming ones + scheming.presets = ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml + + # Sites using the euro_dcat_ap and euro_dcat_ap_2 profiles must add the + # euro_dcat_ap_scheming profile if they want to use ckanext-scheming schemas (see next section) + ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming + +### Compatibility with existing profiles + +Sites using the existing `euro_dcat_ap` and `euro_dcat_ap_2` profiles should not see any change in their +current parsing and serialization functionalities and these profiles will not change their outputs going +forward (unless a bug is being fixed). Sites willing to migrate to a scheming based metadata schema can do +so by adding the `euro_dcat_ap_scheming` profile at the end of their profile chain (e.g. +`ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming`), which will modify the existing profile +outputs to the expected format by the scheming validators. 
+ +Note that the scheming profile will only affect fields defined in the schema definition file, so sites can start migrating gradually different metadata fields. + + + ## RDF DCAT endpoints By default when the `dcat` plugin is enabled, the following RDF endpoints are available on your CKAN instance. The schema used on the serializations can be customized using [profiles](#profiles). @@ -308,13 +369,15 @@ To enable the JSON harvester, add the `dcat_json_harvester` plugin to your CKAN ## RDF DCAT to CKAN dataset mapping The following table provides a generic mapping between the fields of the `dcat:Dataset` and `dcat:Distribution` classes and -their equivalents on the CKAN model. In most cases this mapping is deliberately a loose one. For instance, it does not try to link +their equivalents in the CKAN model. In most cases this mapping is deliberately a loose one. For instance, it does not try to link the DCAT publisher property with a CKAN dataset author, maintainer or organization, as the link between them is not straight-forward and may depend on a particular instance needs. When mapping from CKAN metadata to DCAT though, there are in some cases fallback fields that are used if the default field is not present (see [RDF Serializer](#rdf-dcat-serializer) for more details on this. This mapping is compatible with the [DCAT-AP v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11) and [DCAT-AP v2.1](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210). It depends on the active profile(s) (see [Profiles](#profiles)) which DCAT properties are mapped. +Sites are encouraged to use ckanext-scheming to manage their metadata schema (see [Schemas](#schemas) for all details). This changes in +some cases the way metadata is stored internally and presented at the CKAN API level, but should not affect the RDF DCAT output. 
| DCAT class | DCAT property | CKAN dataset field | CKAN fallback fields | Stored as | | |-------------------|------------------------|-------------------------------------------|--------------------------------|-----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| @@ -341,7 +404,7 @@ This mapping is compatible with the [DCAT-AP v1.1](https://joinup.ec.europa.eu/a | dcat:Dataset | dct:isVersionOf | extra:is_version_of | | list | See note about lists. It is assumed that these are one or more URIs referring to another dcat:Dataset | | dcat:Dataset | dct:source | extra:source | | list | See note about lists. It is assumed that these are one or more URIs referring to another dcat:Dataset | | dcat:Dataset | adms:sample | extra:sample | | list | See note about lists. It is assumed that these are one or more URIs referring to dcat:Distribution instances | -| dcat:Dataset | dct:spatial | extra:spatial_uri | | text | If the RDF provides them, profiles should store the textual and geometric representation of the location in extra:spatial_text, extra:spatial, extra:spatial_bbox and extra:spatial_centroid respectively | +| dcat:Dataset | dct:spatial | extra:spatial_uri | | text | See note about the spatial field | | dcat:Dataset | dct:temporal | extra:temporal_start + extra:temporal_end | | text | None, one or both extras can be present | | dcat:Dataset | dcat:temporalResolution| extra:temporal_resolution | | list | | | dcat:Dataset | dcat:spatialResolutionInMeters| extra:spatial_resolution_in_meters | | list | | @@ -388,8 +451,33 @@ This mapping is compatible with the [DCAT-AP v1.1](https://joinup.ec.europa.eu/a *Notes* -* Whenever possible, URIs are extracted and stored so there is a clear reference to the original RDF resource. 
- For instance: +### Custom fields + +Fields marked as `extra:` are stored as free form extras in the `euro_dcat_ap` and `euro_dcat_ap_2` profiles, +but stored as first level custom fields when using the scheming based profile (`euro_dcat_ap_scheming`), i.e: + + ```json + { + "name": "test_dataset_dcat", + "extras": [ + {"key": "version_notes", "value": "Some version notes"} + ] + } + ``` + + vs: + + ```json + { + "name": "test_dataset_dcat", + "version_notes": "Some version notes" + } + ``` + +### URIs + +Whenever possible, URIs are extracted and stored so there is a clear reference to the original RDF resource. +For instance: ```xml @@ -456,7 +544,9 @@ This mapping is compatible with the [DCAT-AP v1.1](https://joinup.ec.europa.eu/a } ``` -* Lists are stored as a JSON string, eg: +### Lists + +On the legacy profiles, lists are stored as a JSON string, eg: ``` @prefix dcat: . @@ -481,7 +571,56 @@ This mapping is compatible with the [DCAT-AP v1.1](https://joinup.ec.europa.eu/a } ``` -* The following formats for `dct:spatial` are supported by the default [parser](#rdf-dcat-parser). Note that the default [serializer](#rdf-dcat-serializer) will return the single `dct:spatial` instance form by default. +On the scheming-based ones, these are shown as actual lists: + + ```json + { + "title": "Dataset 1", + "uri": "http://data.some.org/catalog/datasets/1"}, + "language": ["ca", "en", "es"] + "theme": ["Earth Sciences", "http://eurovoc.europa.eu/209065", "http://eurovoc.europa.eu/100142"] + } + ``` +### Contact points and Publisher + +Properties for `dcat:contactPoint` and `dct:publisher` are stored as namespaced extras in the legacy profiles. 
When using +a scheming-based profile, these are stored as proper objects (and multiple instances are allowed for contact point): + +```json +{ + "name": "test_dataset_dcat", + "title": "Test dataset DCAT", + "extras": [ + {"key":"contact_name","value":"PointofContact"}, + {"key":"contact_email","value":"contact@some.org"} + ], +} +``` + +vs: + +```json +{ + "name": "test_dataset_dcat", + "title": "Test dataset DCAT", + "contact": [ + { + "name": "Point of Contact 1", + "email": "contact1@some.org" + }, + { + "name": "Point of Contact 2", + "email": "contact2@some.org" + }, + ] +} +``` + + +### Spatial coverage + + +The following formats for `dct:spatial` are supported by the default [parser](#rdf-dcat-parser). Note that the default [serializer](#rdf-dcat-serializer) will return the single `dct:spatial` instance form by default. - One `dct:spatial` instance, URI only @@ -531,8 +670,45 @@ This mapping is compatible with the [DCAT-AP v1.1](https://joinup.ec.europa.eu/a ``` +If the RDF provides them, profiles should store the textual and geometric representation of the location in: + +* For legacy profiles in `spatial_text`, `spatial_bbox`, `spatial_centroid` or `spatial` (for any other geometries) extra fields +* For scheming-based profiles in objects in the `spatial_coverage` field, for instance: + +```json +{ + "name": "test_dataset_dcat", + "title": "Test dataset DCAT", + "spatial_coverage": [ + { + "geom": { + "type": "Polygon", + "coordinates": [...] 
+ }, + "text": "Tarragona", + "uri": "https://sws.geonames.org/6361390/", + "bbox": { + "type": "Polygon", + "coordinates": [ + [ + [-2.1604, 42.7611], + [-2.0938, 42.7611], + [-2.0938, 42.7931], + [-2.1604, 42.7931], + [-2.1604, 42.7611], + ] + ], + }, + "centroid": {"type": "Point", "coordinates": [1.26639, 41.12386]}, + } + ] +} +``` + + +### Licenses -* On the CKAN model, license is at the dataset level whereas in DCAT model it +On the CKAN model, license is at the dataset level whereas in DCAT model it is at distributions level. By default the RDF parser will try to find a distribution with a license that matches one of those registered in CKAN and attach this license to the dataset. The first matching distribution's From c11f3c2417cfd99816b80f9127d388ae401d0cb3 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 11 Jun 2024 11:01:08 +0200 Subject: [PATCH 41/52] [#56] README tweaks --- README.md | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index dd7a6d01..f050efdf 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ The extension includes ready to use [ckanext-scheming](https://github.com/ckan/c There are the following schemas currently included with the extension: * *dcat_ap_2.1_recommended.yaml*: Includes the recommended properties for `dcat:Dataset` and `dcat:Distribution` according to the [DCAT 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) specification. -* *dcat_ap_2.1_full.yaml*: Includes the most of the properties defined for `dcat:Dataset` and `dcat:Distribution` in the [DCAT 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) specification. +* *dcat_ap_2.1_full.yaml*: Includes most of the properties defined for `dcat:Dataset` and `dcat:Distribution` in the [DCAT 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) specification. 
Most sites will want to use these as a base to create their own custom schema to address their own requirements, perhaps alongside a [custom profile](#writing-custom-profiles). Of course site maintainers can add or remove schema fields, as well as change the existing validators. @@ -121,7 +121,7 @@ In any case, the schema file used should be defined in the configuration file, a # Make sure to add scheming_datasets after the dcat plugin ckan.plugins = activity dcat [...] scheming_datasets - # Point to one of the default or your own version of the schema file + # Point to one of the defaults or your own version of the schema file scheming.dataset_schemas = ckanext.dcat.schemas:dcat_ap_2.1_recommended.yaml # Include the dcat presets as well as the standard scheming ones @@ -381,59 +381,59 @@ some cases the way metadata is stored internally and presented at the CKAN API l | DCAT class | DCAT property | CKAN dataset field | CKAN fallback fields | Stored as | | |-------------------|------------------------|-------------------------------------------|--------------------------------|-----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| dcat:Dataset | - | extra:uri | | text | See note about URIs | +| dcat:Dataset | - | extra:uri | | text | See [URIs](#uris-1) | | dcat:Dataset | dct:title | title | | text | | | dcat:Dataset | dct:description | notes | | text | | | dcat:Dataset | dcat:keyword | tags | | text | | -| dcat:Dataset | dcat:theme | extra:theme | | list | See note about lists | +| dcat:Dataset | dcat:theme | extra:theme | | list | See [Lists](#lists) | | dcat:Dataset | dct:identifier | extra:identifier | extra:guid, id | text | | | dcat:Dataset | adms:identifier | extra:alternate_identifier | | text | | | dcat:Dataset | dct:issued | extra:issued | metadata_created | text | | | dcat:Dataset | dct:modified | extra:modified | metadata_modified | 
text | | | dcat:Dataset | owl:versionInfo | version | extra:dcat_version | text | | | dcat:Dataset | adms:versionNotes | extra:version_notes | | text | | -| dcat:Dataset | dct:language | extra:language | | list | See note about lists | +| dcat:Dataset | dct:language | extra:language | | list | See [Lists](#lists) | | dcat:Dataset | dcat:landingPage | url | | text | | | dcat:Dataset | dct:accrualPeriodicity | extra:frequency | | text | | -| dcat:Dataset | dct:conformsTo | extra:conforms_to | | list | See note about lists | +| dcat:Dataset | dct:conformsTo | extra:conforms_to | | list | See [Lists](#lists) | | dcat:Dataset | dct:accessRights | extra:access_rights | | text | | -| dcat:Dataset | foaf:page | extra:documentation | | list | See note about lists | +| dcat:Dataset | foaf:page | extra:documentation | | list | See [Lists](#lists) | | dcat:Dataset | dct:provenance | extra:provenance | | text | | | dcat:Dataset | dct:type | extra:dcat_type | | text | As of DCAT-AP v1.1 there's no controlled vocabulary for this field | -| dcat:Dataset | dct:hasVersion | extra:has_version | | list | See note about lists. It is assumed that these are one or more URIs referring to another dcat:Dataset | -| dcat:Dataset | dct:isVersionOf | extra:is_version_of | | list | See note about lists. It is assumed that these are one or more URIs referring to another dcat:Dataset | -| dcat:Dataset | dct:source | extra:source | | list | See note about lists. It is assumed that these are one or more URIs referring to another dcat:Dataset | -| dcat:Dataset | adms:sample | extra:sample | | list | See note about lists. It is assumed that these are one or more URIs referring to dcat:Distribution instances | -| dcat:Dataset | dct:spatial | extra:spatial_uri | | text | See note about the spatial field | +| dcat:Dataset | dct:hasVersion | extra:has_version | | list | See [Lists](#lists). 
It is assumed that these are one or more URIs referring to another dcat:Dataset | +| dcat:Dataset | dct:isVersionOf | extra:is_version_of | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | +| dcat:Dataset | dct:source | extra:source | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | +| dcat:Dataset | adms:sample | extra:sample | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to dcat:Distribution instances | +| dcat:Dataset | dct:spatial | extra:spatial_uri | | text | See [Spatial coverage](#spatial-coverage) | | dcat:Dataset | dct:temporal | extra:temporal_start + extra:temporal_end | | text | None, one or both extras can be present | | dcat:Dataset | dcat:temporalResolution| extra:temporal_resolution | | list | | | dcat:Dataset | dcat:spatialResolutionInMeters| extra:spatial_resolution_in_meters | | list | | | dcat:Dataset | dct:isReferencedBy | extra:is_referenced_by | | list | | -| dcat:Dataset | dct:publisher | extra:publisher_uri | | text | See note about URIs | +| dcat:Dataset | dct:publisher | extra:publisher_uri | | text | See [URIs](#uris-1) and [Publisher](#contact-points-and-publisher) | | foaf:Agent | foaf:name | extra:publisher_name | | text | | | foaf:Agent | foaf:mbox | extra:publisher_email | organization:title | text | | | foaf:Agent | foaf:homepage | extra:publisher_url | | text | | | foaf:Agent | dct:type | extra:publisher_type | | text | | -| dcat:Dataset | dcat:contactPoint | extra:contact_uri | | text | See note about URIs | +| dcat:Dataset | dcat:contactPoint | extra:contact_uri | | text | See [URIs](#uris-1) and [Contact points](#contact-points-and-publisher) | | vcard:Kind | vcard:fn | extra:contact_name | maintainer, author | text | | | vcard:Kind | vcard:hasEmail | extra:contact_email | maintainer_email, author_email | text | | | dcat:Dataset | dcat:distribution | resources | 
| text | | -| dcat:Distribution | - | resource:uri | | text | See note about URIs | +| dcat:Distribution | - | resource:uri | | text | See [URIs](#uris-1) | | dcat:Distribution | dct:title | resource:name | | text | | | dcat:Distribution | dcat:accessURL | resource:access_url | resource:url | text | If downloadURL is not present, accessURL will be used as resource url | | dcat:Distribution | dcat:downloadURL | resource:download_url | | text | If present, downloadURL will be used as resource url | | dcat:Distribution | dct:description | resource:description | | text | | | dcat:Distribution | dcat:mediaType | resource:mimetype | | text | | -| dcat:Distribution | dct:format | resource:format | | text | This is likely to require extra logic to accommodate how CKAN deals with formats (eg ckan/ckanext-dcat#18) | -| dcat:Distribution | dct:license | resource:license | | text | See note about dataset license | +| dcat:Distribution | dct:format | resource:format | | text | | +| dcat:Distribution | dct:license | resource:license | | text | See [Licenses](#licenses) | | dcat:Distribution | adms:status | resource:status | | text | | | dcat:Distribution | dcat:byteSize | resource:size | | number | | | dcat:Distribution | dct:issued | resource:issued | created | text | | | dcat:Distribution | dct:modified | resource:modified | metadata_modified | text | | | dcat:Distribution | dct:rights | resource:rights | | text | | -| dcat:Distribution | foaf:page | resource:documentation | | list | See note about lists | -| dcat:Distribution | dct:language | resource:language | | list | See note about lists | -| dcat:Distribution | dct:conformsTo | resource:conforms_to | | list | See note about lists | +| dcat:Distribution | foaf:page | resource:documentation | | list | See [Lists](#lists) | +| dcat:Distribution | dct:language | resource:language | | list | See [Lists](#lists) | +| dcat:Distribution | dct:conformsTo | resource:conforms_to | | list | See [Lists](#lists) | | dcat:Distribution 
| dcatap:availability | resource:availability | | text | | | dcat:Distribution | dcat:compressFormat | resource:compress_format | | text | | | dcat:Distribution | dcat:packageFormat | resource:package_format | | text | | @@ -616,6 +616,8 @@ vs: } ``` +If no `publisher` or `publisher_*` fields are found, the serializers will fall back to getting the publisher properties from the organization the CKAN dataset belongs to. The organization schema can be customized with the schema located in `ckanext/dcat/schemas/publisher_organization.yaml` to provide the extra properties supported (this will additionally require loading the `scheming_organizations` plugin in `ckan.plugins`). + ### Spatial coverage From 030cd3dbc3011d26c2b45492662afdd400d57579 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 11 Jun 2024 11:19:53 +0200 Subject: [PATCH 42/52] [#56] Docstrings --- ckanext/dcat/profiles/euro_dcat_ap_scheming.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index 6ff50a39..a93482a8 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -21,10 +21,16 @@ class EuropeanDCATAPSchemingProfile(RDFProfile): `euro_dcat_ap` and `euro_dcat_ap_2` profiles. 
It does not add or remove any properties from these profiles, it just transforms the resulting dataset_dict so it is compatible with a ckanext-scheming schema - TODO: summarize changes and link to docs """ def parse_dataset(self, dataset_dict, dataset_ref): + """ + Modify the dataset_dict generated by the euro_dcat_ap and euro_dcat_ap_2 profiles + to make it compatible with the scheming file definitions: + * Move extras to root level fields + * Parse lists (multiple text preset) + * Turn namespaced extras into repeating subfields + """ if not self._dataset_schema: # Not using scheming @@ -110,6 +116,9 @@ def _parse_list_value(data_dict, field_name): return dataset_dict def graph_from_dataset(self, dataset_dict, dataset_ref): + """ + Add triples to the graph from new repeating subfields + """ contact = dataset_dict.get("contact") if isinstance(contact, list) and len(contact): From 5fffa1515e88168f5e20310dd9559af2f33e5481 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 11 Jun 2024 13:09:56 +0200 Subject: [PATCH 43/52] [#56] Fix function call --- ckanext/dcat/profiles/euro_dcat_ap_scheming.py | 4 ++-- ckanext/dcat/tests/test_scheming_support.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index a93482a8..c945deff 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -157,10 +157,10 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): self._add_triple_from_dict(publisher, publisher_ref, FOAF.name, "name") self._add_triple_from_dict( - publisher, publisher_ref, FOAF.homepage, "url", URIRef + publisher, publisher_ref, FOAF.homepage, "url", _type=URIRef ) self._add_triple_from_dict( - publisher, publisher_ref, DCT.type, "type", URIRefOrLiteral + publisher, publisher_ref, DCT.type, "type", _type=URIRefOrLiteral ) self._add_triple_from_dict( publisher, diff --git
a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index a77bacb0..f39f0fdd 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -291,7 +291,7 @@ def test_e2e_ckan_to_dcat(self): g, publisher[0][2], FOAF.homepage, - dataset_dict["publisher"][0]["url"], + URIRef(dataset_dict["publisher"][0]["url"]), ) assert self._triple( g, From ad353596723c6455d1f206836643e0bbee09ba7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Mercader?= Date: Wed, 12 Jun 2024 12:46:22 +0200 Subject: [PATCH 44/52] Schemas description Co-authored-by: Ian Ward --- ckanext/dcat/schemas/dcat_ap_2.1_full.yaml | 2 +- ckanext/dcat/schemas/dcat_ap_2.1_recommended.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml index 051890ee..459694ca 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml @@ -1,6 +1,6 @@ scheming_version: 2 dataset_type: dataset -about: A reimplementation of the default CKAN dataset schema +about: Full DCAT AP 2.1 schema about_url: http://github.com/ckan/ckanext-dcat dataset_fields: diff --git a/ckanext/dcat/schemas/dcat_ap_2.1_recommended.yaml b/ckanext/dcat/schemas/dcat_ap_2.1_recommended.yaml index a8b5bf1e..ed386d67 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1_recommended.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1_recommended.yaml @@ -1,6 +1,6 @@ scheming_version: 2 dataset_type: dataset -about: A reimplementation of the default CKAN dataset schema +about: Recommended fields for DCAT AP 2.1 schema about_url: http://github.com/ckan/ckanext-dcat dataset_fields: From b600493c1b0228a27655a6aa504eb1f1308bd028 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 13 Jun 2024 12:39:51 +0200 Subject: [PATCH 45/52] [#56] Index subfields as extras_ Solr field As this is a `text` field that allows free text search --- 
ckanext/dcat/plugins/__init__.py | 2 +- ckanext/dcat/tests/test_scheming_support.py | 28 +++++++++++++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index 087eb2f7..2aef170b 100644 --- a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -173,7 +173,7 @@ def before_dataset_index(self, dataset_dict): value = item[key] if not isinstance(value, dict): # Index a flattened version - new_key = f'{field["field_name"]}__{key}' + new_key = f'extras_{field["field_name"]}__{key}' if not dataset_dict.get(new_key): dataset_dict[new_key] = value else: diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index f39f0fdd..913cb4a9 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -769,12 +769,36 @@ def test_repeating_subfields_index(self): # Dict sent to Solr search_dict = m.mock_calls[1].kwargs["docs"][0] - assert search_dict["contact__name"] == "Contact 1 Contact 2" + assert search_dict["extras_contact__name"] == "Contact 1 Contact 2" assert ( - search_dict["contact__email"] + search_dict["extras_contact__email"] == "contact1@example.org contact2@example.org" ) + def test_repeating_subfields_search(self): + + dataset_dict = { + # Core fields + "name": "test-dataset", + "title": "Test DCAT dataset", + "notes": "Some notes", + # Repeating subfields + "contact": [ + {"name": "Contact 1", "email": "contact1@example.org"}, + {"name": "Contact 2", "email": "contact2@example.org"}, + ], + } + + dataset = call_action("package_create", **dataset_dict) + + result = call_action("package_search", q="Contact 2") + + assert result["results"][0]["id"] == dataset["id"] + + result = call_action("package_search", q="Contact 3") + + assert result["count"] == 0 + def test_spatial_field(self): dataset_dict = { From f88e4335e5597366fa7398a88ef12dcddd92f713 Mon Sep 17 00:00:00 
2001 From: amercader Date: Thu, 13 Jun 2024 13:04:34 +0200 Subject: [PATCH 46/52] [#56] Clean the index before tests --- ckanext/dcat/tests/test_scheming_support.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 913cb4a9..d8770e2f 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -737,7 +737,7 @@ def test_e2e_dcat_to_ckan(self): ] -@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.usefixtures("with_plugins", "clean_db", "clean_index") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1_full.yaml" From 898912ca19a43efe65fc42c9a8952eaf61641e89 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 19 Jun 2024 11:44:07 +0200 Subject: [PATCH 47/52] [#56] Avoid empty list in spatial resolution --- ckanext/dcat/profiles/base.py | 2 +- ckanext/dcat/validators.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index a2eddd71..d1ff561b 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -713,7 +713,7 @@ def _read_list_value(self, value): # List of values if isinstance(value, list): items = value - elif isinstance(value, str): + elif value and isinstance(value, str): try: items = json.loads(value) if isinstance(items, ((int, float, complex))): diff --git a/ckanext/dcat/validators.py b/ckanext/dcat/validators.py index 6dae17af..7e110f5b 100644 --- a/ckanext/dcat/validators.py +++ b/ckanext/dcat/validators.py @@ -86,7 +86,7 @@ def _scheming_multiple_number(key, data, errors, context): return value = data[key] - if value is not missing: + if value and value is not missing: if not isinstance(value, list): if isinstance(value, str) and value.startswith("["): From 
97e68de9a270c62ba7de8cfe05adb92343df287b Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 19 Jun 2024 12:08:39 +0200 Subject: [PATCH 48/52] [#56] Markdown for provenance --- ckanext/dcat/schemas/dcat_ap_2.1_full.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml index 459694ca..d9532011 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml @@ -119,6 +119,8 @@ dataset_fields: - field_name: provenance label: Provenance + form_snippet: markdown.html + display_snippet: markdown.html help_text: A statement about the lineage of the dataset. - field_name: dcat_type From a8a3f255ec18d822712e0b0d900e90e7d5ed6949 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 19 Jun 2024 12:23:16 +0200 Subject: [PATCH 49/52] [#56] Don't serialize empty repeating subfields Scheming adds a dict with empty keys when empty repeating subfields are submitted from the form. 
Check that there's an actual value before creating the triples when serializing --- .../dcat/profiles/euro_dcat_ap_scheming.py | 11 ++++++---- ckanext/dcat/tests/test_scheming_support.py | 22 +++++++++++++++++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index c945deff..12eb540e 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -120,8 +120,11 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): Add triples to the graph from new repeating subfields """ + def _not_empty_dict(data_dict): + return any(data_dict.values()) + contact = dataset_dict.get("contact") - if isinstance(contact, list) and len(contact): + if isinstance(contact, list) and len(contact) and _not_empty_dict(contact[0]): for item in contact: contact_uri = item.get("uri") if contact_uri: @@ -144,7 +147,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ) publisher = dataset_dict.get("publisher") - if isinstance(publisher, list) and len(publisher): + if isinstance(publisher, list) and len(publisher) and _not_empty_dict(publisher[0]): publisher = publisher[0] publisher_uri = publisher.get("uri") if publisher_uri: @@ -172,7 +175,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ) temporal = dataset_dict.get("temporal_coverage") - if isinstance(temporal, list) and len(temporal): + if isinstance(temporal, list) and len(temporal) and _not_empty_dict(temporal[0]): for item in temporal: temporal_ref = BNode() self.g.add((temporal_ref, RDF.type, DCT.PeriodOfTime)) @@ -183,7 +186,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): self.g.add((dataset_ref, DCT.temporal, temporal_ref)) spatial = dataset_dict.get("spatial_coverage") - if isinstance(spatial, list) and len(spatial): + if isinstance(spatial, list) and len(spatial) and _not_empty_dict(spatial[0]): for item in spatial: if 
item.get("uri"): spatial_ref = CleanedURIRef(item["uri"]) diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index d8770e2f..3779780d 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -514,6 +514,28 @@ def test_publisher_fallback_org_ignored_if_publisher_field_present(self): g, publisher[0][2], FOAF.name, dataset_dict["publisher"][0]["name"] ) + def test_empty_repeating_subfields_not_serialized(self): + + dataset_dict = { + "name": "test-dataset-3", + "title": "Test DCAT dataset 3", + "notes": "Lorem ipsum", + "spatial_coverage": [ + { + "uri": "", + "geom": "", + }, + ], + } + + dataset = call_action("package_create", **dataset_dict) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + assert not [t for t in g.triples((dataset_ref, DCT.spatial, None))] + def test_legacy_fields(self): dataset_dict = { From c7b8c027495f4dece84562090b44fd20f1374604 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 2 Jul 2024 16:37:34 +0200 Subject: [PATCH 50/52] [#56] More robust date parsing with dateutil, expand tests --- ckanext/dcat/tests/test_scheming_support.py | 138 ++++++++++++-------- ckanext/dcat/tests/test_validators.py | 27 +++- ckanext/dcat/validators.py | 8 +- 3 files changed, 114 insertions(+), 59 deletions(-) diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 3779780d..9c9b1065 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -369,7 +369,9 @@ def test_e2e_ckan_to_dcat(self): # Resources: standard fields assert self._triple(g, distribution_ref, DCT.rights, resource["rights"]) - assert self._triple(g, distribution_ref, ADMS.status, URIRef(resource["status"])) + assert self._triple( + g, distribution_ref, ADMS.status, URIRef(resource["status"]) + ) assert self._triple( g, distribution_ref, @@ -566,6 +568,88 @@ 
def test_legacy_fields(self): assert len(publisher) == 1 assert self._triple(g, publisher[0][2], FOAF.name, "Test Publisher") + def test_dcat_date(self): + dataset_dict = { + # Core fields + "name": "test-dataset", + "title": "Test DCAT dataset", + "notes": "Some notes", + "issued": "2024", + "modified": "2024-10", + "temporal_coverage": [ + {"start": "1905-03-01T10:07:31.182680", "end": "2013-01-05"}, + {"start": "2024-04-10T10:07:31", "end": "2024-05-29"}, + ], + } + + dataset = call_action("package_create", **dataset_dict) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + # Year + assert dataset["issued"] == dataset_dict["issued"] + assert self._triple( + g, + dataset_ref, + DCT.issued, + dataset_dict["issued"], + data_type=XSD.gYear, + ) + + # Year-month + assert dataset["modified"] == dataset_dict["modified"] + assert self._triple( + g, + dataset_ref, + DCT.modified, + dataset_dict["modified"], + data_type=XSD.gYearMonth, + ) + + temporal = [t for t in g.triples((dataset_ref, DCT.temporal, None))] + + # Date + assert ( + dataset["temporal_coverage"][0]["end"] + == dataset_dict["temporal_coverage"][0]["end"] + ) + + assert self._triple( + g, + temporal[0][2], + SCHEMA.endDate, + dataset_dict["temporal_coverage"][0]["end"], + data_type=XSD.date, + ) + + # Datetime + assert ( + dataset["temporal_coverage"][0]["start"] + == dataset_dict["temporal_coverage"][0]["start"] + ) + assert self._triple( + g, + temporal[0][2], + SCHEMA.startDate, + dataset_dict["temporal_coverage"][0]["start"], + data_type=XSD.dateTime, + ) + + assert ( + dataset["temporal_coverage"][1]["start"] + == dataset_dict["temporal_coverage"][1]["start"] + ) + assert self._triple( + g, + temporal[1][2], + SCHEMA.startDate, + dataset_dict["temporal_coverage"][1]["start"], + data_type=XSD.dateTime, + ) + @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @@ -859,55 +943,3 @@ def 
test_spatial_field(self): assert search_dict["spatial"] == json.dumps( dataset_dict["spatial_coverage"][0]["centroid"] ) - - -@pytest.mark.usefixtures("with_plugins", "clean_db") -@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") -@pytest.mark.ckan_config( - "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1_full.yaml" -) -@pytest.mark.ckan_config( - "scheming.presets", - "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", -) -@pytest.mark.ckan_config( - "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" -) -class TestSchemingPresets: - def test_dcat_date(self): - dataset_dict = { - # Core fields - "name": "test-dataset", - "title": "Test DCAT dataset", - "notes": "Some notes", - "issued": "2024", - "modified": "2024-10", - "temporal_coverage": [ - {"start": "1905-03-01T10:07:31.182680", "end": "2013-01-05"}, - {"start": "2024-04-10T10:07:31", "end": "2024-05-29"}, - ], - } - - dataset = call_action("package_create", **dataset_dict) - - # Year - assert dataset["issued"] == dataset_dict["issued"] - - # Year-month - assert dataset["modified"] == dataset_dict["modified"] - - # Date - assert ( - dataset["temporal_coverage"][0]["end"] - == dataset_dict["temporal_coverage"][0]["end"] - ) - - # Datetime - assert ( - dataset["temporal_coverage"][0]["start"] - == dataset_dict["temporal_coverage"][0]["start"] - ) - assert ( - dataset["temporal_coverage"][1]["start"] - == dataset_dict["temporal_coverage"][1]["start"] - ) diff --git a/ckanext/dcat/tests/test_validators.py b/ckanext/dcat/tests/test_validators.py index 97edf17c..e1f6944e 100644 --- a/ckanext/dcat/tests/test_validators.py +++ b/ckanext/dcat/tests/test_validators.py @@ -1,8 +1,10 @@ +import datetime import json + import pytest from ckantoolkit import StopOnError -from ckanext.dcat.validators import scheming_multiple_number +from ckanext.dcat.validators import scheming_multiple_number, dcat_date def test_scheming_multiple_number(): @@ -60,3 +62,26 @@ def 
test_scheming_multiple_number_wrong_value(): assert errors[key][0].startswith("invalid type for repeating number") errors = {key: []} + + +@pytest.mark.usefixtures( + "with_plugins", +) +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +def test_dcat_date_valid(): + + key = ("some_date",) + errors = {key: []} + valid_values = [ + datetime.datetime.now(), + "2024", + "2024-07", + "2024-07-01", + "1905-03-01T10:07:31.182680", + "2024-04-10T10:07:31", + "2024-04-10T10:07:31.000Z", + ] + + for value in valid_values: + data = {key: value} + dcat_date(key, data, errors, {}), value diff --git a/ckanext/dcat/validators.py b/ckanext/dcat/validators.py index 7e110f5b..c9ee7d50 100644 --- a/ckanext/dcat/validators.py +++ b/ckanext/dcat/validators.py @@ -2,10 +2,10 @@ import json import re +from dateutil.parser import parse as parse_date from ckantoolkit import ( missing, StopOnError, - get_validator, Invalid, _, ) @@ -41,8 +41,6 @@ def is_date(value): def dcat_date(key, data, errors, context): value = data[key] - scheming_isodatetime = get_validator("scheming_isodatetime") - if isinstance(value, datetime.datetime): return @@ -50,8 +48,8 @@ def dcat_date(key, data, errors, context): return try: - scheming_isodatetime({}, {})(key, data, errors, context) - except Invalid: + parse_date(value) + except ValueError: raise Invalid( _( "Date format incorrect. 
Supported formats are YYYY, YYYY-MM, YYYY-MM-DD and YYYY-MM-DDTHH:MM:SS" From 39b4d9162c30772b4c10db17ee7126b734c95319 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 3 Jul 2024 12:10:16 +0200 Subject: [PATCH 51/52] [#56] Add tests for invalid and ambiguous dates --- ckanext/dcat/tests/test_scheming_support.py | 25 +++++++++++++++++++++ ckanext/dcat/tests/test_validators.py | 21 ++++++++++++----- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 9c9b1065..d79523fd 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -579,6 +579,7 @@ def test_dcat_date(self): "temporal_coverage": [ {"start": "1905-03-01T10:07:31.182680", "end": "2013-01-05"}, {"start": "2024-04-10T10:07:31", "end": "2024-05-29"}, + {"start": "11/24/24", "end": "06/12/12"}, ], } @@ -650,6 +651,30 @@ def test_dcat_date(self): data_type=XSD.dateTime, ) + # Ambiguous Datetime + assert ( + dataset["temporal_coverage"][2]["start"] + == dataset_dict["temporal_coverage"][2]["start"] + ) + assert self._triple( + g, + temporal[2][2], + SCHEMA.startDate, + "2024-11-24T00:00:00", + data_type=XSD.dateTime, + ) + assert ( + dataset["temporal_coverage"][2]["end"] + == dataset_dict["temporal_coverage"][2]["end"] + ) + assert self._triple( + g, + temporal[2][2], + SCHEMA.endDate, + "2012-06-12T00:00:00", + data_type=XSD.dateTime, + ) + @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") diff --git a/ckanext/dcat/tests/test_validators.py b/ckanext/dcat/tests/test_validators.py index e1f6944e..700cc644 100644 --- a/ckanext/dcat/tests/test_validators.py +++ b/ckanext/dcat/tests/test_validators.py @@ -3,7 +3,7 @@ import pytest -from ckantoolkit import StopOnError +from ckantoolkit import StopOnError, Invalid from ckanext.dcat.validators import scheming_multiple_number, dcat_date @@ 
-64,10 +64,6 @@ def test_scheming_multiple_number_wrong_value(): errors = {key: []} -@pytest.mark.usefixtures( - "with_plugins", -) -@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") def test_dcat_date_valid(): key = ("some_date",) @@ -85,3 +81,18 @@ def test_dcat_date_valid(): for value in valid_values: data = {key: value} dcat_date(key, data, errors, {}), value + + +def test_dcat_date_invalid(): + + key = ("some_date",) + errors = {key: []} + invalid_values = [ + "2024+07", + "not_a_date", + ] + + for value in invalid_values: + data = {key: value} + with pytest.raises(Invalid): + dcat_date(key, data, errors, {}), value From ae78f0f4fbebb3b7b2177170e34f25b34de8d945 Mon Sep 17 00:00:00 2001 From: amercader Date: Fri, 5 Jul 2024 12:00:41 +0200 Subject: [PATCH 52/52] [#56] Update changelog with scheming changes --- CHANGELOG.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c48332a9..c1aa08c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,27 @@ ## [Unreleased](https://github.com/ckan/ckanext-dcat/compare/v1.7.0...HEAD) +* Support for standard CKAN [ckanext-scheming](https://github.com/ckan/ckanext-scheming) schemas. + The DCAT profiles now seamlessly integrate with fields defined via the YAML or JSON scheming files. + Sites willing to migrate to a scheming based metadata schema can do + so by adding the `euro_dcat_ap_scheming` profile at the end of their profile chain (e.g. + `ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming`), which will modify the existing profile + outputs to the expected format by the scheming validators. Sample schemas are provided + in the `ckanext/dcat/schemas` folder. See the [documentation](https://github.com/ckan/ckanext-dcat?tab=readme-ov-file#schemas) + for all details. 
Some highlights of the new scheming based profiles: + + * Actual list support in the API ooutput for list properties like `dct:language` + * Multiple objects now allowed for properties like `dcat:ContactPoint`, `dct:spatial` or `dct:temporal` + * Custom validators for date values that allow `xsd:gYear`, `xsd:gYearMonth`, `xsd:date` and `xsd:dateTime` + + (#281) +* New `ckan dcat consume` and `ckan dcat produce` CLI commands (#279) +* Parse dcat:spatialResolutionInMeters as float (#285) +* Split profile classes into their own separate files (#282) +* Catch Not Authorized in View (#280) +* CKAN 2.11 support and requirements updates (#270) + + ## [v1.7.0](https://github.com/ckan/ckanext-dcat/compare/v1.6.0...v1.7.0) - 2024-04-04 * Adds support for the latest Hydra vocabulary. For backward compatibility, the old properties are still supported but marked as deprecated. (#267)