From d7c95972ece37123f1824173f068acabb6b4ac52 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 18 Jun 2024 14:28:25 +0200 Subject: [PATCH 01/18] First iteration of Shacl validation for DCAT-AP 2.1.1 (scheming) --- .../shacl/dcat-ap_2.1.1_shacl_shapes.ttl | 639 ++++++++++++++++++ ckanext/dcat/tests/test_shacl.py | 155 +++++ dev-requirements.txt | 1 + 3 files changed, 795 insertions(+) create mode 100644 ckanext/dcat/tests/shacl/dcat-ap_2.1.1_shacl_shapes.ttl create mode 100644 ckanext/dcat/tests/test_shacl.py diff --git a/ckanext/dcat/tests/shacl/dcat-ap_2.1.1_shacl_shapes.ttl b/ckanext/dcat/tests/shacl/dcat-ap_2.1.1_shacl_shapes.ttl new file mode 100644 index 00000000..8c441529 --- /dev/null +++ b/ckanext/dcat/tests/shacl/dcat-ap_2.1.1_shacl_shapes.ttl @@ -0,0 +1,639 @@ +@prefix rdf: . +@prefix : . +@prefix adms: . +@prefix cc: . +@prefix dc: . +@prefix dcat: . +@prefix dct: . +@prefix foaf: . +@prefix lcon: . +@prefix org: . +@prefix owl: . +@prefix odrl: . +@prefix prov: . +@prefix rdfs: . +@prefix schema: . +@prefix sh: . +@prefix skos: . +@prefix spdx: . +@prefix time: . +@prefix vcard: . +@prefix xsd: . +@prefix dcatap: . 
+ + + dcat:accessURL ; + dcat:downloadURL ; + dcatap:availability ; + dct:format ; + dct:conformsTo ; + dct:creator [ + rdfs:seeAlso ; + org:memberOf ; + foaf:homepage ; + foaf:name "Bert Van Nuffelen" + ], [ + rdfs:seeAlso ; + org:memberOf ; + foaf:homepage ; + foaf:name "Natasa Sofou" + ], [ + rdfs:seeAlso ; + org:memberOf ; + foaf:homepage ; + foaf:name "Eugeniu Costetchi" + ], [ + rdfs:seeAlso ; + org:memberOf ; + foaf:homepage ; + foaf:name "Makx Dekkers" + ], [ + rdfs:seeAlso ; + org:memberOf ; + foaf:homepage ; + foaf:name "Nikolaos Loutas" + ], [ + rdfs:seeAlso ; + org:memberOf ; + foaf:homepage ; + foaf:name "Vassilios Peristeras" + ] ; + dct:license ; + cc:attributionURL ; + dct:modified "2021-12-01"^^xsd:date ; + dct:publisher ; + dct:relation ; + dct:description "This document specifies the constraints on properties and classes expressed by DCAT-AP in SHACL."@en ; + dct:title "The constraints of DCAT Application Profile for Data Portals in Europe"@en ; + owl:versionInfo "2.1.1" ; + foaf:homepage ; + foaf:maker [ + foaf:mbox ; + foaf:name "DCAT-AP Working Group" ; + foaf:page , + ] . + + + +#------------------------------------------------------------------------- +# The shapes in this file cover all classes in DCAT-AP 2.1.1. +# It covers all constraints that must be satisfied except those checking the ranges. +# +#------------------------------------------------------------------------- + +:Agent_Shape + a sh:NodeShape ; + sh:name "Agent"@en ; + sh:property [ + sh:minCount 1 ; + sh:nodeKind sh:Literal ; + sh:path foaf:name ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:type ; + sh:severity sh:Violation + ] ; + sh:targetClass foaf:Agent . 
+ +:CatalogRecord_Shape + a sh:NodeShape ; + sh:name "Catalog Record"@en ; + sh:property [ + sh:maxCount 1 ; + sh:minCount 1 ; + sh:node :DcatResource_Shape ; + sh:path foaf:primaryTopic ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:minCount 1 ; + sh:path dct:modified ; + sh:severity sh:Violation ; + sh:node :DateOrDateTimeDataType_Shape + ], [ + sh:maxCount 1 ; + sh:path dct:conformsTo ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:node :DateOrDateTimeDataType_Shape ; + sh:path dct:issued ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path adms:status ; + sh:severity sh:Violation + ], [ + sh:path dct:language ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:source ; + sh:severity sh:Violation + ], [ + sh:nodeKind sh:Literal ; + sh:path dct:title ; + sh:severity sh:Violation + ], [ + sh:nodeKind sh:Literal ; + sh:path dct:description ; + sh:severity sh:Violation + ] ; + sh:targetClass dcat:CatalogRecord . + +:Catalog_Shape + a sh:NodeShape ; + sh:name "Catalog"@en ; + sh:property [ + sh:path dct:language ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:license ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:node :DateOrDateTimeDataType_Shape ; + sh:path dct:issued ; + sh:severity sh:Violation + ], [ + sh:path dct:spatial ; + sh:severity sh:Violation + ], [ + sh:path dct:hasPart ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:isPartOf ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:node :DateOrDateTimeDataType_Shape ; + sh:path dct:modified ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:rights ; + sh:severity sh:Violation + ], [ + sh:path dcat:record ; + sh:severity sh:Violation + ], [ + sh:path dcat:themeTaxonomy ; + sh:severity sh:Violation + ], [ + sh:path dcat:service ; + sh:severity sh:Violation + ], [ + sh:path dcat:catalog ; + sh:severity sh:Violation + ], [ + sh:path dct:creator ; + sh:severity sh:Violation + ], [ + 
sh:path dcat:dataset ; + sh:severity sh:Violation + ], [ + sh:minCount 1 ; + sh:nodeKind sh:Literal ; + sh:path dct:description ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:minCount 1 ; + sh:path dct:publisher ; + sh:severity sh:Violation + ], [ + sh:minCount 1 ; + sh:nodeKind sh:Literal ; + sh:path dct:title ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path foaf:homepage ; + sh:severity sh:Violation + ] ; + sh:targetClass dcat:Catalog . + +:CategoryScheme_Shape + a sh:NodeShape ; + sh:name "Category Scheme"@en ; + sh:property [ + sh:minCount 1 ; + sh:nodeKind sh:Literal ; + sh:path dct:title ; + sh:severity sh:Violation + ] ; + sh:targetClass skos:ConceptScheme . + +:Category_Shape + a sh:NodeShape ; + sh:name "Category"@en ; + sh:property [ + sh:minCount 1 ; + sh:nodeKind sh:Literal ; + sh:path skos:prefLabel ; + sh:severity sh:Violation + ] ; + sh:targetClass skos:Concept . + +:Checksum_Shape + a sh:NodeShape ; + sh:name "Checksum"@en ; + sh:property [ + sh:hasValue spdx:checksumAlgorithm_sha1 ; + sh:maxCount 1 ; + sh:minCount 1 ; + sh:path spdx:algorithm ; + sh:severity sh:Violation + ], [ + sh:datatype xsd:hexBinary ; + sh:maxCount 1 ; + sh:minCount 1 ; + sh:path spdx:checksumValue ; + sh:severity sh:Violation + ] ; + sh:targetClass spdx:Checksum . 
+ +:DataService_Shape + a sh:NodeShape ; + sh:name "Data Service"@en ; + sh:property [ + sh:minCount 1 ; + sh:nodeKind sh:Literal ; + sh:path dct:title ; + sh:severity sh:Violation + ], [ + sh:minCount 1 ; + sh:nodeKind sh:BlankNodeOrIRI ; + sh:path dcat:endpointURL ; + sh:severity sh:Violation + ], [ + sh:path dcat:servesDataset ; + sh:severity sh:Violation + ], [ + sh:nodeKind sh:Literal ; + sh:path dct:description ; + sh:severity sh:Violation + ], [ + sh:nodeKind sh:BlankNodeOrIRI ; + sh:path dcat:endpointDescription ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:license ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:accessRights ; + sh:severity sh:Violation + ] ; + sh:targetClass dcat:DataService . + +:Dataset_Shape + a sh:NodeShape ; + sh:name "Dataset"@en ; + sh:property [ + sh:minCount 1 ; + sh:nodeKind sh:Literal ; + sh:path dct:description ; + sh:severity sh:Violation + ], [ + sh:minCount 1 ; + sh:nodeKind sh:Literal ; + sh:path dct:title ; + sh:severity sh:Violation + ], [ + sh:nodeKind sh:Literal ; + sh:path dct:identifier ; + sh:severity sh:Violation + ], [ + sh:path dcat:contactPoint ; + sh:severity sh:Violation + ], [ + sh:path dcat:distribution ; + sh:severity sh:Violation + ], [ + sh:nodeKind sh:Literal ; + sh:path dcat:keyword ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:publisher ; + sh:severity sh:Violation + ], [ + sh:path dct:spatial ; + sh:severity sh:Violation + ], [ + sh:path dct:temporal ; + sh:severity sh:Violation + ], [ + sh:path dcat:theme ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:accessRights ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:accrualPeriodicity ; + sh:severity sh:Violation + ], [ + sh:path dct:conformsTo ; + sh:severity sh:Violation + ], [ + sh:path dct:hasVersion ; + sh:severity sh:Violation + ], [ + sh:path dct:isVersionOf ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:issued ; + 
sh:severity sh:Violation ; + sh:node :DateOrDateTimeDataType_Shape + ], [ + sh:path dct:language ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:modified ; + sh:severity sh:Violation ; + sh:node :DateOrDateTimeDataType_Shape + ], [ + sh:path dct:provenance ; + sh:severity sh:Violation + ], [ + sh:nodeKind sh:BlankNodeOrIRI ; + sh:path dct:relation ; + sh:severity sh:Violation + ], [ + sh:path dct:source ; + sh:severity sh:Violation + ], [ + sh:path dct:type ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:nodeKind sh:Literal ; + sh:path owl:versionInfo ; + sh:severity sh:Violation + ], [ + sh:nodeKind sh:Literal ; + sh:path adms:versionNotes ; + sh:severity sh:Violation + ], [ + sh:path adms:identifier ; + sh:severity sh:Violation + ], [ + sh:path adms:sample ; + sh:severity sh:Violation + ], [ + sh:path dcat:landingPage ; + sh:severity sh:Violation + ], [ + sh:path foaf:page ; + sh:severity sh:Violation + ], [ + sh:path dcat:qualifiedRelation ; + sh:severity sh:Violation + ], [ + sh:nodeKind sh:BlankNodeOrIRI ; + sh:path dc:isReferencedBy ; + sh:severity sh:Violation + ], [ + sh:path prov:qualifiedAttribution ; + sh:severity sh:Violation + ], [ + sh:path prov:wasGeneratedBy ; + sh:severity sh:Violation + ], [ + sh:datatype xsd:duration ; + sh:maxCount 1 ; + sh:path dcat:temporalResolution ; + sh:severity sh:Violation + ], [ + sh:datatype xsd:decimal ; + sh:maxCount 1 ; + sh:path dcat:spatialResolutionInMeters ; + sh:severity sh:Violation + ], [ + sh:path dct:creator ; + sh:severity sh:Violation + ] ; + sh:targetClass dcat:Dataset . 
+ +:DateOrDateTimeDataType_Shape + a sh:NodeShape ; + rdfs:comment "Date time date disjunction shape checks that a datatype property receives a temporal value: date, dateTime, gYear or gYearMonth literal" ; + rdfs:label "Date time date disjunction" ; + sh:message "The values must be data typed as either xsd:date, xsd:dateTime, xsd:gYear or xsd:gYearMonth" ; + sh:or ([ + sh:datatype xsd:date + ] + [ + sh:datatype xsd:dateTime + ] + [ + sh:datatype xsd:gYear + ] + [ + sh:datatype xsd:gYearMonth + ] + ) . + +:DcatResource_Shape + a sh:NodeShape ; + rdfs:comment "the union of Catalog, Dataset and DataService" ; + rdfs:label "dcat:Resource" ; + sh:message "The node is either a Catalog, Dataset or a DataService" ; + sh:or ([ + sh:class dcat:Catalog + ] + [ + sh:class dcat:Dataset + ] + [ + sh:class dcat:DataService + ] + ) . + +:Distribution_Shape + a sh:NodeShape ; + sh:name "Distribution"@en ; + sh:property [ + sh:path dct:conformsTo ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:node :DateOrDateTimeDataType_Shape ; + sh:path dct:issued ; + sh:severity sh:Violation + ], [ + sh:path dct:language ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:node :DateOrDateTimeDataType_Shape ; + sh:path dct:modified ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:rights ; + sh:severity sh:Violation + ], [ + sh:nodeKind sh:Literal ; + sh:path dct:title ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path spdx:checksum ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path adms:status ; + sh:severity sh:Violation + ], [ + sh:datatype xsd:decimal ; + sh:maxCount 1 ; + sh:path dcat:byteSize ; + sh:severity sh:Violation + ], [ + sh:nodeKind sh:BlankNodeOrIRI; + sh:path dcat:downloadURL ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dcat:mediaType ; + sh:severity sh:Violation + ], [ + sh:path foaf:page ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path odrl:hasPolicy ; + sh:severity sh:Violation + 
], [ + sh:path dcat:accessService ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dcat:compressFormat ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dcat:packageFormat ; + sh:severity sh:Violation + ], [ + sh:datatype xsd:duration ; + sh:maxCount 1 ; + sh:path dcat:temporalResolution ; + sh:severity sh:Violation + ], [ + sh:datatype xsd:decimal ; + sh:maxCount 1 ; + sh:path dcat:spatialResolutionInMeters ; + sh:severity sh:Violation + ], [ + sh:minCount 1 ; + sh:nodeKind sh:BlankNodeOrIRI; + sh:path dcat:accessURL ; + sh:severity sh:Violation + ], [ + sh:nodeKind sh:Literal ; + sh:path dct:description ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dcatap:availability ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:format ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dct:license ; + sh:severity sh:Violation + ] ; + sh:targetClass dcat:Distribution . + +:Identifier_Shape + a sh:NodeShape ; + sh:name "Identifier"@en ; + sh:property [ + sh:maxCount 1 ; + sh:path skos:notation ; + sh:severity sh:Violation + ] ; + sh:targetClass adms:Identifier . + +:LicenceDocument_Shape + a sh:NodeShape ; + sh:name "Licence Document"@en ; + sh:property [ + sh:path dct:type ; + sh:severity sh:Violation + ] ; + sh:targetClass dct:LicenseDocument . + +:Location_Shape + a sh:NodeShape ; + sh:name "Location"@en ; + sh:property [ + sh:maxCount 1 ; + sh:nodeKind sh:Literal ; + sh:path dcat:bbox ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:nodeKind sh:Literal ; + sh:path dcat:centroid ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:nodeKind sh:Literal ; + sh:path lcon:geometry ; + sh:severity sh:Violation + ] ; + sh:targetClass dct:Location . 
+ +:PeriodOfTime_Shape + a sh:NodeShape ; + sh:name "PeriodOfTime"@en ; + sh:property [ + sh:maxCount 1 ; + sh:path dcat:endDate ; + sh:severity sh:Violation ; + sh:node :DateOrDateTimeDataType_Shape + ], [ + sh:maxCount 1 ; + sh:path time:hasBeginning ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path time:hasEnd ; + sh:severity sh:Violation + ], [ + sh:maxCount 1 ; + sh:path dcat:startDate ; + sh:severity sh:Violation ; + sh:node :DateOrDateTimeDataType_Shape + ] ; + sh:targetClass dct:PeriodOfTime . + +:Relationship_Shape + a sh:NodeShape ; + sh:name "Relationship"@en ; + sh:property [ + sh:minCount 1 ; + sh:path dct:relation ; + sh:severity sh:Violation + ], [ + sh:minCount 1 ; + sh:path dcat:hadRole ; + sh:severity sh:Violation + ] ; + sh:targetClass dcat:Relationship . + diff --git a/ckanext/dcat/tests/test_shacl.py b/ckanext/dcat/tests/test_shacl.py new file mode 100644 index 00000000..c10e8fe3 --- /dev/null +++ b/ckanext/dcat/tests/test_shacl.py @@ -0,0 +1,155 @@ +import os + +from pyshacl import validate +import pytest + +from ckan.tests.helpers import call_action + +from ckanext.dcat.processors import RDFSerializer + + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1_full.yaml" +) +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", +) +@pytest.mark.ckan_config( + "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" +) +def test_validate_dcat_ap_2(): + + dataset_dict = { + # Core fields + "name": "test-dataset", + "title": "Test DCAT dataset", + "notes": "Lorem ipsum", + "url": "http://example.org/ds1", + "version": "1.0b", + "tags": [{"name": "Tag 1"}, {"name": "Tag 2"}], + # Standard fields + "issued": "2024-05-01", + "modified": "2024-05-05", + "identifier": "xx-some-dataset-id-yy", + "frequency": 
"monthly", + "provenance": "Statement about provenance", + "dcat_type": "test-type", + "version_notes": "Some version notes", + "access_rights": "Statement about access rights", + # List fields (lists) + "alternate_identifier": ["alt-id-1", "alt-id-2"], + "theme": [ + "https://example.org/uri/theme1", + "https://example.org/uri/theme2", + "https://example.org/uri/theme3", + ], + "language": ["en", "ca", "es"], + "documentation": ["https://example.org/some-doc.html"], + "conforms_to": ["Standard 1", "Standard 2"], + "is_referenced_by": [ + "https://doi.org/10.1038/sdata.2018.22", + "test_isreferencedby", + ], + "applicable_legislation": [ + "http://data.europa.eu/eli/reg_impl/2023/138/oj", + "http://data.europa.eu/eli/reg_impl/2023/138/oj_alt", + ], + # Repeating subfields + "contact": [ + {"name": "Contact 1", "email": "contact1@example.org"}, + {"name": "Contact 2", "email": "contact2@example.org"}, + ], + "publisher": [ + { + "name": "Test Publisher", + "email": "publisher@example.org", + "url": "https://example.org", + "type": "public_body", + }, + ], + "temporal_coverage": [ + {"start": "1905-03-01", "end": "2013-01-05"}, + {"start": "2024-04-10", "end": "2024-05-29"}, + ], + "temporal_resolution": ["PT15M", "P1D"], + "spatial_coverage": [ + { + "geom": { + "type": "Polygon", + "coordinates": [ + [ + [11.9936, 54.0486], + [11.9936, 54.2466], + [12.3045, 54.2466], + [12.3045, 54.0486], + [11.9936, 54.0486], + ] + ], + }, + "text": "Tarragona", + "uri": "https://sws.geonames.org/6361390/", + "bbox": { + "type": "Polygon", + "coordinates": [ + [ + [-2.1604, 42.7611], + [-2.0938, 42.7611], + [-2.0938, 42.7931], + [-2.1604, 42.7931], + [-2.1604, 42.7611], + ] + ], + }, + "centroid": {"type": "Point", "coordinates": [1.26639, 41.12386]}, + } + ], + "spatial_resolution_in_meters": [1.5, 2.0], + "resources": [ + { + "name": "Resource 1", + "description": "Some description", + "url": "https://example.com/data.csv", + "format": "CSV", + "availability": 
"http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL", + "compress_format": "http://www.iana.org/assignments/media-types/application/gzip", + "package_format": "http://publications.europa.eu/resource/authority/file-type/TAR", + "size": 12323, + "hash": "4304cf2e751e6053c90b1804c89c0ebb758f395a", + "hash_algorithm": "http://spdx.org/rdf/terms#checksumAlgorithm_sha1", + "status": "http://purl.org/adms/status/Completed", + "access_url": "https://example.com/data.csv", + "download_url": "https://example.com/data.csv", + "issued": "2024-05-01T01:20:33", + "modified": "2024-05-05T09:33:20", + "license": "http://creativecommons.org/licenses/by/3.0/", + "rights": "Some stament about rights", + "language": ["en", "ca", "es"], + "access_services": [ + { + "title": "Access Service 1", + "endpoint_url": [ + "https://example.org/access_service/1", + "https://example.org/access_service/2", + ], + } + ], + } + ], + } + + dataset = call_action("package_create", **dataset_dict) + + s = RDFSerializer() + g = s.g + + s.graph_from_dataset(dataset) + path = os.path.join( + os.path.dirname(__file__), "shacl", "dcat-ap_2.1.1_shacl_shapes.ttl" + ) + r = validate(g, shacl_graph=path) + conforms, results_graph, results_text = r + + assert conforms, results_text diff --git a/dev-requirements.txt b/dev-requirements.txt index 56cdcb47..caf341ac 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,4 +1,5 @@ responses>=0.25.2 +pyshacl mock pytest-ckan pytest-cov From ab3eabe7d329dd5664ebd27d91def73e6d9c09f4 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 18 Jun 2024 14:28:56 +0200 Subject: [PATCH 02/18] Cast file size as an actual Decimal, not a float --- ckanext/dcat/profiles/euro_dcat_ap.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap.py b/ckanext/dcat/profiles/euro_dcat_ap.py index b7e4cae4..b0057110 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap.py +++ 
b/ckanext/dcat/profiles/euro_dcat_ap.py @@ -1,4 +1,5 @@ import json +from decimal import Decimal, DecimalException from rdflib import term, URIRef, BNode, Literal import ckantoolkit as toolkit @@ -545,10 +546,10 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ( distribution, DCAT.byteSize, - Literal(float(resource_dict["size"]), datatype=XSD.decimal), + Literal(Decimal(resource_dict["size"]), datatype=XSD.decimal), ) ) - except (ValueError, TypeError): + except (ValueError, TypeError, DecimalException): g.add((distribution, DCAT.byteSize, Literal(resource_dict["size"]))) # Checksum if resource_dict.get("hash"): From 8ff3c5c4f525efff8aecddf43c9e60d4f5035bf5 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 18 Jun 2024 17:03:37 +0200 Subject: [PATCH 03/18] Only one value of DCAT.temporalResolution allowed in DCAT-AP 2.1.1 --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 22 ++++++++++++------- ckanext/dcat/schemas/dcat_ap_2.1_full.yaml | 2 -- .../tests/test_euro_dcatap_2_profile_parse.py | 13 +++-------- .../test_euro_dcatap_2_profile_serialize.py | 9 ++++++-- ckanext/dcat/tests/test_scheming_support.py | 21 +++++++++--------- ckanext/dcat/tests/test_shacl.py | 4 +--- examples/dataset.rdf | 1 - 7 files changed, 36 insertions(+), 36 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index c1f9274f..ada96d0a 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -31,9 +31,13 @@ def parse_dataset(self, dataset_dict, dataset_ref): # call super method super(EuropeanDCATAP2Profile, self).parse_dataset(dataset_dict, dataset_ref) + # Standard values + value = self._object_value(dataset_ref, DCAT.temporalResolution) + if value: + dataset_dict["extras"].append({"key": "temporal_resolution", "value": value}) + # Lists for key, predicate in ( - ("temporal_resolution", DCAT.temporalResolution), ("is_referenced_by", DCT.isReferencedBy), ("applicable_legislation", 
DCATAP.applicableLegislation), ("hvd_category", DCATAP.hvdCategory), @@ -147,15 +151,17 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): dataset_dict, dataset_ref ) + # Standard values + self._add_triple_from_dict( + dataset_dict, + dataset_ref, + DCAT.temporalResolution, + "temporal_resolution", + _datatype=XSD.duration, + ) + # Lists for key, predicate, fallbacks, type, datatype in ( - ( - "temporal_resolution", - DCAT.temporalResolution, - None, - Literal, - XSD.duration, - ), ("is_referenced_by", DCT.isReferencedBy, None, URIRefOrLiteral, None), ( "applicable_legislation", diff --git a/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml index 459694ca..2761c8a2 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml @@ -141,8 +141,6 @@ dataset_fields: - field_name: temporal_resolution label: Temporal resolution - preset: multiple_text - validators: ignore_missing scheming_multiple_text help_text: Minimum time period resolvable in the dataset. 
- field_name: spatial_coverage diff --git a/ckanext/dcat/tests/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/test_euro_dcatap_2_profile_parse.py index 69ffe7d1..446f69e6 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/test_euro_dcatap_2_profile_parse.py @@ -106,9 +106,7 @@ def test_dataset_all_fields(self): # Dataset extras = self._extras(dataset) - temporal_resolution_list = json.loads(extras['temporal_resolution']) - assert len(temporal_resolution_list) == 1 - assert temporal_resolution in temporal_resolution_list + assert extras['temporal_resolution'] == temporal_resolution spatial_resolution_list = json.loads(extras['spatial_resolution_in_meters']) assert len(spatial_resolution_list) == 1 @@ -329,10 +327,8 @@ def test_temporal_resolution_multiple(self): dataset = URIRef('http://example.org/datasets/1') g.add((dataset, RDF.type, DCAT.Dataset)) - temporal_resolution = 'P1D' + temporal_resolution = 'PT15M' g.add((dataset, DCAT.temporalResolution, Literal(temporal_resolution, datatype=XSD.duration))) - temporal_resolution_2 = 'PT15M' - g.add((dataset, DCAT.temporalResolution, Literal(temporal_resolution_2, datatype=XSD.duration))) p = RDFParser(profiles=DCAT_AP_PROFILES) @@ -342,10 +338,7 @@ def test_temporal_resolution_multiple(self): extras = self._extras(datasets[0]) - temporal_resolution_list = json.loads(extras['temporal_resolution']) - assert len(temporal_resolution_list) == 2 - assert temporal_resolution in temporal_resolution_list - assert temporal_resolution_2 in temporal_resolution_list + assert extras['temporal_resolution'] == temporal_resolution def test_spatial_resolution_in_meters_multiple(self): g = Graph() diff --git a/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py index abf80363..89e2e142 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py +++ 
b/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py @@ -42,7 +42,7 @@ def test_graph_from_dataset(self): 'metadata_created': '2021-06-21T15:21:09.034694', 'metadata_modified': '2021-06-21T15:21:09.075774', 'extras': [ - {'key': 'temporal_resolution', 'value': '[\"PT15M\", \"P1D\"]'}, + {'key': 'temporal_resolution', 'value': 'PT15M'}, {'key': 'spatial_resolution_in_meters', 'value': '[30,20]'}, {'key': 'is_referenced_by', 'value': '[\"https://doi.org/10.1038/sdata.2018.22\", \"test_isreferencedby\"]'}, ] @@ -55,9 +55,14 @@ def test_graph_from_dataset(self): dataset_ref = s.graph_from_dataset(dataset) + # Standard values + assert self._triple( + g, dataset_ref, DCAT.temporalResolution, extras["temporal_resolution"], + data_type=XSD.duration + ) + # List for item in [ - ('temporal_resolution', DCAT.temporalResolution, [Literal, Literal], [XSD.duration, XSD.duration]), ('is_referenced_by', DCT.isReferencedBy, [URIRef, Literal], [None, None]), ]: values = json.loads(extras[item[0]]) diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index d8770e2f..255fc0ee 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -1,5 +1,6 @@ from unittest import mock import json +from decimal import Decimal import pytest from rdflib.namespace import RDF @@ -101,7 +102,7 @@ def test_e2e_ckan_to_dcat(self): {"start": "1905-03-01", "end": "2013-01-05"}, {"start": "2024-04-10", "end": "2024-05-29"}, ], - "temporal_resolution": ["PT15M", "P1D"], + "temporal_resolution": "PT15M", "spatial_coverage": [ { "geom": { @@ -211,6 +212,13 @@ def test_e2e_ckan_to_dcat(self): dataset["modified"], data_type=XSD.date, ) + assert self._triple( + g, + dataset_ref, + DCAT.temporalResolution, + dataset["temporal_resolution"], + data_type=XSD.duration, + ) # List fields @@ -231,10 +239,6 @@ def test_e2e_ckan_to_dcat(self): self._triples_list_values(g, dataset_ref, FOAF.page) == 
dataset["documentation"] ) - assert ( - self._triples_list_values(g, dataset_ref, DCAT.temporalResolution) - == dataset["temporal_resolution"] - ) assert ( self._triples_list_values(g, dataset_ref, DCT.isReferencedBy) == dataset["is_referenced_by"] @@ -402,7 +406,7 @@ def test_e2e_ckan_to_dcat(self): ) assert self._triple( - g, distribution_ref, DCAT.byteSize, float(resource["size"]), XSD.decimal + g, distribution_ref, DCAT.byteSize, Decimal(resource["size"]), XSD.decimal ) # Checksum checksum = self._triple(g, distribution_ref, SPDX.checksum, None)[2] @@ -636,6 +640,7 @@ def test_e2e_dcat_to_ckan(self): assert dataset["issued"] == u"2012-05-10" assert dataset["modified"] == u"2012-05-10T21:04:00" + assert dataset["temporal_resolution"] == "PT15M" # List fields assert sorted(dataset["conforms_to"]) == ["Standard 1", "Standard 2"] @@ -653,10 +658,6 @@ def test_e2e_dcat_to_ckan(self): "http://dataset.info.org/doc1", "http://dataset.info.org/doc2", ] - assert sorted(dataset["temporal_resolution"]) == [ - "P1D", - "PT15M", - ] assert sorted(dataset["spatial_resolution_in_meters"]) == [ 1.5, 2.0, diff --git a/ckanext/dcat/tests/test_shacl.py b/ckanext/dcat/tests/test_shacl.py index c10e8fe3..0c281763 100644 --- a/ckanext/dcat/tests/test_shacl.py +++ b/ckanext/dcat/tests/test_shacl.py @@ -72,9 +72,8 @@ def test_validate_dcat_ap_2(): ], "temporal_coverage": [ {"start": "1905-03-01", "end": "2013-01-05"}, - {"start": "2024-04-10", "end": "2024-05-29"}, ], - "temporal_resolution": ["PT15M", "P1D"], + "temporal_resolution": "PT15M", "spatial_coverage": [ { "geom": { @@ -151,5 +150,4 @@ def test_validate_dcat_ap_2(): ) r = validate(g, shacl_graph=path) conforms, results_graph, results_text = r - assert conforms, results_text diff --git a/examples/dataset.rdf b/examples/dataset.rdf index f7db02db..e9ea7979 100644 --- a/examples/dataset.rdf +++ b/examples/dataset.rdf @@ -64,7 +64,6 @@ PT15M - P1D Point of Contact From 715daa76bff7961e84c3c4b359756a786105d0a0 Mon Sep 17 00:00:00 
2001 From: amercader Date: Tue, 18 Jun 2024 17:22:39 +0200 Subject: [PATCH 04/18] Cast spatial resolution as an actual Decimal, not a float --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index ada96d0a..e676d529 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -1,4 +1,5 @@ import json +from decimal import Decimal from rdflib import URIRef, BNode, Literal from ckanext.dcat.utils import resource_uri @@ -222,7 +223,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ( dataset_ref, DCAT.spatialResolutionInMeters, - Literal(float(value), datatype=XSD.decimal), + Literal(Decimal(value), datatype=XSD.decimal), ) ) except (ValueError, TypeError): From c9dea8dc4af014970bddf8e2aa88dfb00b8b8a7a Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 19 Jun 2024 14:02:23 +0200 Subject: [PATCH 05/18] Only one value of DCAT.spatialResolutionInMeters allowed in DCAT-AP 2.1.1 --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 16 ++++++++---- ckanext/dcat/schemas/dcat_ap_2.1_full.yaml | 2 -- .../tests/test_euro_dcatap_2_profile_parse.py | 4 +-- .../test_euro_dcatap_2_profile_serialize.py | 21 ++++++--------- .../test_euro_dcatap_profile_serialize.py | 3 ++- ckanext/dcat/tests/test_scheming_support.py | 26 +++++++++---------- ckanext/dcat/tests/test_shacl.py | 2 +- examples/dataset.rdf | 1 - 8 files changed, 36 insertions(+), 39 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index e676d529..467f0504 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -1,5 +1,5 @@ import json -from decimal import Decimal +from decimal import Decimal, DecimalException from rdflib import URIRef, BNode, Literal from ckanext.dcat.utils import resource_uri @@ -59,14 +59,20 @@ def parse_dataset(self, 
dataset_dict, dataset_ref): self._add_spatial_to_dict(dataset_dict, key, spatial) # Spatial resolution in meters - spatial_resolution_in_meters = self._object_value_float_list( + spatial_resolution = self._object_value_float_list( dataset_ref, DCAT.spatialResolutionInMeters ) - if spatial_resolution_in_meters: + if spatial_resolution: + # For some reason we incorrectly allowed lists in this property at some point + # keep support for it but default to single value + value = ( + spatial_resolution[0] if len(spatial_resolution) == 1 + else json.dumps(spatial_resolution) + ) dataset_dict["extras"].append( { "key": "spatial_resolution_in_meters", - "value": json.dumps(spatial_resolution_in_meters), + "value": value, } ) @@ -226,7 +232,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): Literal(Decimal(value), datatype=XSD.decimal), ) ) - except (ValueError, TypeError): + except (ValueError, TypeError, DecimalException): self.g.add( (dataset_ref, DCAT.spatialResolutionInMeters, Literal(value)) ) diff --git a/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml index 2761c8a2..051ee4c4 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml @@ -165,8 +165,6 @@ dataset_fields: - field_name: spatial_resolution_in_meters label: Spatial resolution in meters - preset: multiple_text - validators: ignore_missing scheming_multiple_number help_text: Minimum spatial separation resolvable in a dataset, measured in meters. 
- field_name: access_rights diff --git a/ckanext/dcat/tests/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/test_euro_dcatap_2_profile_parse.py index 446f69e6..4ce4a2de 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/test_euro_dcatap_2_profile_parse.py @@ -108,9 +108,7 @@ def test_dataset_all_fields(self): assert extras['temporal_resolution'] == temporal_resolution - spatial_resolution_list = json.loads(extras['spatial_resolution_in_meters']) - assert len(spatial_resolution_list) == 1 - assert spatial_resolution_in_meters in spatial_resolution_list + assert extras['spatial_resolution_in_meters'] == spatial_resolution_in_meters isreferencedby_list = json.loads(extras['is_referenced_by']) assert len(isreferencedby_list) == 1 diff --git a/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py index 89e2e142..b51a394f 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py +++ b/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py @@ -3,6 +3,7 @@ from builtins import str from builtins import object import json +from decimal import Decimal import six import pytest @@ -43,7 +44,7 @@ def test_graph_from_dataset(self): 'metadata_modified': '2021-06-21T15:21:09.075774', 'extras': [ {'key': 'temporal_resolution', 'value': 'PT15M'}, - {'key': 'spatial_resolution_in_meters', 'value': '[30,20]'}, + {'key': 'spatial_resolution_in_meters', 'value': '30'}, {'key': 'is_referenced_by', 'value': '[\"https://doi.org/10.1038/sdata.2018.22\", \"test_isreferencedby\"]'}, ] } @@ -77,11 +78,8 @@ def test_graph_from_dataset(self): assert self._triple(g, dataset_ref, item[1], _type(value), _datatype) # Spatial Resolution in Meters - values = json.loads(extras['spatial_resolution_in_meters']) - assert len([t for t in g.triples((dataset_ref, DCAT.spatialResolutionInMeters, None))]) == len(values) - - for value in values: - assert self._triple(g, 
dataset_ref, DCAT.spatialResolutionInMeters, Literal(float(value), + value = extras['spatial_resolution_in_meters'] + assert self._triple(g, dataset_ref, DCAT.spatialResolutionInMeters, Literal(Decimal(value), datatype=XSD.decimal)) def test_spatial_resolution_in_meters_single_value(self): @@ -109,7 +107,7 @@ def test_spatial_resolution_in_meters_single_value(self): assert len([t for t in g.triples((dataset_ref, DCAT.spatialResolutionInMeters, None))]) == 1 assert self._triple(g, dataset_ref, DCAT.spatialResolutionInMeters, - Literal(float(extras['spatial_resolution_in_meters']), datatype=XSD.decimal)) + Literal(Decimal(extras['spatial_resolution_in_meters']), datatype=XSD.decimal)) def test_spatial_resolution_in_meters_a_value_is_not_a_number(self): @@ -123,7 +121,7 @@ def test_spatial_resolution_in_meters_a_value_is_not_a_number(self): 'metadata_created': '2021-06-21T15:21:09.034694', 'metadata_modified': '2021-06-21T15:21:09.075774', 'extras': [ - {'key': 'spatial_resolution_in_meters', 'value': '[\"foo\",20]'} + {'key': 'spatial_resolution_in_meters', 'value': 'foo'} ] } @@ -134,11 +132,8 @@ def test_spatial_resolution_in_meters_a_value_is_not_a_number(self): dataset_ref = s.graph_from_dataset(dataset) - values = json.loads(extras['spatial_resolution_in_meters']) - assert len([t for t in g.triples((dataset_ref, DCAT.spatialResolutionInMeters, None))]) == len(values) - assert self._triple(g, dataset_ref, DCAT.spatialResolutionInMeters, Literal(values[0])) - assert self._triple(g, dataset_ref, DCAT.spatialResolutionInMeters, - Literal(float(values[1]), datatype=XSD.decimal)) + value = extras['spatial_resolution_in_meters'] + assert self._triple(g, dataset_ref, DCAT.spatialResolutionInMeters, Literal(value)) def test_spatial_resolution_value_is_invalid_json(self): diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py index edec0c5a..ef735563 100644 --- 
a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py +++ b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py @@ -2,6 +2,7 @@ from builtins import object import json import uuid +from decimal import Decimal import pytest @@ -702,7 +703,7 @@ def test_distribution_fields(self): assert self._triple(g, distribution, DCT.modified, resource['modified'], XSD.dateTime) # Numbers - assert self._triple(g, distribution, DCAT.byteSize, float(resource['size']), XSD.decimal) + assert self._triple(g, distribution, DCAT.byteSize, Decimal(resource['size']), XSD.decimal) # Checksum checksum = self._triple(g, distribution, SPDX.checksum, None)[2] diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 255fc0ee..440e4928 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -134,7 +134,7 @@ def test_e2e_ckan_to_dcat(self): "centroid": {"type": "Point", "coordinates": [1.26639, 41.12386]}, } ], - "spatial_resolution_in_meters": [1.5, 2.0], + "spatial_resolution_in_meters": 1.5, "resources": [ { "name": "Resource 1", @@ -196,6 +196,13 @@ def test_e2e_ckan_to_dcat(self): assert self._triple(g, dataset_ref, DCT.type, dataset["dcat_type"]) assert self._triple(g, dataset_ref, ADMS.versionNotes, dataset["version_notes"]) assert self._triple(g, dataset_ref, DCT.accessRights, dataset["access_rights"]) + assert self._triple( + g, + dataset_ref, + DCAT.spatialResolutionInMeters, + dataset["spatial_resolution_in_meters"], + data_type=XSD.decimal, + ) # Dates assert self._triple( @@ -248,13 +255,6 @@ def test_e2e_ckan_to_dcat(self): == dataset["applicable_legislation"] ) - assert ( - self._triples_list_python_values( - g, dataset_ref, DCAT.spatialResolutionInMeters - ) - == dataset["spatial_resolution_in_meters"] - ) - # Repeating subfields contact_details = [t for t in g.triples((dataset_ref, DCAT.contactPoint, None))] @@ -373,7 +373,9 @@ def test_e2e_ckan_to_dcat(self): # 
Resources: standard fields assert self._triple(g, distribution_ref, DCT.rights, resource["rights"]) - assert self._triple(g, distribution_ref, ADMS.status, URIRef(resource["status"])) + assert self._triple( + g, distribution_ref, ADMS.status, URIRef(resource["status"]) + ) assert self._triple( g, distribution_ref, @@ -641,6 +643,7 @@ def test_e2e_dcat_to_ckan(self): assert dataset["issued"] == u"2012-05-10" assert dataset["modified"] == u"2012-05-10T21:04:00" assert dataset["temporal_resolution"] == "PT15M" + assert dataset["spatial_resolution_in_meters"] == "1.5" # List fields assert sorted(dataset["conforms_to"]) == ["Standard 1", "Standard 2"] @@ -658,10 +661,7 @@ def test_e2e_dcat_to_ckan(self): "http://dataset.info.org/doc1", "http://dataset.info.org/doc2", ] - assert sorted(dataset["spatial_resolution_in_meters"]) == [ - 1.5, - 2.0, - ] + assert sorted(dataset["is_referenced_by"]) == [ "https://doi.org/10.1038/sdata.2018.22", "test_isreferencedby", diff --git a/ckanext/dcat/tests/test_shacl.py b/ckanext/dcat/tests/test_shacl.py index 0c281763..caa463e5 100644 --- a/ckanext/dcat/tests/test_shacl.py +++ b/ckanext/dcat/tests/test_shacl.py @@ -105,7 +105,7 @@ def test_validate_dcat_ap_2(): "centroid": {"type": "Point", "coordinates": [1.26639, 41.12386]}, } ], - "spatial_resolution_in_meters": [1.5, 2.0], + "spatial_resolution_in_meters": 1.5, "resources": [ { "name": "Resource 1", diff --git a/examples/dataset.rdf b/examples/dataset.rdf index e9ea7979..b2f925c8 100644 --- a/examples/dataset.rdf +++ b/examples/dataset.rdf @@ -38,7 +38,6 @@ 1.5 - 2.0 public From 83d1b6582870937fc5258d4977dcdc3c9332e4e7 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 3 Jul 2024 15:52:07 +0200 Subject: [PATCH 06/18] Output geometries as WKT only by default The Shacl validation expects only one item of LOCN.geometry, DCAT.bbox or DCAT.centroid. Up until now we were adding two triples, one for GeoJSON and one for WKT.
We'll default to WKT from now on as this is what GeoDCAT-AP requires (or GML yuk...), but sites that for some reason require GeoJSON (or both) can use the `ckanext.dcat.output_spatial_format` to choose which format to use. --- ckanext/dcat/profiles/base.py | 53 +++++--- .../test_euro_dcatap_2_profile_serialize.py | 113 ++++++++++++++---- .../test_euro_dcatap_profile_serialize.py | 15 +-- ckanext/dcat/tests/test_scheming_support.py | 10 +- 4 files changed, 127 insertions(+), 64 deletions(-) diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index d1ff561b..0b307bdd 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -7,7 +7,7 @@ from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS from geomet import wkt, InvalidGeoJSONException -from ckantoolkit import config, url_for, asbool, get_action, ObjectNotFound +from ckantoolkit import config, url_for, asbool, aslist, get_action, ObjectNotFound from ckan.model.license import LicenseRegister from ckan.lib.helpers import resource_formats from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS @@ -46,6 +46,8 @@ GEOJSON_IMT = "https://www.iana.org/assignments/media-types/application/vnd.geo+json" +DEFAULT_SPATIAL_FORMATS = ["wkt"] + ROOT_DATASET_FIELDS = [ 'name', 'title', @@ -728,26 +730,41 @@ def _read_list_value(self, value): def _add_spatial_value_to_graph(self, spatial_ref, predicate, value): """ - Adds spatial triples to the graph. + Adds spatial triples to the graph. Assumes that value is a GeoJSON string + or object. 
""" - # GeoJSON - self.g.add((spatial_ref, predicate, Literal(value, datatype=GEOJSON_IMT))) - # WKT, because GeoDCAT-AP says so - try: - if isinstance(value, str): + spatial_formats = aslist( + config.get( + "ckanext.dcat.output_spatial_format", DEFAULT_SPATIAL_FORMATS + ) + ) + + if isinstance(value, str): + try: value = json.loads(value) - self.g.add( - ( - spatial_ref, - predicate, - Literal( - wkt.dumps(value, decimals=4), - datatype=GSP.wktLiteral, - ), + except (TypeError, ValueError): + return + + if "wkt" in spatial_formats: + # WKT, because GeoDCAT-AP says so + try: + self.g.add( + ( + spatial_ref, + predicate, + Literal( + wkt.dumps(value, decimals=4), + datatype=GSP.wktLiteral, + ), + ) ) - ) - except (TypeError, ValueError, InvalidGeoJSONException) as e: - pass + except (TypeError, ValueError, InvalidGeoJSONException): + pass + + if "geojson" in spatial_formats: + # GeoJSON + self.g.add((spatial_ref, predicate, Literal(json.dumps(value), datatype=GEOJSON_IMT))) + def _add_spatial_to_dict(self, dataset_dict, key, spatial): if spatial.get(key): diff --git a/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py index b51a394f..1aeec82f 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py +++ b/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py @@ -188,13 +188,89 @@ def test_spatial(self): assert self._triple(g, spatial, RDF.type, DCT.Location) assert self._triple(g, spatial, SKOS.prefLabel, extras['spatial_text']) + assert len([t for t in g.triples((spatial, LOCN.geometry, None))]) == 1 + assert len([t for t in g.triples((spatial, DCAT.bbox, None))]) == 1 + assert len([t for t in g.triples((spatial, DCAT.centroid, None))]) == 1 + + # Geometry in WKT + wkt_geom = wkt.dumps(json.loads(extras['spatial']), decimals=4) + assert self._triple(g, spatial, LOCN.geometry, wkt_geom, GSP.wktLiteral) + wkt_bbox = wkt.dumps(json.loads(extras['spatial_bbox']), decimals=4) + 
assert self._triple(g, spatial, DCAT.bbox, wkt_bbox, GSP.wktLiteral) + wkt_cent = wkt.dumps(json.loads(extras['spatial_centroid']), decimals=4) + assert self._triple(g, spatial, DCAT.centroid, wkt_cent, GSP.wktLiteral) + + @pytest.mark.ckan_config("ckanext.dcat.output_spatial_format", "geojson") + def test_spatial_geojson(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'extras': [ + {'key': 'spatial_uri', 'value': 'http://sws.geonames.org/6361390/'}, + {'key': 'spatial_text', 'value': 'Tarragona'}, + {'key': 'spatial', 'value': '{"type": "Polygon", "coordinates": [[[1.1870606,41.0786393],[1.1870606,41.1655218],[1.3752339,41.1655218],[1.3752339,41.0786393],[1.1870606,41.0786393]]]}'}, + {'key': 'spatial_bbox', 'value': '{"type": "Polygon", "coordinates": [[[2.1870606,42.0786393],[2.1870606,42.1655218],[2.3752339,42.1655218],[2.3752339,42.0786393],[2.1870606,42.0786393]]]}'}, + {'key': 'spatial_centroid', 'value': '{"type": "Point", "coordinates": [2.28114725,42.12208055]}'}, + + ] + } + extras = self._extras(dataset) + + s = RDFSerializer(profiles=DCAT_AP_PROFILES) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + spatial = self._triple(g, dataset_ref, DCT.spatial, None)[2] + assert spatial + assert str(spatial) == extras['spatial_uri'] + assert self._triple(g, spatial, RDF.type, DCT.Location) + assert self._triple(g, spatial, SKOS.prefLabel, extras['spatial_text']) + + assert len([t for t in g.triples((spatial, LOCN.geometry, None))]) == 1 + assert len([t for t in g.triples((spatial, DCAT.bbox, None))]) == 1 + assert len([t for t in g.triples((spatial, DCAT.centroid, None))]) == 1 + + # Geometry in GeoJSON (load and dump to match the formatting) + assert self._triple(g, spatial, LOCN.geometry, json.dumps(json.loads(extras['spatial'])), GEOJSON_IMT) + assert self._triple(g, spatial, DCAT.bbox, json.dumps(json.loads(extras['spatial_bbox'])), GEOJSON_IMT) + assert self._triple(g, spatial, DCAT.centroid, 
json.dumps(json.loads(extras['spatial_centroid'])), GEOJSON_IMT) + + @pytest.mark.ckan_config("ckanext.dcat.output_spatial_format", "wkt geojson") + def test_spatial_two_formats_legacy(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'extras': [ + {'key': 'spatial_uri', 'value': 'http://sws.geonames.org/6361390/'}, + {'key': 'spatial_text', 'value': 'Tarragona'}, + {'key': 'spatial', 'value': '{"type": "Polygon", "coordinates": [[[1.1870606,41.0786393],[1.1870606,41.1655218],[1.3752339,41.1655218],[1.3752339,41.0786393],[1.1870606,41.0786393]]]}'}, + {'key': 'spatial_bbox', 'value': '{"type": "Polygon", "coordinates": [[[2.1870606,42.0786393],[2.1870606,42.1655218],[2.3752339,42.1655218],[2.3752339,42.0786393],[2.1870606,42.0786393]]]}'}, + {'key': 'spatial_centroid', 'value': '{"type": "Point", "coordinates": [2.28114725,42.12208055]}'}, + + ] + } + extras = self._extras(dataset) + + s = RDFSerializer(profiles=DCAT_AP_PROFILES) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + spatial = self._triple(g, dataset_ref, DCT.spatial, None)[2] + assert spatial + assert str(spatial) == extras['spatial_uri'] + assert self._triple(g, spatial, RDF.type, DCT.Location) + assert self._triple(g, spatial, SKOS.prefLabel, extras['spatial_text']) + assert len([t for t in g.triples((spatial, LOCN.geometry, None))]) == 2 assert len([t for t in g.triples((spatial, DCAT.bbox, None))]) == 2 assert len([t for t in g.triples((spatial, DCAT.centroid, None))]) == 2 - # Geometry in GeoJSON - assert self._triple(g, spatial, LOCN.geometry, extras['spatial'], GEOJSON_IMT) - assert self._triple(g, spatial, DCAT.bbox, extras['spatial_bbox'], GEOJSON_IMT) - assert self._triple(g, spatial, DCAT.centroid, extras['spatial_centroid'], GEOJSON_IMT) + + # Geometry in GeoJSON (load and dump to match the formatting) + assert self._triple(g, spatial, LOCN.geometry, json.dumps(json.loads(extras['spatial'])), GEOJSON_IMT) + assert self._triple(g, 
spatial, DCAT.bbox, json.dumps(json.loads(extras['spatial_bbox'])), GEOJSON_IMT) + assert self._triple(g, spatial, DCAT.centroid, json.dumps(json.loads(extras['spatial_centroid'])), GEOJSON_IMT) # Geometry in WKT wkt_geom = wkt.dumps(json.loads(extras['spatial']), decimals=4) @@ -204,7 +280,7 @@ def test_spatial(self): wkt_cent = wkt.dumps(json.loads(extras['spatial_centroid']), decimals=4) assert self._triple(g, spatial, DCAT.centroid, wkt_cent, GSP.wktLiteral) - def test_spatial_bad_geojson_no_wkt(self): + def test_spatial_bad_geojson_no_location(self): dataset = { 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', 'name': 'test-dataset', @@ -215,7 +291,6 @@ def test_spatial_bad_geojson_no_wkt(self): ] } - extras = self._extras(dataset) s = RDFSerializer(profiles=DCAT_AP_PROFILES) g = s.g @@ -225,18 +300,13 @@ def test_spatial_bad_geojson_no_wkt(self): spatial = self._triple(g, dataset_ref, DCT.spatial, None)[2] assert spatial assert isinstance(spatial, BNode) - # Geometry in GeoJSON - assert self._triple(g, spatial, LOCN.geometry, extras['spatial'], GEOJSON_IMT) - assert self._triple(g, spatial, LOCN.geometry, extras['spatial_bbox'], GEOJSON_IMT) - assert self._triple(g, spatial, LOCN.geometry, extras['spatial_centroid'], GEOJSON_IMT) - # Geometry in WKT - assert len([t for t in g.triples((spatial, LOCN.geometry, None))]) == 1 - assert len([t for t in g.triples((spatial, DCAT.bbox, None))]) == 1 - assert len([t for t in g.triples((spatial, DCAT.centroid, None))]) == 1 + assert len([t for t in g.triples((spatial, LOCN.geometry, None))]) == 0 + assert len([t for t in g.triples((spatial, DCAT.bbox, None))]) == 0 + assert len([t for t in g.triples((spatial, DCAT.centroid, None))]) == 0 - def test_spatial_bad_json_no_wkt(self): + def test_spatial_bad_json_no_location(self): dataset = { 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', 'name': 'test-dataset', @@ -247,7 +317,6 @@ def test_spatial_bad_json_no_wkt(self): ] } - extras = self._extras(dataset) s = 
RDFSerializer(profiles=DCAT_AP_PROFILES) g = s.g @@ -257,16 +326,10 @@ def test_spatial_bad_json_no_wkt(self): spatial = self._triple(g, dataset_ref, DCT.spatial, None)[2] assert spatial assert isinstance(spatial, BNode) - # Geometry in GeoJSON - assert self._triple(g, spatial, LOCN.geometry, extras['spatial'], GEOJSON_IMT) - assert self._triple(g, spatial, LOCN.geometry, extras['spatial_bbox'], GEOJSON_IMT) - assert self._triple(g, spatial, LOCN.geometry, extras['spatial_centroid'], GEOJSON_IMT) - # No Geometry in WKT, only one single triple for GeoJSON - assert len([t for t in g.triples((spatial, LOCN.geometry, None))]) == 1 - # Always only one single triple - assert len([t for t in g.triples((spatial, DCAT.bbox, None))]) == 1 - assert len([t for t in g.triples((spatial, DCAT.centroid, None))]) == 1 + assert len([t for t in g.triples((spatial, LOCN.geometry, None))]) == 0 + assert len([t for t in g.triples((spatial, DCAT.bbox, None))]) == 0 + assert len([t for t in g.triples((spatial, DCAT.centroid, None))]) == 0 def test_temporal(self): """ diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py index ef735563..73229253 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py +++ b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py @@ -531,9 +531,7 @@ def test_spatial(self): assert self._triple(g, spatial, RDF.type, DCT.Location) assert self._triple(g, spatial, SKOS.prefLabel, extras['spatial_text']) - assert len([t for t in g.triples((spatial, LOCN.geometry, None))]) == 2 - # Geometry in GeoJSON - assert self._triple(g, spatial, LOCN.geometry, extras['spatial'], GEOJSON_IMT) + assert len([t for t in g.triples((spatial, LOCN.geometry, None))]) == 1 # Geometry in WKT wkt_geom = wkt.dumps(json.loads(extras['spatial']), decimals=4) @@ -558,11 +556,7 @@ def test_spatial_bad_geojson_no_wkt(self): spatial = self._triple(g, dataset_ref, DCT.spatial, None)[2] assert spatial 
assert isinstance(spatial, BNode) - # Geometry in GeoJSON - assert self._triple(g, spatial, LOCN.geometry, extras['spatial'], GEOJSON_IMT) - - # Geometry in WKT - assert len([t for t in g.triples((spatial, LOCN.geometry, None))]) == 1 + assert len([t for t in g.triples((spatial, LOCN.geometry, None))]) == 0 def test_spatial_bad_json_no_wkt(self): dataset = { @@ -583,11 +577,8 @@ def test_spatial_bad_json_no_wkt(self): spatial = self._triple(g, dataset_ref, DCT.spatial, None)[2] assert spatial assert isinstance(spatial, BNode) - # Geometry in GeoJSON - assert self._triple(g, spatial, LOCN.geometry, extras['spatial'], GEOJSON_IMT) - # Geometry in WKT - assert len([t for t in g.triples((spatial, LOCN.geometry, None))]) == 1 + assert len([t for t in g.triples((spatial, LOCN.geometry, None))]) == 0 def test_distributions(self): diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index d4f6eaee..ce452b53 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -344,15 +344,7 @@ def test_e2e_ckan_to_dcat(self): g, spatial[0][2], SKOS.prefLabel, dataset["spatial_coverage"][0]["text"] ) - assert len([t for t in g.triples((spatial[0][2], LOCN.geometry, None))]) == 2 - # Geometry in GeoJSON - assert self._triple( - g, - spatial[0][2], - LOCN.geometry, - dataset["spatial_coverage"][0]["geom"], - GEOJSON_IMT, - ) + assert len([t for t in g.triples((spatial[0][2], LOCN.geometry, None))]) == 1 # Geometry in WKT wkt_geom = wkt.dumps(dataset["spatial_coverage"][0]["geom"], decimals=4) assert self._triple(g, spatial[0][2], LOCN.geometry, wkt_geom, GSP.wktLiteral) From 79ece94c7f220824100b1c9b0c4307636a8a5e5d Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 3 Jul 2024 16:16:23 +0200 Subject: [PATCH 07/18] Use correct namespace for start and end date --- ckanext/dcat/profiles/euro_dcat_ap_scheming.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index 12eb540e..5fdd4ced 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -180,9 +180,9 @@ def _not_empty_dict(data_dict): temporal_ref = BNode() self.g.add((temporal_ref, RDF.type, DCT.PeriodOfTime)) if item.get("start"): - self._add_date_triple(temporal_ref, SCHEMA.startDate, item["start"]) + self._add_date_triple(temporal_ref, DCAT.startDate, item["start"]) if item.get("end"): - self._add_date_triple(temporal_ref, SCHEMA.endDate, item["end"]) + self._add_date_triple(temporal_ref, DCAT.endDate, item["end"]) self.g.add((dataset_ref, DCT.temporal, temporal_ref)) spatial = dataset_dict.get("spatial_coverage") From 58c24fb236b955f6e3e121b2b5a300e49f881f6d Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 3 Jul 2024 16:21:35 +0200 Subject: [PATCH 08/18] Add recommended properties to access service schema --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 2 +- ckanext/dcat/schemas/dcat_ap_2.1_full.yaml | 8 ++++++++ ckanext/dcat/tests/test_shacl.py | 9 ++++++++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index 467f0504..f31ff5b4 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -291,7 +291,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ("license", DCT.license, None, URIRefOrLiteral), ("access_rights", DCT.accessRights, None, URIRefOrLiteral), ("title", DCT.title, None, Literal), - ("endpoint_description", DCAT.endpointDescription, None, Literal), + ("endpoint_description", DCAT.endpointDescription, None, URIRefOrLiteral), ("description", DCT.description, None, Literal), ] diff --git a/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml index 4f9e3e09..94998ea0 100644 --- 
a/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml @@ -364,9 +364,17 @@ resource_fields: - field_name: title label: Title + - field_name: endpoint_description + label: Endpoint description + - field_name: endpoint_url label: Endpoint URL preset: multiple_text + + - field_name: serves_dataset + label: Serves dataset + preset: multiple_text + help_text: A data service that gives access to the resource. # Note: if not provided, this will be autogenerated diff --git a/ckanext/dcat/tests/test_shacl.py b/ckanext/dcat/tests/test_shacl.py index caa463e5..d4d31ef5 100644 --- a/ckanext/dcat/tests/test_shacl.py +++ b/ckanext/dcat/tests/test_shacl.py @@ -129,10 +129,15 @@ def test_validate_dcat_ap_2(): "access_services": [ { "title": "Access Service 1", + "endpoint_description": "https://example.org/endpoint_description", "endpoint_url": [ "https://example.org/access_service/1", "https://example.org/access_service/2", ], + "serves_dataset": [ + "https://example.org/dataset/1", + "https://example.org/dataset/2", + ] } ], } @@ -146,7 +151,9 @@ def test_validate_dcat_ap_2(): s.graph_from_dataset(dataset) path = os.path.join( - os.path.dirname(__file__), "shacl", "dcat-ap_2.1.1_shacl_shapes.ttl" + #os.path.dirname(__file__), "shacl", "dcat-ap_2.1.1_shacl_range.ttl" + + os.path.dirname(__file__), "shacl", "dcat-ap_2.1.1_shacl_shapes_recommended.ttl" ) r = validate(g, shacl_graph=path) conforms, results_graph, results_text = r From bd3b64ce4e5f77fd4dd5e1a269a7496fdee2618e Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 4 Jul 2024 11:10:45 +0200 Subject: [PATCH 09/18] Fix scheming tests --- ckanext/dcat/schemas/dcat_ap_2.1_full.yaml | 1 + ckanext/dcat/tests/test_scheming_support.py | 19 +++++++++---------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml index 94998ea0..8f9f4afc 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml 
+++ b/ckanext/dcat/schemas/dcat_ap_2.1_full.yaml @@ -374,6 +374,7 @@ resource_fields: - field_name: serves_dataset label: Serves dataset preset: multiple_text + validators: ignore_missing scheming_multiple_text help_text: A data service that gives access to the resource. diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index ce452b53..4eedf341 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -20,7 +20,6 @@ XSD, VCARD, FOAF, - SCHEMA, SKOS, LOCN, GSP, @@ -310,28 +309,28 @@ def test_e2e_ckan_to_dcat(self): assert self._triple( g, temporal[0][2], - SCHEMA.startDate, + DCAT.startDate, dataset_dict["temporal_coverage"][0]["start"], data_type=XSD.date, ) assert self._triple( g, temporal[0][2], - SCHEMA.endDate, + DCAT.endDate, dataset_dict["temporal_coverage"][0]["end"], data_type=XSD.date, ) assert self._triple( g, temporal[1][2], - SCHEMA.startDate, + DCAT.startDate, dataset_dict["temporal_coverage"][1]["start"], data_type=XSD.date, ) assert self._triple( g, temporal[1][2], - SCHEMA.endDate, + DCAT.endDate, dataset_dict["temporal_coverage"][1]["end"], data_type=XSD.date, ) @@ -617,7 +616,7 @@ def test_dcat_date(self): assert self._triple( g, temporal[0][2], - SCHEMA.endDate, + DCAT.endDate, dataset_dict["temporal_coverage"][0]["end"], data_type=XSD.date, ) @@ -630,7 +629,7 @@ def test_dcat_date(self): assert self._triple( g, temporal[0][2], - SCHEMA.startDate, + DCAT.startDate, dataset_dict["temporal_coverage"][0]["start"], data_type=XSD.dateTime, ) @@ -642,7 +641,7 @@ def test_dcat_date(self): assert self._triple( g, temporal[1][2], - SCHEMA.startDate, + DCAT.startDate, dataset_dict["temporal_coverage"][1]["start"], data_type=XSD.dateTime, ) @@ -655,7 +654,7 @@ def test_dcat_date(self): assert self._triple( g, temporal[2][2], - SCHEMA.startDate, + DCAT.startDate, "2024-11-24T00:00:00", data_type=XSD.dateTime, ) @@ -666,7 +665,7 @@ def 
test_dcat_date(self): assert self._triple( g, temporal[2][2], - SCHEMA.endDate, + DCAT.endDate, "2012-06-12T00:00:00", data_type=XSD.dateTime, ) From 3494963a8afb1fee656622cd0c649af9b87c7439 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 4 Jul 2024 11:12:01 +0200 Subject: [PATCH 10/18] Refactor tests, add shapes recommended check --- ...dcat-ap_2.1.1_shacl_shapes_recommended.ttl | 335 ++++++++++++++++++ ckanext/dcat/tests/test_shacl.py | 303 +++++++++------- 2 files changed, 505 insertions(+), 133 deletions(-) create mode 100644 ckanext/dcat/tests/shacl/dcat-ap_2.1.1_shacl_shapes_recommended.ttl diff --git a/ckanext/dcat/tests/shacl/dcat-ap_2.1.1_shacl_shapes_recommended.ttl b/ckanext/dcat/tests/shacl/dcat-ap_2.1.1_shacl_shapes_recommended.ttl new file mode 100644 index 00000000..13630a19 --- /dev/null +++ b/ckanext/dcat/tests/shacl/dcat-ap_2.1.1_shacl_shapes_recommended.ttl @@ -0,0 +1,335 @@ +@prefix rdf: . +@prefix : . +@prefix adms: . +@prefix cc: . +@prefix dc: . +@prefix dcat: . +@prefix dct: . +@prefix foaf: . +@prefix lcon: . +@prefix org: . +@prefix owl: . +@prefix odrl: . +@prefix prov: . +@prefix rdfs: . +@prefix schema: . +@prefix sh: . +@prefix skos: . +@prefix spdx: . +@prefix time: . +@prefix vcard: . +@prefix xsd: . +@prefix dcatap: . 
+ + + dcatap:availability ; + dct:format ; + dct:conformsTo ; + dct:creator [ + rdfs:seeAlso ; + org:memberOf ; + foaf:homepage ; + foaf:name "Bert Van Nuffelen" + ], [ + rdfs:seeAlso ; + org:memberOf ; + foaf:homepage ; + foaf:name "Natasa Sofou" + ], [ + rdfs:seeAlso ; + org:memberOf ; + foaf:homepage ; + foaf:name "Eugeniu Costetchi" + ], [ + rdfs:seeAlso ; + org:memberOf ; + foaf:homepage ; + foaf:name "Makx Dekkers" + ], [ + rdfs:seeAlso ; + org:memberOf ; + foaf:homepage ; + foaf:name "Nikolaos Loutas" + ], [ + rdfs:seeAlso ; + org:memberOf ; + foaf:homepage ; + foaf:name "Vassilios Peristeras" + ] ; + dct:license ; + cc:attributionURL ; + dct:publisher ; + dct:description "This document specifies the constraints on properties and classes expressed by DCAT-AP in SHACL."@en ; + dct:title "The constraints of DCAT Application Profile for Data Portals in Europe"@en ; + owl:versionInfo "2.1.1" ; + foaf:maker [ + foaf:mbox ; + foaf:name "DCAT-AP Working Group" ; + foaf:page , + ] . + + + +#------------------------------------------------------------------------- +# The shapes in this file cover all recommendations in DCAT-AP 2.1.1. +# +# +#------------------------------------------------------------------------- + +:Agent_Shape + a sh:NodeShape ; + sh:name "Agent"@en ; + sh:property [ + sh:minCount 1 ; + sh:path dct:type ; + sh:severity sh:Warning + ] ; + sh:targetClass foaf:Agent . + +:CatalogRecord_Shape + a sh:NodeShape ; + sh:name "Catalog Record"@en ; + sh:property [ + sh:minCount 1 ; + sh:path dct:conformsTo ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dct:issued ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path adms:status ; + sh:severity sh:Warning + ] ; + sh:targetClass dcat:CatalogRecord . 
+ + +:Catalog_Shape + a sh:NodeShape ; + sh:name "Catalog"@en ; + sh:property [ + sh:minCount 1 ; + sh:path dct:language ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dct:issued ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dct:license; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dct:spatial ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dct:modified ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dcat:themeTaxonomy ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path foaf:homepage ; + sh:severity sh:Warning + ] ; + sh:targetClass dcat:Catalog . + +:Catalog_Shape2 + a sh:NodeShape ; + sh:name "Catalog"@en ; + sh:or ( + [ + sh:path dcat:dataset ; + sh:minCount 1 ; + ] + [ + sh:path dcat:service ; + sh:minCount 1 ; + ] + ) ; + sh:severity sh:Warning; + sh:targetClass dcat:Catalog . + + +# +# Outcommented because no constraint is present +# and according to the W3C spec sh:property expects at least a sh:path +# + +#:CategoryScheme_Shape +# a sh:NodeShape ; +# sh:name "Category Scheme"@en ; +# sh:property [ +# ] ; +# sh:targetClass skos:ConceptScheme . +# +#:Category_Shape +# a sh:NodeShape ; +# sh:name "Category"@en ; +# sh:property [ +# ] ; +# sh:targetClass skos:Concept . +# +#:Checksum_Shape +# a sh:NodeShape ; +# sh:name "Checksum"@en ; +# sh:property [ +# ] ; +# sh:targetClass spdx:Checksum . + +:DataService_Shape + a sh:NodeShape ; + sh:name "Data Service"@en ; + sh:property [ + sh:minCount 1 ; + sh:path dcat:servesDataset ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dcat:endpointDescription ; + sh:severity sh:Warning + ] ; + sh:targetClass dcat:DataService . 
+ +:Dataset_Shape + a sh:NodeShape ; + sh:name "Dataset"@en ; + sh:property [ + sh:minCount 1 ; + sh:path dcat:contactPoint ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dcat:distribution ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dcat:keyword ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dct:publisher ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dct:spatial ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dct:temporal ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dcat:theme ; + sh:severity sh:Warning + ] ; + sh:targetClass dcat:Dataset . + +:DateOrDateTimeDataType_Shape + a sh:NodeShape ; + rdfs:comment "Date time date disjunction shape checks that a datatype property receives a date or a dateTime literal" ; + rdfs:label "Date time date disjunction" ; + sh:message "The values must be data typed as either xsd:date or xsd:dateTime" ; + sh:or ([ + sh:datatype xsd:date + ] + [ + sh:datatype xsd:dateTime + ] + ) . + +:DcatResource_Shape + a sh:NodeShape ; + rdfs:comment "the union of Catalog, Dataset and DataService" ; + rdfs:label "dcat:Resource" ; + sh:message "The node is either a Catalog, Dataset or a DataService" ; + sh:or ([ + sh:class dcat:Catalog + ] + [ + sh:class dcat:Dataset + ] + [ + sh:class dcat:DataService + ] + ) . + +:Distribution_Shape + a sh:NodeShape ; + sh:name "Distribution"@en ; + sh:property [ + sh:minCount 1 ; + sh:path dct:description ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dcatap:availability ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dct:format ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dct:license ; + sh:severity sh:Warning + ] ; + sh:targetClass dcat:Distribution . + +#:Identifier_Shape +# a sh:NodeShape ; +# sh:name "Identifier"@en ; +# sh:property [ +# ] ; +# sh:targetClass adms:Identifier . 
+ +:LicenceDocument_Shape + a sh:NodeShape ; + sh:name "Licence Document"@en ; + sh:property [ + sh:minCount 1 ; + sh:path dct:type ; + sh:severity sh:Warning + ] ; + sh:targetClass dct:LicenseDocument . + +:Location_Shape + a sh:NodeShape ; + sh:name "Location"@en ; + sh:property [ + sh:minCount 1 ; + sh:path dcat:bbox ; + sh:severity sh:Warning + ], [ + sh:minCount 1 ; + sh:path dcat:centroid ; + sh:severity sh:Warning + ] ; + sh:targetClass dct:Location . + +:PeriodOfTime_Shape + a sh:NodeShape ; + sh:name "PeriodOfTime"@en ; + sh:property [ + sh:minCount 1 ; + sh:path dcat:endDate ; + sh:severity sh:Warning ; + ], [ + sh:minCount 1 ; + sh:path dcat:startDate ; + sh:severity sh:Warning ; + ] ; + sh:targetClass dct:PeriodOfTime . + +#:Relationship_Shape +# a sh:NodeShape ; +# sh:name "Relationship"@en ; +# sh:property [ +# ] ; +# sh:targetClass dcat:Relationship . + diff --git a/ckanext/dcat/tests/test_shacl.py b/ckanext/dcat/tests/test_shacl.py index d4d31ef5..92cc6dc0 100644 --- a/ckanext/dcat/tests/test_shacl.py +++ b/ckanext/dcat/tests/test_shacl.py @@ -8,7 +8,152 @@ from ckanext.dcat.processors import RDFSerializer -@pytest.mark.usefixtures("with_plugins", "clean_db") +dataset_dict = { + # Core fields + "name": "test-dataset", + "title": "Test DCAT dataset", + "notes": "Lorem ipsum", + "url": "http://example.org/ds1", + "version": "1.0b", + "tags": [{"name": "Tag 1"}, {"name": "Tag 2"}], + # Standard fields + "issued": "2024-05-01", + "modified": "2024-05-05", + "identifier": "xx-some-dataset-id-yy", + "frequency": "monthly", + "provenance": "Statement about provenance", + "dcat_type": "test-type", + "version_notes": "Some version notes", + "access_rights": "Statement about access rights", + # List fields (lists) + "alternate_identifier": ["alt-id-1", "alt-id-2"], + "theme": [ + "https://example.org/uri/theme1", + "https://example.org/uri/theme2", + "https://example.org/uri/theme3", + ], + "language": ["en", "ca", "es"], + "documentation": 
["https://example.org/some-doc.html"], + "conforms_to": ["Standard 1", "Standard 2"], + "is_referenced_by": [ + "https://doi.org/10.1038/sdata.2018.22", + "test_isreferencedby", + ], + "applicable_legislation": [ + "http://data.europa.eu/eli/reg_impl/2023/138/oj", + "http://data.europa.eu/eli/reg_impl/2023/138/oj_alt", + ], + # Repeating subfields + "contact": [ + {"name": "Contact 1", "email": "contact1@example.org"}, + {"name": "Contact 2", "email": "contact2@example.org"}, + ], + "publisher": [ + { + "name": "Test Publisher", + "email": "publisher@example.org", + "url": "https://example.org", + "type": "public_body", + }, + ], + "temporal_coverage": [ + {"start": "1905-03-01", "end": "2013-01-05"}, + ], + "temporal_resolution": "PT15M", + "spatial_coverage": [ + { + "geom": { + "type": "Polygon", + "coordinates": [ + [ + [11.9936, 54.0486], + [11.9936, 54.2466], + [12.3045, 54.2466], + [12.3045, 54.0486], + [11.9936, 54.0486], + ] + ], + }, + "text": "Tarragona", + "uri": "https://sws.geonames.org/6361390/", + "bbox": { + "type": "Polygon", + "coordinates": [ + [ + [-2.1604, 42.7611], + [-2.0938, 42.7611], + [-2.0938, 42.7931], + [-2.1604, 42.7931], + [-2.1604, 42.7611], + ] + ], + }, + "centroid": {"type": "Point", "coordinates": [1.26639, 41.12386]}, + } + ], + "spatial_resolution_in_meters": 1.5, + "resources": [ + { + "name": "Resource 1", + "description": "Some description", + "url": "https://example.com/data.csv", + "format": "CSV", + "availability": "http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL", + "compress_format": "http://www.iana.org/assignments/media-types/application/gzip", + "package_format": "http://publications.europa.eu/resource/authority/file-type/TAR", + "size": 12323, + "hash": "4304cf2e751e6053c90b1804c89c0ebb758f395a", + "hash_algorithm": "http://spdx.org/rdf/terms#checksumAlgorithm_sha1", + "status": "http://purl.org/adms/status/Completed", + "access_url": "https://example.com/data.csv", + 
"download_url": "https://example.com/data.csv", + "issued": "2024-05-01T01:20:33", + "modified": "2024-05-05T09:33:20", + "license": "http://creativecommons.org/licenses/by/3.0/", + "rights": "Some stament about rights", + "language": ["en", "ca", "es"], + "access_services": [ + { + "title": "Access Service 1", + "endpoint_description": "https://example.org/endpoint_description", + "endpoint_url": [ + "https://example.org/access_service/1", + "https://example.org/access_service/2", + ], + "serves_dataset": [ + "https://example.org/dataset/1", + "https://example.org/dataset/2", + ], + } + ], + } + ], +} + + +def _get_shacl_file_path(file_name): + return os.path.join(os.path.dirname(__file__), "shacl", file_name) + + +generated_graph = None + + +@pytest.fixture +def graph(): + global generated_graph + + if not generated_graph: + dataset = call_action("package_create", **dataset_dict) + + s = RDFSerializer() + s.graph_from_dataset(dataset) + + generated_graph = s.g + + return generated_graph + + +@pytest.mark.usefixtures("with_plugins") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1_full.yaml" @@ -20,141 +165,33 @@ @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) -def test_validate_dcat_ap_2(): - - dataset_dict = { - # Core fields - "name": "test-dataset", - "title": "Test DCAT dataset", - "notes": "Lorem ipsum", - "url": "http://example.org/ds1", - "version": "1.0b", - "tags": [{"name": "Tag 1"}, {"name": "Tag 2"}], - # Standard fields - "issued": "2024-05-01", - "modified": "2024-05-05", - "identifier": "xx-some-dataset-id-yy", - "frequency": "monthly", - "provenance": "Statement about provenance", - "dcat_type": "test-type", - "version_notes": "Some version notes", - "access_rights": "Statement about access rights", - # List fields (lists) - "alternate_identifier": ["alt-id-1", "alt-id-2"], - "theme": [ - 
"https://example.org/uri/theme1", - "https://example.org/uri/theme2", - "https://example.org/uri/theme3", - ], - "language": ["en", "ca", "es"], - "documentation": ["https://example.org/some-doc.html"], - "conforms_to": ["Standard 1", "Standard 2"], - "is_referenced_by": [ - "https://doi.org/10.1038/sdata.2018.22", - "test_isreferencedby", - ], - "applicable_legislation": [ - "http://data.europa.eu/eli/reg_impl/2023/138/oj", - "http://data.europa.eu/eli/reg_impl/2023/138/oj_alt", - ], - # Repeating subfields - "contact": [ - {"name": "Contact 1", "email": "contact1@example.org"}, - {"name": "Contact 2", "email": "contact2@example.org"}, - ], - "publisher": [ - { - "name": "Test Publisher", - "email": "publisher@example.org", - "url": "https://example.org", - "type": "public_body", - }, - ], - "temporal_coverage": [ - {"start": "1905-03-01", "end": "2013-01-05"}, - ], - "temporal_resolution": "PT15M", - "spatial_coverage": [ - { - "geom": { - "type": "Polygon", - "coordinates": [ - [ - [11.9936, 54.0486], - [11.9936, 54.2466], - [12.3045, 54.2466], - [12.3045, 54.0486], - [11.9936, 54.0486], - ] - ], - }, - "text": "Tarragona", - "uri": "https://sws.geonames.org/6361390/", - "bbox": { - "type": "Polygon", - "coordinates": [ - [ - [-2.1604, 42.7611], - [-2.0938, 42.7611], - [-2.0938, 42.7931], - [-2.1604, 42.7931], - [-2.1604, 42.7611], - ] - ], - }, - "centroid": {"type": "Point", "coordinates": [1.26639, 41.12386]}, - } - ], - "spatial_resolution_in_meters": 1.5, - "resources": [ - { - "name": "Resource 1", - "description": "Some description", - "url": "https://example.com/data.csv", - "format": "CSV", - "availability": "http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL", - "compress_format": "http://www.iana.org/assignments/media-types/application/gzip", - "package_format": "http://publications.europa.eu/resource/authority/file-type/TAR", - "size": 12323, - "hash": "4304cf2e751e6053c90b1804c89c0ebb758f395a", - "hash_algorithm": 
"http://spdx.org/rdf/terms#checksumAlgorithm_sha1", - "status": "http://purl.org/adms/status/Completed", - "access_url": "https://example.com/data.csv", - "download_url": "https://example.com/data.csv", - "issued": "2024-05-01T01:20:33", - "modified": "2024-05-05T09:33:20", - "license": "http://creativecommons.org/licenses/by/3.0/", - "rights": "Some stament about rights", - "language": ["en", "ca", "es"], - "access_services": [ - { - "title": "Access Service 1", - "endpoint_description": "https://example.org/endpoint_description", - "endpoint_url": [ - "https://example.org/access_service/1", - "https://example.org/access_service/2", - ], - "serves_dataset": [ - "https://example.org/dataset/1", - "https://example.org/dataset/2", - ] - } - ], - } - ], - } +def test_validate_dcat_ap_2_graph_shapes(graph): - dataset = call_action("package_create", **dataset_dict) + # dcat-ap_2.1.1_shacl_shapes.ttl: constraints concerning existance, domain and + # literal range, and cardinalities. + path = _get_shacl_file_path("dcat-ap_2.1.1_shacl_shapes.ttl") + r = validate(graph, shacl_graph=path) + conforms, results_graph, results_text = r + assert conforms, results_text - s = RDFSerializer() - g = s.g - s.graph_from_dataset(dataset) - path = os.path.join( - #os.path.dirname(__file__), "shacl", "dcat-ap_2.1.1_shacl_range.ttl" +@pytest.mark.usefixtures("with_plugins") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_2.1_full.yaml" +) +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", +) +@pytest.mark.ckan_config( + "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" +) +def test_validate_dcat_ap_2_graph_shapes_recommended(graph): - os.path.dirname(__file__), "shacl", "dcat-ap_2.1.1_shacl_shapes_recommended.ttl" - ) - r = validate(g, shacl_graph=path) + # dcat-ap_2.1.1_shacl_shapes_recommended.ttl: 
constraints concerning existance + # of recommended properties. + path = _get_shacl_file_path("dcat-ap_2.1.1_shacl_shapes_recommended.ttl") + r = validate(graph, shacl_graph=path) conforms, results_graph, results_text = r assert conforms, results_text From f8ab3bfe471763d1ac0c48fdc073273158b49efa Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 4 Jul 2024 11:21:24 +0200 Subject: [PATCH 11/18] Dataset name --- ckanext/dcat/tests/test_shacl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcat/tests/test_shacl.py b/ckanext/dcat/tests/test_shacl.py index 92cc6dc0..fe1f91d5 100644 --- a/ckanext/dcat/tests/test_shacl.py +++ b/ckanext/dcat/tests/test_shacl.py @@ -10,7 +10,7 @@ dataset_dict = { # Core fields - "name": "test-dataset", + "name": "test-dataset-shacl", "title": "Test DCAT dataset", "notes": "Lorem ipsum", "url": "http://example.org/ds1", From f4e04d58d7a4782818ea261bf055f401660df98a Mon Sep 17 00:00:00 2001 From: amercader Date: Fri, 5 Jul 2024 11:57:20 +0200 Subject: [PATCH 12/18] Use random name --- ckanext/dcat/tests/test_shacl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ckanext/dcat/tests/test_shacl.py b/ckanext/dcat/tests/test_shacl.py index fe1f91d5..6478e8cf 100644 --- a/ckanext/dcat/tests/test_shacl.py +++ b/ckanext/dcat/tests/test_shacl.py @@ -1,4 +1,5 @@ import os +from random import randrange from pyshacl import validate import pytest @@ -143,6 +144,7 @@ def graph(): global generated_graph if not generated_graph: + dataset_dict["name"] += "-" + str(randrange(0, 1000)) dataset = call_action("package_create", **dataset_dict) s = RDFSerializer() From 18274833a1c91e0c71d39e79151e477d8a2b57e8 Mon Sep 17 00:00:00 2001 From: amercader Date: Fri, 5 Jul 2024 12:15:19 +0200 Subject: [PATCH 13/18] Fix 2.9 tests --- .github/workflows/test.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ea46d262..5f1f5d0d 100644 --- 
a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -17,7 +17,7 @@ jobs: needs: lint strategy: matrix: - ckan-version: ["2.11", "2.10", 2.9] + ckan-version: ["2.11", "2.10", "2.9"] fail-fast: false name: CKAN ${{ matrix.ckan-version }} @@ -45,6 +45,9 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Install requirements (2.9) + run: pip install --ignore-installed "packaging>=21.3" + if: ${{ matrix.ckan-version == "2.9" }} - name: Install requirements run: | pip install -r requirements.txt @@ -67,4 +70,3 @@ jobs: ckan -c test.ini db pending-migrations --apply - name: Run tests run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests - From 96e8bf15489fde8bc80f1dda6b324c07409e62f5 Mon Sep 17 00:00:00 2001 From: amercader Date: Fri, 5 Jul 2024 12:19:49 +0200 Subject: [PATCH 14/18] Syntax --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5f1f5d0d..1708ea10 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: - uses: actions/checkout@v4 - name: Install requirements (2.9) run: pip install --ignore-installed "packaging>=21.3" - if: ${{ matrix.ckan-version == "2.9" }} + if: ${{ matrix.ckan-version == '2.9' }} - name: Install requirements run: | pip install -r requirements.txt From a71b5ab11eac7da977c908e868df0896f8f3dccd Mon Sep 17 00:00:00 2001 From: amercader Date: Fri, 12 Jul 2024 15:25:53 +0200 Subject: [PATCH 15/18] Move dataset dict to its own file --- ckanext/dcat/tests/test_shacl.py | 149 ++------------ .../ckan/ckan_full_dataset_dcat_ap_2.json | 185 ++++++++++++++++++ 2 files changed, 202 insertions(+), 132 deletions(-) create mode 100644 examples/ckan/ckan_full_dataset_dcat_ap_2.json diff --git a/ckanext/dcat/tests/test_shacl.py b/ckanext/dcat/tests/test_shacl.py index 6478e8cf..d5506252 100644 ---
a/ckanext/dcat/tests/test_shacl.py +++ b/ckanext/dcat/tests/test_shacl.py @@ -1,3 +1,4 @@ +import json import os from random import randrange @@ -7,152 +8,32 @@ from ckan.tests.helpers import call_action from ckanext.dcat.processors import RDFSerializer - - -dataset_dict = { - # Core fields - "name": "test-dataset-shacl", - "title": "Test DCAT dataset", - "notes": "Lorem ipsum", - "url": "http://example.org/ds1", - "version": "1.0b", - "tags": [{"name": "Tag 1"}, {"name": "Tag 2"}], - # Standard fields - "issued": "2024-05-01", - "modified": "2024-05-05", - "identifier": "xx-some-dataset-id-yy", - "frequency": "monthly", - "provenance": "Statement about provenance", - "dcat_type": "test-type", - "version_notes": "Some version notes", - "access_rights": "Statement about access rights", - # List fields (lists) - "alternate_identifier": ["alt-id-1", "alt-id-2"], - "theme": [ - "https://example.org/uri/theme1", - "https://example.org/uri/theme2", - "https://example.org/uri/theme3", - ], - "language": ["en", "ca", "es"], - "documentation": ["https://example.org/some-doc.html"], - "conforms_to": ["Standard 1", "Standard 2"], - "is_referenced_by": [ - "https://doi.org/10.1038/sdata.2018.22", - "test_isreferencedby", - ], - "applicable_legislation": [ - "http://data.europa.eu/eli/reg_impl/2023/138/oj", - "http://data.europa.eu/eli/reg_impl/2023/138/oj_alt", - ], - # Repeating subfields - "contact": [ - {"name": "Contact 1", "email": "contact1@example.org"}, - {"name": "Contact 2", "email": "contact2@example.org"}, - ], - "publisher": [ - { - "name": "Test Publisher", - "email": "publisher@example.org", - "url": "https://example.org", - "type": "public_body", - }, - ], - "temporal_coverage": [ - {"start": "1905-03-01", "end": "2013-01-05"}, - ], - "temporal_resolution": "PT15M", - "spatial_coverage": [ - { - "geom": { - "type": "Polygon", - "coordinates": [ - [ - [11.9936, 54.0486], - [11.9936, 54.2466], - [12.3045, 54.2466], - [12.3045, 54.0486], - [11.9936, 54.0486], - ] 
- ], - }, - "text": "Tarragona", - "uri": "https://sws.geonames.org/6361390/", - "bbox": { - "type": "Polygon", - "coordinates": [ - [ - [-2.1604, 42.7611], - [-2.0938, 42.7611], - [-2.0938, 42.7931], - [-2.1604, 42.7931], - [-2.1604, 42.7611], - ] - ], - }, - "centroid": {"type": "Point", "coordinates": [1.26639, 41.12386]}, - } - ], - "spatial_resolution_in_meters": 1.5, - "resources": [ - { - "name": "Resource 1", - "description": "Some description", - "url": "https://example.com/data.csv", - "format": "CSV", - "availability": "http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL", - "compress_format": "http://www.iana.org/assignments/media-types/application/gzip", - "package_format": "http://publications.europa.eu/resource/authority/file-type/TAR", - "size": 12323, - "hash": "4304cf2e751e6053c90b1804c89c0ebb758f395a", - "hash_algorithm": "http://spdx.org/rdf/terms#checksumAlgorithm_sha1", - "status": "http://purl.org/adms/status/Completed", - "access_url": "https://example.com/data.csv", - "download_url": "https://example.com/data.csv", - "issued": "2024-05-01T01:20:33", - "modified": "2024-05-05T09:33:20", - "license": "http://creativecommons.org/licenses/by/3.0/", - "rights": "Some stament about rights", - "language": ["en", "ca", "es"], - "access_services": [ - { - "title": "Access Service 1", - "endpoint_description": "https://example.org/endpoint_description", - "endpoint_url": [ - "https://example.org/access_service/1", - "https://example.org/access_service/2", - ], - "serves_dataset": [ - "https://example.org/dataset/1", - "https://example.org/dataset/2", - ], - } - ], - } - ], -} +from ckanext.dcat.tests.utils import get_file_contents def _get_shacl_file_path(file_name): return os.path.join(os.path.dirname(__file__), "shacl", file_name) -generated_graph = None +generated_graphs = {} -@pytest.fixture -def graph(): - global generated_graph +def graph_from_dataset(file_name): + global generated_graphs - if not generated_graph: 
+ if not file_name.startswith("ckan/"): + file_name = "ckan/" + file_name + if not generated_graphs.get(file_name): + dataset_dict = json.loads(get_file_contents(file_name)) dataset_dict["name"] += "-" + str(randrange(0, 1000)) dataset = call_action("package_create", **dataset_dict) s = RDFSerializer() s.graph_from_dataset(dataset) - generated_graph = s.g + generated_graphs[file_name] = s.g - return generated_graph + return generated_graphs[file_name] @pytest.mark.usefixtures("with_plugins") @@ -167,7 +48,9 @@ def graph(): @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) -def test_validate_dcat_ap_2_graph_shapes(graph): +def test_validate_dcat_ap_2_graph_shapes(): + + graph = graph_from_dataset("ckan_full_dataset_dcat_ap_2.json") # dcat-ap_2.1.1_shacl_shapes.ttl: constraints concerning existance, domain and # literal range, and cardinalities. @@ -189,7 +72,9 @@ def test_validate_dcat_ap_2_graph_shapes(graph): @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) -def test_validate_dcat_ap_2_graph_shapes_recommended(graph): +def test_validate_dcat_ap_2_graph_shapes_recommended(): + + graph = graph_from_dataset("ckan_full_dataset_dcat_ap_2.json") # dcat-ap_2.1.1_shacl_shapes_recommended.ttl: constraints concerning existance # of recommended properties. 
diff --git a/examples/ckan/ckan_full_dataset_dcat_ap_2.json b/examples/ckan/ckan_full_dataset_dcat_ap_2.json new file mode 100644 index 00000000..381804a1 --- /dev/null +++ b/examples/ckan/ckan_full_dataset_dcat_ap_2.json @@ -0,0 +1,185 @@ +{ + "name": "test-dataset-shacl", + "title": "Test DCAT dataset", + "notes": "Lorem ipsum", + "url": "http://example.org/ds1", + "version": "1.0b", + "tags": [ + { + "name": "Tag 1" + }, + { + "name": "Tag 2" + } + ], + "issued": "2024-05-01", + "modified": "2024-05-05", + "identifier": "xx-some-dataset-id-yy", + "frequency": "monthly", + "provenance": "Statement about provenance", + "dcat_type": "test-type", + "version_notes": "Some version notes", + "access_rights": "Statement about access rights", + "alternate_identifier": [ + "alt-id-1", + "alt-id-2" + ], + "theme": [ + "https://example.org/uri/theme1", + "https://example.org/uri/theme2", + "https://example.org/uri/theme3" + ], + "language": [ + "en", + "ca", + "es" + ], + "documentation": [ + "https://example.org/some-doc.html" + ], + "conforms_to": [ + "Standard 1", + "Standard 2" + ], + "is_referenced_by": [ + "https://doi.org/10.1038/sdata.2018.22", + "test_isreferencedby" + ], + "applicable_legislation": [ + "http://data.europa.eu/eli/reg_impl/2023/138/oj", + "http://data.europa.eu/eli/reg_impl/2023/138/oj_alt" + ], + "contact": [ + { + "name": "Contact 1", + "email": "contact1@example.org" + }, + { + "name": "Contact 2", + "email": "contact2@example.org" + } + ], + "publisher": [ + { + "name": "Test Publisher", + "email": "publisher@example.org", + "url": "https://example.org", + "type": "public_body" + } + ], + "temporal_coverage": [ + { + "start": "1905-03-01", + "end": "2013-01-05" + } + ], + "temporal_resolution": "PT15M", + "spatial_coverage": [ + { + "geom": { + "type": "Polygon", + "coordinates": [ + [ + [ + 11.9936, + 54.0486 + ], + [ + 11.9936, + 54.2466 + ], + [ + 12.3045, + 54.2466 + ], + [ + 12.3045, + 54.0486 + ], + [ + 11.9936, + 54.0486 + ] + ] + ] + }, 
+ "text": "Tarragona", + "uri": "https://sws.geonames.org/6361390/", + "bbox": { + "type": "Polygon", + "coordinates": [ + [ + [ + -2.1604, + 42.7611 + ], + [ + -2.0938, + 42.7611 + ], + [ + -2.0938, + 42.7931 + ], + [ + -2.1604, + 42.7931 + ], + [ + -2.1604, + 42.7611 + ] + ] + ] + }, + "centroid": { + "type": "Point", + "coordinates": [ + 1.26639, + 41.12386 + ] + } + } + ], + "spatial_resolution_in_meters": 1.5, + "resources": [ + { + "name": "Resource 1", + "description": "Some description", + "url": "https://example.com/data.csv", + "format": "CSV", + "availability": "http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL", + "compress_format": "http://www.iana.org/assignments/media-types/application/gzip", + "package_format": "http://publications.europa.eu/resource/authority/file-type/TAR", + "size": 12323, + "hash": "4304cf2e751e6053c90b1804c89c0ebb758f395a", + "hash_algorithm": "http://spdx.org/rdf/terms#checksumAlgorithm_sha1", + "status": "http://purl.org/adms/status/Completed", + "access_url": "https://example.com/data.csv", + "download_url": "https://example.com/data.csv", + "issued": "2024-05-01T01:20:33", + "modified": "2024-05-05T09:33:20", + "license": "http://creativecommons.org/licenses/by/3.0/", + "rights": "Some stament about rights", + "language": [ + "en", + "ca", + "es" + ], + "access_services": [ + { + "title": "Access Service 1", + "endpoint_description": "https://example.org/endpoint_description", + "endpoint_url": [ + "https://example.org/access_service/1", + "https://example.org/access_service/2" + ], + "serves_dataset": [ + "https://example.org/dataset/1", + "https://example.org/dataset/2" + ] + } + ] + } + ] +} From 35bfeff082cc2ac17430b92c2aa51e82552503a5 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 15 Jul 2024 12:09:29 +0200 Subject: [PATCH 16/18] Avoid duplicated dates in DCAT-AP 2 profile The profile for DCAT-AP 1 stored triples using schema:startDate/endDate but the namespace was updated to 
dcat:startDate/endDate in DCAT-AP 2. As the dcat-ap 2 profile calls the version 1 profile, the two sets of dates were added together. This makes the recommended-shapes SHACL tests fail and is incorrect anyway, so we now remove the schema-namespaced triples if present when using the dcat-ap 2 profile. --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 9 +++++++ .../test_euro_dcatap_2_profile_serialize.py | 27 ++++++------------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index f31ff5b4..02c726d3 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -12,6 +12,7 @@ DCATAP, DCT, XSD, + SCHEMA, ) from .euro_dcat_ap import EuropeanDCATAPProfile @@ -191,6 +192,14 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ) # Temporal + + # The profile for DCAT-AP 1 stored triples using schema:startDate, + # remove them to avoid duplication + for temporal in self.g.objects(dataset_ref, DCT.temporal): + if SCHEMA.startDate in [t for t in self.g.predicates(temporal, None)]: + self.g.remove((temporal, None, None)) + self.g.remove((dataset_ref, DCT.temporal, temporal)) + start = self._get_dataset_value(dataset_dict, "temporal_start") end = self._get_dataset_value(dataset_dict, "temporal_end") if start or end: diff --git a/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py index 1aeec82f..816e9ace 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py +++ b/ckanext/dcat/tests/test_euro_dcatap_2_profile_serialize.py @@ -333,7 +333,7 @@ def test_spatial_bad_json_no_location(self): def test_temporal(self): """ - Tests that the DCAT date properties are included in the graph in addition to schema.org dates. 
+ Tests that the DCAT date properties are included in the graph """ dataset = { @@ -351,24 +351,13 @@ def test_temporal(self): dataset_ref = s.graph_from_dataset(dataset) - temporals = self._triples(g, dataset_ref, DCT.temporal, None) - assert temporals - assert len(temporals) == 2 - - assert len([self._triple(g, temporal[2] , RDF.type, DCT.PeriodOfTime) for temporal in temporals]) == 2 - - temporal_obj_list = [temporal[2] for temporal in temporals] - for predicate in [SCHEMA.startDate, DCAT.startDate]: - triples = [] - for temporal_obj in temporal_obj_list: - triples.extend(self._triples(g, temporal_obj, predicate, extras['temporal_start'], XSD.dateTime)) - assert len(triples) == 1 - - for predicate in [SCHEMA.endDate, DCAT.endDate]: - triples = [] - for temporal_obj in temporal_obj_list: - triples.extend(self._triples(g, temporal_obj, predicate, extras['temporal_end'], XSD.date)) - assert len(triples) == 1 + temporal = self._triples(g, dataset_ref, DCT.temporal, None) + assert temporal + assert len(temporal) == 1 + temporal_ref = temporal[0][2] + assert self._triple(g, temporal_ref, RDF.type, DCT.PeriodOfTime) + assert self._triple(g, temporal_ref, DCAT.startDate, extras['temporal_start'], XSD.dateTime) + assert self._triple(g, temporal_ref, DCAT.endDate, extras['temporal_end'], XSD.date) def test_high_value_datasets(self): """ From 8ce31eced6b85d0ed360e1d7f0152cd884113376 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 15 Jul 2024 12:12:53 +0200 Subject: [PATCH 17/18] Add Shacl test for the existing, non-scheming DCAT-AP 2 profile --- ckanext/dcat/tests/test_shacl.py | 30 ++++ .../ckan_full_dataset_dcat_ap_2_legacy.json | 160 ++++++++++++++++++ 2 files changed, 190 insertions(+) create mode 100644 examples/ckan/ckan_full_dataset_dcat_ap_2_legacy.json diff --git a/ckanext/dcat/tests/test_shacl.py b/ckanext/dcat/tests/test_shacl.py index d5506252..dd445fdd 100644 --- a/ckanext/dcat/tests/test_shacl.py +++ b/ckanext/dcat/tests/test_shacl.py @@ -82,3 +82,33 @@ 
def test_validate_dcat_ap_2_graph_shapes_recommended(): r = validate(graph, shacl_graph=path) conforms, results_graph, results_text = r assert conforms, results_text + + +@pytest.mark.usefixtures("with_plugins") +@pytest.mark.ckan_config("ckan.plugins", "dcat") +@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_dcat_ap_2") +def test_validate_dcat_ap_2_legacy_graph_shapes(): + + graph = graph_from_dataset("ckan_full_dataset_dcat_ap_2_legacy.json") + + # dcat-ap_2.1.1_shacl_shapes.ttl: constraints concerning existance, domain and + # literal range, and cardinalities. + path = _get_shacl_file_path("dcat-ap_2.1.1_shacl_shapes.ttl") + r = validate(graph, shacl_graph=path) + conforms, results_graph, results_text = r + assert conforms, results_text + + +@pytest.mark.usefixtures("with_plugins") +@pytest.mark.ckan_config("ckan.plugins", "dcat") +@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_dcat_ap_2") +def test_validate_dcat_ap_2_legacy_graph_shapes_recommended(): + + graph = graph_from_dataset("ckan_full_dataset_dcat_ap_2_legacy.json") + + # dcat-ap_2.1.1_shacl_shapes_recommended.ttl: constraints concerning existance + # of recommended properties. 
+ path = _get_shacl_file_path("dcat-ap_2.1.1_shacl_shapes_recommended.ttl") + r = validate(graph, shacl_graph=path) + conforms, results_graph, results_text = r + assert conforms, results_text diff --git a/examples/ckan/ckan_full_dataset_dcat_ap_2_legacy.json b/examples/ckan/ckan_full_dataset_dcat_ap_2_legacy.json new file mode 100644 index 00000000..358b2c40 --- /dev/null +++ b/examples/ckan/ckan_full_dataset_dcat_ap_2_legacy.json @@ -0,0 +1,160 @@ +{ + "name": "test-dataset-shacl", + "title": "Test DCAT dataset", + "notes": "Lorem ipsum", + "url": "http://example.org/ds1", + "version": "1.0b", + "tags": [ + { + "name": "Tag 1" + }, + { + "name": "Tag 2" + } + ], + "extras": [ + { + "key": "issued", + "value": "2024-05-01" + }, + { + "key": "modified", + "value": "2024-05-05" + }, + { + "key": "identifier", + "value": "xx-some-dataset-id-yy" + }, + { + "key": "frequency", + "value": "monthly" + }, + { + "key": "provenance", + "value": "Statement about provenance" + }, + { + "key": "dcat_type", + "value": "test-type" + }, + { + "key": "version_notes", + "value": "Some version notes" + }, + { + "key": "access_rights", + "value": "Statement about access rights" + }, + { + "key": "alternate_identifier", + "value": "[\"alt-id-1\",\"alt-id-2\"]" + }, + { + "key": "theme", + "value": "[\"https://example.org/uri/theme1\",\"https://example.org/uri/theme2\",\"https://example.org/uri/theme3\"]" + }, + { + "key": "language", + "value": "[\"en\",\"ca\",\"es\"]" + }, + { + "key": "documentation", + "value": "[\"https://example.org/some-doc.html\"]" + }, + { + "key": "conforms_to", + "value": "[\"Standard1\",\"Standard2\"]" + }, + { + "key": "is_referenced_by", + "value": "[\"https://doi.org/10.1038/sdata.2018.22\",\"test_isreferencedby\"]" + }, + { + "key": "applicable_legislation", + "value": "[\"http://data.europa.eu/eli/reg_impl/2023/138/oj\",\"http://data.europa.eu/eli/reg_impl/2023/138/oj_alt\"]" + }, + { + "key": "contact_name", + "value": "Contact 1" + }, + { + "key": 
"contact_email", + "value": "contact1@example.org" + }, + { + "key": "publisher_name", + "value": "Test Publisher" + }, + { + "key": "publisher_email", + "value": "publisher@example.org" + }, + { + "key": "publisher_url", + "value": "https://example.org" + }, + { + "key": "publisher_type", + "value": "public_body" + }, + { + "key": "temporal_start", + "value": "1905-03-01" + }, + { + "key": "temporal_end", + "value": "2013-01-05" + }, + { + "key": "temporal_resolution", + "value": "PT15M" + }, + { + "key": "spatial", + "value": "{\"type\":\"Polygon\",\"coordinates\":[[[11.9936,54.0486],[11.9936,54.2466],[12.3045,54.2466],[12.3045,54.0486],[11.9936,54.0486]]]}" + }, + { + "key": "spatial_text", + "value": "Tarragona" + }, + { + "key": "spatial_uri", + "value": "https://sws.geonames.org/6361390/" + }, + { + "key": "spatial_bbox", + "value": "{\"type\":\"Polygon\",\"coordinates\":[[[11.9936,54.0486],[11.9936,54.2466],[12.3045,54.2466],[12.3045,54.0486],[11.9936,54.0486]]]}" + }, + { + "key": "spatial_centroid", + "value": "{\"type\":\"Point\",\"coordinates\":[1.26639,41.12386]}" + }, + { + "key": "spatial_resolution_in_meters", + "value": 1.5 + } + ], + "resources": [ + { + "name": "Resource 1", + "description": "Some description", + "url": "https://example.com/data.csv", + "format": "CSV", + "availability": "http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL", + "compress_format": "http://www.iana.org/assignments/media-types/application/gzip", + "package_format": "http://publications.europa.eu/resource/authority/file-type/TAR", + "size": 12323, + "hash": "4304cf2e751e6053c90b1804c89c0ebb758f395a", + "hash_algorithm": "http://spdx.org/rdf/terms#checksumAlgorithm_sha1", + "status": "http://purl.org/adms/status/Completed", + "access_url": "https://example.com/data.csv", + "download_url": "https://example.com/data.csv", + "issued": "2024-05-01T01:20:33", + "modified": "2024-05-05T09:33:20", + "license": 
"http://creativecommons.org/licenses/by/3.0/", + "rights": "Some stament about rights", + "language": "[\"en\",\"ca\",\"es\"]", + "access_services": "[{\"title\": \"Access Service 1\", \"endpoint_description\": \"https://example.org/endpoint_description\", \"endpoint_url\": [\"https://example.org/access_service/1\", \"https://example.org/access_service/2\"], \"serves_dataset\": [\"https://example.org/dataset/1\", \"https://example.org/dataset/2\"]}]" + } + ] +} From 7cdef98efbe92a5432131d789f20a6e2ea00ce48 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 15 Jul 2024 12:37:54 +0200 Subject: [PATCH 18/18] Update changelog with shacl stuff --- CHANGELOG.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c1aa08c7..69aea84f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,11 +11,21 @@ in the `ckanext/dcat/schemas` folder. See the [documentation](https://github.com/ckan/ckanext-dcat?tab=readme-ov-file#schemas) for all details. Some highlights of the new scheming based profiles: - * Actual list support in the API ooutput for list properties like `dct:language` + * Actual list support in the API output for list properties like `dct:language` * Multiple objects now allowed for properties like `dcat:ContactPoint`, `dct:spatial` or `dct:temporal` * Custom validators for date values that allow `xsd:gYear`, `xsd:gYearMonth`, `xsd:date` and `xsd:dateTime` (#281) +* [SHACL validation](https://github.com/SEMICeu/DCAT-AP/tree/master/releases/2.1.1) for DCAT-AP 2.1.1 profile (scheming and legacy). + SHACL validation surfaced the following issues in the existing profiles, which are now fixed: + * Cast `dcat:byteSize` and `dcat:spatialResolutionInMeters` as Decimal, not float + * Allow only one value of `dcat:spatialResolutionInMeters` and `dcat:temporalResolution` + * Only output the WKT version of geometries in `locn:geometry`, `dcat:bbox` and `dcat:centroid`. Sites that for some reason
Sites that for some reason + require GeoJSON (or both) can use the `ckanext.dcat.output_spatial_format` config option + to choose which format to use + * When using the `euro_dcat_ap_2` profile, don't output temporal extent namespaced + both with `schema` and `dcat`, just with the latter (`dcat:startDate` and `dcat:endDate`) + (#288) * New `ckan dcat consume` and `ckan dcat produce` CLI commands (#279) * Parse dcat:spatialResolutionInMeters as float (#285) * Split profile classes into their own separate files (#282)