diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index fdd3680c..b16f15d9 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -126,6 +126,13 @@ class RDFProfile(object): _dataset_schema = None + # Cache for mappings of licenses URL/title to ID built when needed in + # _license(). + _licenceregister_cache = None + + # Cache for organization_show details (used for publisher fallback) + _org_cache: dict = {} + def __init__(self, graph, dataset_type='dataset', compatibility_mode=False): '''Class constructor @@ -144,10 +151,6 @@ def __init__(self, graph, dataset_type='dataset', compatibility_mode=False): self.compatibility_mode = compatibility_mode - # Cache for mappings of licenses URL/title to ID built when needed in - # _license(). - self._licenceregister_cache = None - try: schema_show = toolkit.get_action("scheming_dataset_schema_show") try: @@ -1365,45 +1368,61 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ) # Publisher - if any([ + publisher_ref = None + + if dataset_dict.get('publisher'): + # Scheming publisher field: will be handled in a separate profile + pass + elif any([ self._get_dataset_value(dataset_dict, 'publisher_uri'), self._get_dataset_value(dataset_dict, 'publisher_name'), - dataset_dict.get('organization'), ]): - + # Legacy publisher_* extras publisher_uri = self._get_dataset_value(dataset_dict, 'publisher_uri') - publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict) publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name') if publisher_uri: - publisher_details = CleanedURIRef(publisher_uri) - elif not publisher_name and publisher_uri_fallback: - # neither URI nor name are available, use organization as fallback - publisher_details = CleanedURIRef(publisher_uri_fallback) + publisher_ref = CleanedURIRef(publisher_uri) else: # No publisher_uri - publisher_details = BNode() - - g.add((publisher_details, RDF.type, FOAF.Organization)) - g.add((dataset_ref, DCT.publisher, publisher_details)) - - # In case no name and URI are available, again fall back to organization. - # If no name but an URI is available, the name literal remains empty to - # avoid mixing organization and dataset values. - if not publisher_name and not publisher_uri and dataset_dict.get('organization'): - publisher_name = dataset_dict['organization']['title'] - - g.add((publisher_details, FOAF.name, Literal(publisher_name))) - # TODO: It would make sense to fallback these to organization - # fields but they are not in the default schema and the - # `organization` object in the dataset_dict does not include - # custom fields + publisher_ref = BNode() + publisher_details = { + 'name': publisher_name, + 'email': self._get_dataset_value(dataset_dict, 'publisher_email'), + 'url': self._get_dataset_value(dataset_dict, 'publisher_url'), + 'type': self._get_dataset_value(dataset_dict, 'publisher_type'), + } + elif dataset_dict.get('organization'): + # Fall back to dataset org + org_id = dataset_dict['organization']['id'] + org_dict = None + if org_id in self._org_cache: + org_dict = self._org_cache[org_id] + else: + try: + org_dict = toolkit.get_action('organization_show')( + {'ignore_auth': True}, {'id': org_id}) + self._org_cache[org_id] = org_dict + except toolkit.ObjectNotFound: + pass + if org_dict: + publisher_ref = CleanedURIRef(publisher_uri_organization_fallback(dataset_dict)) + publisher_details = { + 'name': org_dict.get('title'), + 'email': org_dict.get('email'), + 'url': org_dict.get('url'), + 'type': org_dict.get('dcat_type'), + } + # Add to graph + if publisher_ref: + g.add((publisher_ref, RDF.type, FOAF.Organization)) + g.add((dataset_ref, DCT.publisher, publisher_ref)) items = [ - ('publisher_email', FOAF.mbox, None, Literal), - ('publisher_url', FOAF.homepage, None, URIRef), - ('publisher_type', DCT.type, None, URIRefOrLiteral), + ('name', FOAF.name, None, Literal), + ('email', FOAF.mbox, None, Literal), + ('url', FOAF.homepage, None, URIRef), + ('type', DCT.type, None, URIRefOrLiteral), ] - - self._add_triples_from_dict(dataset_dict, publisher_details, items) + self._add_triples_from_dict(publisher_details, publisher_ref, items) # Temporal start = self._get_dataset_value(dataset_dict, 'temporal_start') @@ -2207,6 +2226,33 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): _type=URIRef, value_modifier=self._add_mailto ) + publisher = dataset_dict.get("publisher") + if isinstance(publisher, list) and len(publisher): + publisher = publisher[0] + publisher_uri = publisher.get('uri') + if publisher_uri: + publisher_ref = CleanedURIRef(publisher_uri) + else: + publisher_ref = BNode() + + self.g.add((publisher_ref, RDF.type, FOAF.Organization)) + self.g.add((dataset_ref, DCT.publisher, publisher_ref)) + + self._add_triple_from_dict( + publisher, publisher_ref, FOAF.name, 'name' + ) + self._add_triple_from_dict( + publisher, publisher_ref, FOAF.homepage, 'url', URIRef + ) + self._add_triple_from_dict( + publisher, publisher_ref, DCT.type, 'type', URIRefOrLiteral + ) + self._add_triple_from_dict( + publisher, publisher_ref, + VCARD.hasEmail, 'email', + _type=URIRef, value_modifier=self._add_mailto + ) + resources = dataset_dict.get('resources', []) for resource in resources: if resource.get('access_services'): diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index a3ddf67a..f5373c3a 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -3,7 +3,6 @@ dataset_type: dataset about: A reimplementation of the default CKAN dataset schema about_url: http://github.com/ckan/ckanext-dcat - dataset_fields: - field_name: title @@ -23,6 +22,11 @@ dataset_fields: form_snippet: markdown.html form_placeholder: eg. Some useful notes about the data +- field_name: tag_string + label: Keywords + preset: tag_string_autocomplete + form_placeholder: eg. economy, mental health, government + - field_name: contact label: Contact points repeating_label: Contact point @@ -38,10 +42,28 @@ dataset_fields: label: Email display_snippet: email.html -- field_name: tag_string - label: Keywords - preset: tag_string_autocomplete - form_placeholder: eg. economy, mental health, government +- field_name: publisher + label: Publisher + repeating_label: Publisher + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type - field_name: license_id label: License @@ -209,4 +231,3 @@ resource_fields: # Note: if not provided, this will be autogenerated - field_name: uri label: URI - diff --git a/ckanext/dcat/schemas/publisher_organization.yaml b/ckanext/dcat/schemas/publisher_organization.yaml new file mode 100644 index 00000000..3d1f7d3b --- /dev/null +++ b/ckanext/dcat/schemas/publisher_organization.yaml @@ -0,0 +1,35 @@ +scheming_version: 2 +about_url: http://github.com/ckan/ckanext-dcat +description: > + An organization schema that implements the properties supported + by default in the dct:publisher property of a dcat:Dataset + +fields: + +- field_name: title + label: Name + validators: ignore_missing unicode_safe + form_snippet: large_text.html + form_attrs: {data-module: slug-preview-target} + +- field_name: name + label: URL + validators: not_empty unicode_safe name_validator group_name_validator + form_snippet: slug.html + form_placeholder: my-theme + +- field_name: notes + label: Description + form_snippet: markdown.html + form_placeholder: A little information about this organization. + +- field_name: email + label: Email + display_snippet: email.html + +- field_name: url + label: URL + display_snippet: link.html + +- field_name: dcat_type + label: Type diff --git a/ckanext/dcat/templates/scheming/form_snippets/repeating_subfields.html b/ckanext/dcat/templates/scheming/form_snippets/repeating_subfields.html new file mode 100644 index 00000000..dec11f45 --- /dev/null +++ b/ckanext/dcat/templates/scheming/form_snippets/repeating_subfields.html @@ -0,0 +1,8 @@ +{% ckan_extends %} + +{% block add_button %} + {# Hide the Add button if we only want one set of subfields #} + {% if not field.repeating_once %} + {{ super() }} + {% endif %} +{% endblock %} diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py index a389acfd..fd167736 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py +++ b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py @@ -1,6 +1,7 @@ from builtins import str from builtins import object import json +import uuid import pytest @@ -17,7 +18,7 @@ from ckanext.dcat import utils from ckanext.dcat.processors import RDFSerializer, HYDRA from ckanext.dcat.profiles import (DCAT, DCT, ADMS, XSD, VCARD, FOAF, SCHEMA, - SKOS, LOCN, GSP, OWL, SPDX, GEOJSON_IMT, + SKOS, LOCN, GSP, OWL, SPDX, GEOJSON_IMT, DISTRIBUTION_LICENSE_FALLBACK_CONFIG) from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS from ckanext.dcat.tests.utils import BaseSerializeTest @@ -398,11 +399,17 @@ def test_publisher_extras(self): assert self._triple(g, publisher, DCT.type, URIRef(extras['publisher_type'])) def test_publisher_org(self): + org_id = str(uuid.uuid4()) + factories.Organization( + id=org_id, + name='publisher1', + title='Example Publisher from Org' + ) dataset = { 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', 'name': 'test-dataset', 'organization': { - 'id': '', + 'id': org_id, 'name': 'publisher1', 'title': 'Example Publisher from Org', } diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 33a507d5..ffa682b7 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -3,6 +3,7 @@ from rdflib.namespace import RDF from rdflib.term import URIRef +from ckan.tests import factories from ckan.tests.helpers import call_action from ckanext.dcat import utils @@ -74,6 +75,14 @@ def test_e2e_ckan_to_dcat(self): {"name": "Contact 1", "email": "contact1@example.org"}, {"name": "Contact 2", "email": "contact2@example.org"}, ], + "publisher": [ + { + "name": "Test Publisher", + "email": "publisher@example.org", + "url": "https://example.org", + "type": "public_body", + }, + ], "resources": [ { "name": "Resource 1", @@ -187,7 +196,32 @@ def test_e2e_ckan_to_dcat(self): g, contact_details[1][2], VCARD.hasEmail, - dataset_dict["contact"][1]["email"], + URIRef("mailto:" + dataset_dict["contact"][1]["email"]), + ) + + publisher = [t for t in g.triples((dataset_ref, DCT.publisher, None))] + + assert len(publisher) == 1 + assert self._triple( + g, publisher[0][2], FOAF.name, dataset_dict["publisher"][0]["name"] + ) + assert self._triple( + g, + publisher[0][2], + VCARD.hasEmail, + URIRef("mailto:" + dataset_dict["publisher"][0]["email"]), + ) + assert self._triple( + g, + publisher[0][2], + FOAF.homepage, + dataset_dict["publisher"][0]["url"], + ) + assert self._triple( + g, + publisher[0][2], + DCT.type, + dataset_dict["publisher"][0]["type"], ) distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] @@ -270,6 +304,60 @@ def test_e2e_ckan_to_dcat(self): == dataset_dict["resources"][0]["access_services"][0]["endpoint_url"] ) + def test_publisher_fallback_org(self): + + org = factories.Organization( + title="Some publisher org", + ) + dataset_dict = { + "name": "test-dataset-2", + "title": "Test DCAT dataset 2", + "notes": "Lorem ipsum", + "owner_org": org["id"], + } + + dataset = call_action("package_create", **dataset_dict) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + publisher = [t for t in g.triples((dataset_ref, DCT.publisher, None))] + + assert len(publisher) == 1 + assert self._triple(g, publisher[0][2], FOAF.name, org["title"]) + + def test_publisher_fallback_org_ignored_if_publisher_field_present(self): + + org = factories.Organization() + dataset_dict = { + "name": "test-dataset-2", + "title": "Test DCAT dataset 2", + "notes": "Lorem ipsum", + "publisher": [ + { + "name": "Test Publisher", + "email": "publisher@example.org", + "url": "https://example.org", + "type": "public_body", + }, + ], + "owner_org": org["id"], + } + + dataset = call_action("package_create", **dataset_dict) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + publisher = [t for t in g.triples((dataset_ref, DCT.publisher, None))] + + assert len(publisher) == 1 + assert self._triple( + g, publisher[0][2], FOAF.name, dataset_dict["publisher"][0]["name"] + ) + @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @@ -350,6 +438,16 @@ def test_e2e_dcat_to_ckan(self): assert dataset["contact"][0]["name"] == "Point of Contact" assert dataset["contact"][0]["email"] == "contact@some.org" + assert ( + dataset["publisher"][0]["name"] == "Publishing Organization for dataset 1" + ) + assert dataset["publisher"][0]["email"] == "contact@some.org" + assert dataset["publisher"][0]["url"] == "http://some.org" + assert ( + dataset["publisher"][0]["type"] + == "http://purl.org/adms/publishertype/NonProfitOrganisation" + ) + resource = dataset["resources"][0] # Resources: core fields