Fix merge errors

ckan · May 29, 2024 · 1bce834 · 1bce834
1 parent 214d853
commit 1bce834
Show file tree

Hide file tree

Showing 6 changed files with 388 additions and 146 deletions.
diff --git a/ckanext/dcat/profiles/__init__.py b/ckanext/dcat/profiles/__init__.py
@@ -20,4 +20,5 @@
 
 from .euro_dcat_ap import EuropeanDCATAPProfile
 from .euro_dcat_ap_2 import EuropeanDCATAP2Profile
+from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile
 from .schemaorg import SchemaOrgProfile
diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py
@@ -7,7 +7,7 @@
 from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS
 from geomet import wkt, InvalidGeoJSONException
 
-from ckantoolkit import config, url_for, asbool, get_action
+from ckantoolkit import config, url_for, asbool, get_action, ObjectNotFound
 from ckan.model.license import LicenseRegister
 from ckan.lib.helpers import resource_formats
 from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS
@@ -41,7 +41,7 @@
     "spdx": SPDX,
 }
 
-PREFIX_MAILTO = u"mailto:"
+PREFIX_MAILTO = "mailto:"
 
 GEOJSON_IMT = "https://www.iana.org/assignments/media-types/application/vnd.geo+json"
 
@@ -105,11 +105,20 @@ class RDFProfile(object):
     custom profiles
     """
 
-    def __init__(self, graph, compatibility_mode=False):
-        """Class constructor
+    _dataset_schema = None
 
-        Graph is an rdflib.Graph instance.
+    # Cache for mappings of licenses URL/title to ID built when needed in
+    # _license().
+    _licenceregister_cache = None
 
+    # Cache for organization_show details (used for publisher fallback)
+    _org_cache: dict = {}
+
+    def __init__(self, graph, dataset_type="dataset", compatibility_mode=False):
+        """Class constructor
+        Graph is an rdflib.Graph instance.
+        A scheming dataset type can be provided, in which case the scheming schema
+        will be loaded so it can be used by profiles.
         In compatibility mode, some fields are modified to maintain
         compatibility with previous versions of the ckanext-dcat parsers
         (eg adding the `dcat_` prefix or storing comma separated lists instead
@@ -120,9 +129,17 @@ def __init__(self, graph, compatibility_mode=False):
 
         self.compatibility_mode = compatibility_mode
 
-        # Cache for mappings of licenses URL/title to ID built when needed in
-        # _license().
-        self._licenceregister_cache = None
+        try:
+            schema_show = get_action("scheming_dataset_schema_show")
+            try:
+                schema = schema_show({}, {"type": dataset_type})
+            except ObjectNotFound:
+                raise ObjectNotFound(f"Unknown dataset schema: {dataset_type}")
+
+            self._dataset_schema = schema
+
+        except KeyError:
+            pass
 
     def _datasets(self):
         """
@@ -707,6 +724,64 @@ def _add_spatial_to_dict(self, dataset_dict, key, spatial):
                 }
             )
 
+    def _schema_field(self, key):
+        """
+        Looks up a dataset-level field definition in the loaded dataset schema
+
+        Returns the first entry of the schema's `dataset_fields` list whose
+        `field_name` equals the provided key, or None if no schema is loaded
+        or no field matches.
+        """
+        schema = self._dataset_schema
+        if not schema:
+            return None
+
+        # Lazy scan: stops at the first matching field definition
+        return next(
+            (
+                candidate
+                for candidate in schema["dataset_fields"]
+                if candidate["field_name"] == key
+            ),
+            None,
+        )
+
+
+    def _schema_resource_field(self, key):
+        """
+        Looks up a resource-level field definition in the loaded dataset schema
+
+        Returns the first entry of the schema's `resource_fields` list whose
+        `field_name` equals the provided key, or None if no schema is loaded
+        or no field matches.
+        """
+        schema = self._dataset_schema
+        if not schema:
+            return None
+
+        # Lazy scan: stops at the first matching field definition
+        return next(
+            (
+                candidate
+                for candidate in schema["resource_fields"]
+                if candidate["field_name"] == key
+            ),
+            None,
+        )
+
+
+    def _set_dataset_value(self, dataset_dict, key, value):
+        """
+        Stores a value under the given key in a CKAN dataset dict
+
+        The value is written at the root level of the dict when the key is
+        either a field of the loaded dataset schema (if any) or one of the
+        standard CKAN fields listed in ROOT_DATASET_FIELDS. In any other case
+        it is appended to the `extras` list as a `{"key": ..., "value": ...}`
+        item.
+
+        The dict is modified in place and also returned.
+        """
+        is_root_key = self._schema_field(key) or key in ROOT_DATASET_FIELDS
+        if is_root_key:
+            dataset_dict[key] = value
+            return dataset_dict
+
+        # Not a schema or core field: keep it as an extra, creating the
+        # extras list first if it is missing or empty
+        if not dataset_dict.get("extras"):
+            dataset_dict["extras"] = []
+        dataset_dict["extras"].append({"key": key, "value": value})
+        return dataset_dict
+
+
+    def _set_list_dataset_value(self, dataset_dict, key, value):
+        """
+        Sets a list value for a given key in a CKAN dataset dict
+
+        If the loaded dataset schema defines the key as a field whose
+        `validators` include `scheming_multiple_text`, the list is stored
+        as is. Otherwise it is serialized to a JSON string first.
+
+        Where the value ends up (root level or extras) is decided by
+        _set_dataset_value(), whose return value is returned unchanged.
+        """
+        schema_field = self._schema_field(key)
+        # A schema field is not guaranteed to declare `validators`; a missing
+        # (or empty) entry means the multiple-text validator is not in use,
+        # so fall back to the JSON string instead of raising KeyError
+        validators = (schema_field or {}).get("validators") or ""
+        if "scheming_multiple_text" not in validators:
+            value = json.dumps(value)
+        return self._set_dataset_value(dataset_dict, key, value)
+
+    def _set_list_resource_value(self, resource_dict, key, value):
+        """
+        Sets a list value for a given key in a CKAN resource dict
+
+        If the loaded dataset schema defines the key as a resource field whose
+        `validators` include `scheming_multiple_text`, the list is stored
+        as is. Otherwise it is serialized to a JSON string first.
+
+        The dict is modified in place and also returned.
+        """
+        schema_field = self._schema_resource_field(key)
+        # A schema field is not guaranteed to declare `validators`; a missing
+        # (or empty) entry means the multiple-text validator is not in use,
+        # so fall back to the JSON string instead of raising KeyError
+        validators = (schema_field or {}).get("validators") or ""
+        if "scheming_multiple_text" in validators:
+            resource_dict[key] = value
+        else:
+            resource_dict[key] = json.dumps(value)
+
+        return resource_dict
+
+
     def _get_dataset_value(self, dataset_dict, key, default=None):
         """
         Returns the value for the given key on a CKAN dict
@@ -880,7 +955,7 @@ def _without_mailto(self, mail_addr):
         Ensures that the mail address string has no mailto: prefix.
         """
         if mail_addr:
-            return str(mail_addr).replace(PREFIX_MAILTO, u"")
+            return str(mail_addr).replace(PREFIX_MAILTO, "")
         else:
             return mail_addr
 

diff --git a/ckanext/dcat/profiles/euro_dcat_ap.py b/ckanext/dcat/profiles/euro_dcat_ap.py
@@ -20,11 +20,9 @@
     DCAT,
     DCT,
     ADMS,
-    XSD,
     VCARD,
     FOAF,
     SCHEMA,
-    SKOS,
     LOCN,
     GSP,
     OWL,
@@ -354,51 +352,66 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
             )
 
         # Publisher
-        if any(
+        publisher_ref = None
+
+        if dataset_dict.get("publisher"):
+            # Scheming publisher field: will be handled in a separate profile
+            pass
+        elif any(
             [
                 self._get_dataset_value(dataset_dict, "publisher_uri"),
                 self._get_dataset_value(dataset_dict, "publisher_name"),
-                dataset_dict.get("organization"),
             ]
         ):
-
+            # Legacy publisher_* extras
             publisher_uri = self._get_dataset_value(dataset_dict, "publisher_uri")
-            publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict)
             publisher_name = self._get_dataset_value(dataset_dict, "publisher_name")
             if publisher_uri:
-                publisher_details = CleanedURIRef(publisher_uri)
-            elif not publisher_name and publisher_uri_fallback:
-                # neither URI nor name are available, use organization as fallback
-                publisher_details = CleanedURIRef(publisher_uri_fallback)
+                publisher_ref = CleanedURIRef(publisher_uri)
             else:
                 # No publisher_uri
-                publisher_details = BNode()
-
-            g.add((publisher_details, RDF.type, FOAF.Organization))
-            g.add((dataset_ref, DCT.publisher, publisher_details))
-
-            # In case no name and URI are available, again fall back to organization.
-            # If no name but an URI is available, the name literal remains empty to
-            # avoid mixing organization and dataset values.
-            if (
-                not publisher_name
-                and not publisher_uri
-                and dataset_dict.get("organization")
-            ):
-                publisher_name = dataset_dict["organization"]["title"]
-
-            g.add((publisher_details, FOAF.name, Literal(publisher_name)))
-            # TODO: It would make sense to fallback these to organization
-            # fields but they are not in the default schema and the
-            # `organization` object in the dataset_dict does not include
-            # custom fields
+                publisher_ref = BNode()
+            publisher_details = {
+                "name": publisher_name,
+                "email": self._get_dataset_value(dataset_dict, "publisher_email"),
+                "url": self._get_dataset_value(dataset_dict, "publisher_url"),
+                "type": self._get_dataset_value(dataset_dict, "publisher_type"),
+            }
+        elif dataset_dict.get("organization"):
+            # Fall back to dataset org
+            org_id = dataset_dict["organization"]["id"]
+            org_dict = None
+            if org_id in self._org_cache:
+                org_dict = self._org_cache[org_id]
+            else:
+                try:
+                    org_dict = toolkit.get_action("organization_show")(
+                        {"ignore_auth": True}, {"id": org_id}
+                    )
+                    self._org_cache[org_id] = org_dict
+                except toolkit.ObjectNotFound:
+                    pass
+            if org_dict:
+                publisher_ref = CleanedURIRef(
+                    publisher_uri_organization_fallback(dataset_dict)
+                )
+                publisher_details = {
+                    "name": org_dict.get("title"),
+                    "email": org_dict.get("email"),
+                    "url": org_dict.get("url"),
+                    "type": org_dict.get("dcat_type"),
+                }
+        # Add to graph
+        if publisher_ref:
+            g.add((publisher_ref, RDF.type, FOAF.Organization))
+            g.add((dataset_ref, DCT.publisher, publisher_ref))
             items = [
-                ("publisher_email", FOAF.mbox, None, Literal),
-                ("publisher_url", FOAF.homepage, None, URIRef),
-                ("publisher_type", DCT.type, None, URIRefOrLiteral),
+                ("name", FOAF.name, None, Literal),
+                ("email", FOAF.mbox, None, Literal),
+                ("url", FOAF.homepage, None, URIRef),
+                ("type", DCT.type, None, URIRefOrLiteral),
             ]
-
-            self._add_triples_from_dict(dataset_dict, publisher_details, items)
+            self._add_triples_from_dict(publisher_details, publisher_ref, items)
 
         # Temporal
         start = self._get_dataset_value(dataset_dict, "temporal_start")