From 1bce8344793ff92bb172d17f4ab4cc7383e18d11 Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Wed, 29 May 2024 11:10:06 +0200
Subject: [PATCH] Fix merge errors

---
 ckanext/dcat/profiles/__init__.py             |   1 +
 ckanext/dcat/profiles/base.py                 |  93 ++++++++-
 ckanext/dcat/profiles/euro_dcat_ap.py         |  83 ++++----
 ckanext/dcat/profiles/euro_dcat_ap_2.py       | 188 +++++++++---------
 .../dcat/profiles/euro_dcat_ap_scheming.py    | 166 ++++++++++++++++
 ckanext/dcat/tests/test_scheming_support.py   |   3 -
 6 files changed, 388 insertions(+), 146 deletions(-)
 create mode 100644 ckanext/dcat/profiles/euro_dcat_ap_scheming.py

diff --git a/ckanext/dcat/profiles/__init__.py b/ckanext/dcat/profiles/__init__.py
index 92266c72..a80a48c6 100644
--- a/ckanext/dcat/profiles/__init__.py
+++ b/ckanext/dcat/profiles/__init__.py
@@ -20,4 +20,5 @@
 
 from .euro_dcat_ap import EuropeanDCATAPProfile
 from .euro_dcat_ap_2 import EuropeanDCATAP2Profile
+from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile
 from .schemaorg import SchemaOrgProfile
diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py
index 4b652b91..c91a1e8e 100644
--- a/ckanext/dcat/profiles/base.py
+++ b/ckanext/dcat/profiles/base.py
@@ -7,7 +7,7 @@
 from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS
 from geomet import wkt, InvalidGeoJSONException
 
-from ckantoolkit import config, url_for, asbool, get_action
+from ckantoolkit import config, url_for, asbool, get_action, ObjectNotFound
 from ckan.model.license import LicenseRegister
 from ckan.lib.helpers import resource_formats
 from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS
@@ -41,7 +41,7 @@
     "spdx": SPDX,
 }
 
-PREFIX_MAILTO = u"mailto:"
+PREFIX_MAILTO = "mailto:"
 
 GEOJSON_IMT = "https://www.iana.org/assignments/media-types/application/vnd.geo+json"
 
@@ -105,11 +105,20 @@ class RDFProfile(object):
     custom profiles
     """
 
-    def __init__(self, graph, compatibility_mode=False):
-        """Class constructor
+    _dataset_schema = None
 
-        Graph is an rdflib.Graph instance.
+    # Cache for mappings of licenses URL/title to ID built when needed in
+    # _license().
+    _licenceregister_cache = None
 
+    # Cache for organization_show details (used for publisher fallback)
+    _org_cache: dict = {}
+
+    def __init__(self, graph, dataset_type="dataset", compatibility_mode=False):
+        """Class constructor
+        Graph is an rdflib.Graph instance.
+        A scheming dataset type can be provided, in which case the scheming schema
+        will be loaded so it can be used by profiles.
         In compatibility mode, some fields are modified to maintain
         compatibility with previous versions of the ckanext-dcat parsers
         (eg adding the `dcat_` prefix or storing comma separated lists instead
@@ -120,9 +129,17 @@ def __init__(self, graph, compatibility_mode=False):
 
         self.compatibility_mode = compatibility_mode
 
-        # Cache for mappings of licenses URL/title to ID built when needed in
-        # _license().
-        self._licenceregister_cache = None
+        try:
+            schema_show = get_action("scheming_dataset_schema_show")
+            try:
+                schema = schema_show({}, {"type": dataset_type})
+            except ObjectNotFound:
+                raise ObjectNotFound(f"Unknown dataset schema: {dataset_type}")
+
+            self._dataset_schema = schema
+
+        except KeyError:
+            pass
 
     def _datasets(self):
         """
@@ -707,6 +724,64 @@ def _add_spatial_to_dict(self, dataset_dict, key, spatial):
                 }
             )
 
+    def _schema_field(self, key):
+        """
+        Returns the schema field information if the provided key exists as a field in
+        the dataset schema (if one was provided)
+        """
+        if not self._dataset_schema:
+            return None
+
+        for field in self._dataset_schema["dataset_fields"]:
+            if field["field_name"] == key:
+                return field
+
+    def _schema_resource_field(self, key):
+        """
+        Returns the schema field information if the provided key exists as a field in
+        the resources fields of the dataset schema (if one was provided)
+        """
+        if not self._dataset_schema:
+            return None
+
+        for field in self._dataset_schema["resource_fields"]:
+            if field["field_name"] == key:
+                return field
+
+    def _set_dataset_value(self, dataset_dict, key, value):
+        """
+        Sets the value for a given key in a CKAN dataset dict
+        If a dataset schema was provided, the schema will be checked to see if
+        a custom field is present for the key. If so the key will be stored at
+        the dict root level, otherwise it will be stored as an extra.
+        Standard CKAN fields (defined in ROOT_DATASET_FIELDS) are always stored
+        at the root level.
+        """
+        if self._schema_field(key) or key in ROOT_DATASET_FIELDS:
+            dataset_dict[key] = value
+        else:
+            if not dataset_dict.get("extras"):
+                dataset_dict["extras"] = []
+            dataset_dict["extras"].append({"key": key, "value": value})
+
+        return dataset_dict
+
+    def _set_list_dataset_value(self, dataset_dict, key, value):
+        schema_field = self._schema_field(key)
+        if schema_field and "scheming_multiple_text" in schema_field["validators"]:
+            return self._set_dataset_value(dataset_dict, key, value)
+        else:
+            return self._set_dataset_value(dataset_dict, key, json.dumps(value))
+
+    def _set_list_resource_value(self, resource_dict, key, value):
+        schema_field = self._schema_resource_field(key)
+        if schema_field and "scheming_multiple_text" in schema_field["validators"]:
+            resource_dict[key] = value
+        else:
+            resource_dict[key] = json.dumps(value)
+
+        return resource_dict
+
     def _get_dataset_value(self, dataset_dict, key, default=None):
         """
         Returns the value for the given key on a CKAN dict
@@ -880,7 +955,7 @@ def _without_mailto(self, mail_addr):
         Ensures that the mail address string has no mailto: prefix.
         """
         if mail_addr:
-            return str(mail_addr).replace(PREFIX_MAILTO, u"")
+            return str(mail_addr).replace(PREFIX_MAILTO, "")
         else:
             return mail_addr
 
diff --git a/ckanext/dcat/profiles/euro_dcat_ap.py b/ckanext/dcat/profiles/euro_dcat_ap.py
index 9a4c853b..b7e4cae4 100644
--- a/ckanext/dcat/profiles/euro_dcat_ap.py
+++ b/ckanext/dcat/profiles/euro_dcat_ap.py
@@ -20,11 +20,9 @@
     DCAT,
     DCT,
     ADMS,
-    XSD,
     VCARD,
     FOAF,
     SCHEMA,
-    SKOS,
     LOCN,
     GSP,
     OWL,
@@ -354,51 +352,66 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
             )
 
         # Publisher
-        if any(
+        publisher_ref = None
+
+        if dataset_dict.get("publisher"):
+            # Scheming publisher field: will be handled in a separate profile
+            pass
+        elif any(
             [
                 self._get_dataset_value(dataset_dict, "publisher_uri"),
                 self._get_dataset_value(dataset_dict, "publisher_name"),
-                dataset_dict.get("organization"),
             ]
         ):
-
+            # Legacy publisher_* extras
             publisher_uri = self._get_dataset_value(dataset_dict, "publisher_uri")
-            publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict)
             publisher_name = self._get_dataset_value(dataset_dict, "publisher_name")
             if publisher_uri:
-                publisher_details = CleanedURIRef(publisher_uri)
-            elif not publisher_name and publisher_uri_fallback:
-                # neither URI nor name are available, use organization as fallback
-                publisher_details = CleanedURIRef(publisher_uri_fallback)
+                publisher_ref = CleanedURIRef(publisher_uri)
             else:
                 # No publisher_uri
-                publisher_details = BNode()
-
-            g.add((publisher_details, RDF.type, FOAF.Organization))
-            g.add((dataset_ref, DCT.publisher, publisher_details))
-
-            # In case no name and URI are available, again fall back to organization.
-            # If no name but an URI is available, the name literal remains empty to
-            # avoid mixing organization and dataset values.
-            if (
-                not publisher_name
-                and not publisher_uri
-                and dataset_dict.get("organization")
-            ):
-                publisher_name = dataset_dict["organization"]["title"]
-
-            g.add((publisher_details, FOAF.name, Literal(publisher_name)))
-            # TODO: It would make sense to fallback these to organization
-            # fields but they are not in the default schema and the
-            # `organization` object in the dataset_dict does not include
-            # custom fields
+                publisher_ref = BNode()
+            publisher_details = {
+                "name": publisher_name,
+                "email": self._get_dataset_value(dataset_dict, "publisher_email"),
+                "url": self._get_dataset_value(dataset_dict, "publisher_url"),
+                "type": self._get_dataset_value(dataset_dict, "publisher_type"),
+            }
+        elif dataset_dict.get("organization"):
+            # Fall back to dataset org
+            org_id = dataset_dict["organization"]["id"]
+            org_dict = None
+            if org_id in self._org_cache:
+                org_dict = self._org_cache[org_id]
+            else:
+                try:
+                    org_dict = toolkit.get_action("organization_show")(
+                        {"ignore_auth": True}, {"id": org_id}
+                    )
+                    self._org_cache[org_id] = org_dict
+                except toolkit.ObjectNotFound:
+                    pass
+            if org_dict:
+                publisher_ref = CleanedURIRef(
+                    publisher_uri_organization_fallback(dataset_dict)
+                )
+                publisher_details = {
+                    "name": org_dict.get("title"),
+                    "email": org_dict.get("email"),
+                    "url": org_dict.get("url"),
+                    "type": org_dict.get("dcat_type"),
+                }
+        # Add to graph
+        if publisher_ref:
+            g.add((publisher_ref, RDF.type, FOAF.Organization))
+            g.add((dataset_ref, DCT.publisher, publisher_ref))
             items = [
-                ("publisher_email", FOAF.mbox, None, Literal),
-                ("publisher_url", FOAF.homepage, None, URIRef),
-                ("publisher_type", DCT.type, None, URIRefOrLiteral),
+                ("name", FOAF.name, None, Literal),
+                ("email", FOAF.mbox, None, Literal),
+                ("url", FOAF.homepage, None, URIRef),
+                ("type", DCT.type, None, URIRefOrLiteral),
             ]
-
-            self._add_triples_from_dict(dataset_dict, publisher_details, items)
+            self._add_triples_from_dict(publisher_details, publisher_ref, items)
 
         # Temporal
         start = self._get_dataset_value(dataset_dict, "temporal_start")
diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py
index 6f40e3ab..3db77d61 100644
--- a/ckanext/dcat/profiles/euro_dcat_ap_2.py
+++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py
@@ -91,56 +91,52 @@ def parse_dataset(self, dataset_dict, dataset_ref):
                         if values:
                             resource_dict[key] = json.dumps(values)
 
-                        # Access services
-                        access_service_list = []
+                    # Access services
+                    access_service_list = []
 
-                        for access_service in self.g.objects(
-                            distribution, DCAT.accessService
+                    for access_service in self.g.objects(
+                        distribution, DCAT.accessService
+                    ):
+                        access_service_dict = {}
+
+                        #  Simple values
+                        for key, predicate in (
+                            ("availability", DCATAP.availability),
+                            ("title", DCT.title),
+                            ("endpoint_description", DCAT.endpointDescription),
+                            ("license", DCT.license),
+                            ("access_rights", DCT.accessRights),
+                            ("description", DCT.description),
+                        ):
+                            value = self._object_value(access_service, predicate)
+                            if value:
+                                access_service_dict[key] = value
+                        #  List
+                        for key, predicate in (
+                            ("endpoint_url", DCAT.endpointURL),
+                            ("serves_dataset", DCAT.servesDataset),
                         ):
-                            access_service_dict = {}
-
-                            #  Simple values
-                            for key, predicate in (
-                                ("availability", DCATAP.availability),
-                                ("title", DCT.title),
-                                ("endpoint_description", DCAT.endpointDescription),
-                                ("license", DCT.license),
-                                ("access_rights", DCT.accessRights),
-                                ("description", DCT.description),
-                            ):
-                                value = self._object_value(access_service, predicate)
-                                if value:
-                                    access_service_dict[key] = value
-                            #  List
-                            for key, predicate in (
-                                ("endpoint_url", DCAT.endpointURL),
-                                ("serves_dataset", DCAT.servesDataset),
-                            ):
-                                values = self._object_value_list(
-                                    access_service, predicate
-                                )
-                                if values:
-                                    access_service_dict[key] = values
-
-                            # Access service URI (explicitly show the missing ones)
-                            access_service_dict["uri"] = (
-                                str(access_service)
-                                if isinstance(access_service, URIRef)
-                                else ""
-                            )
-
-                            # Remember the (internal) access service reference for referencing in
-                            # further profiles, e.g. for adding more properties
-                            access_service_dict["access_service_ref"] = str(
-                                access_service
-                            )
-
-                            access_service_list.append(access_service_dict)
-
-                        if access_service_list:
-                            resource_dict["access_services"] = json.dumps(
-                                access_service_list
-                            )
+                            values = self._object_value_list(access_service, predicate)
+                            if values:
+                                access_service_dict[key] = values
+
+                        # Access service URI (explicitly show the missing ones)
+                        access_service_dict["uri"] = (
+                            str(access_service)
+                            if isinstance(access_service, URIRef)
+                            else ""
+                        )
+
+                        # Remember the (internal) access service reference for referencing in
+                        # further profiles, e.g. for adding more properties
+                        access_service_dict["access_service_ref"] = str(access_service)
+
+                        access_service_list.append(access_service_dict)
+
+                    if access_service_list:
+                        resource_dict["access_services"] = json.dumps(
+                            access_service_list
+                        )
 
         return dataset_dict
 
@@ -253,60 +249,54 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
             ]
             self._add_list_triples_from_dict(resource_dict, distribution, items)
 
-            try:
-                access_service_list = json.loads(
-                    resource_dict.get("access_services", "[]")
+            # Access services
+            access_service_list = resource_dict.get("access_services", [])
+            if isinstance(access_service_list, str):
+                try:
+                    access_service_list = json.loads(access_service_list)
+                except ValueError:
+                    access_service_list = []
+
+            for access_service_dict in access_service_list:
+
+                access_service_uri = access_service_dict.get("uri")
+                if access_service_uri:
+                    access_service_node = CleanedURIRef(access_service_uri)
+                else:
+                    access_service_node = BNode()
+                    # Remember the (internal) access service reference for referencing in
+                    # further profiles
+                    access_service_dict["access_service_ref"] = str(access_service_node)
+
+                self.g.add((distribution, DCAT.accessService, access_service_node))
+
+                self.g.add((access_service_node, RDF.type, DCAT.DataService))
+
+                #  Simple values
+                items = [
+                    ("availability", DCATAP.availability, None, URIRefOrLiteral),
+                    ("license", DCT.license, None, URIRefOrLiteral),
+                    ("access_rights", DCT.accessRights, None, URIRefOrLiteral),
+                    ("title", DCT.title, None, Literal),
+                    ("endpoint_description", DCAT.endpointDescription, None, Literal),
+                    ("description", DCT.description, None, Literal),
+                ]
+
+                self._add_triples_from_dict(
+                    access_service_dict, access_service_node, items
                 )
-                # Access service
-                for access_service_dict in access_service_list:
-
-                    access_service_uri = access_service_dict.get("uri")
-                    if access_service_uri:
-                        access_service_node = CleanedURIRef(access_service_uri)
-                    else:
-                        access_service_node = BNode()
-                        # Remember the (internal) access service reference for referencing in
-                        # further profiles
-                        access_service_dict["access_service_ref"] = str(
-                            access_service_node
-                        )
-
-                    self.g.add((distribution, DCAT.accessService, access_service_node))
-
-                    self.g.add((access_service_node, RDF.type, DCAT.DataService))
-
-                    #  Simple values
-                    items = [
-                        ("availability", DCATAP.availability, None, URIRefOrLiteral),
-                        ("license", DCT.license, None, URIRefOrLiteral),
-                        ("access_rights", DCT.accessRights, None, URIRefOrLiteral),
-                        ("title", DCT.title, None, Literal),
-                        (
-                            "endpoint_description",
-                            DCAT.endpointDescription,
-                            None,
-                            Literal,
-                        ),
-                        ("description", DCT.description, None, Literal),
-                    ]
-
-                    self._add_triples_from_dict(
-                        access_service_dict, access_service_node, items
-                    )
 
-                    #  Lists
-                    items = [
-                        ("endpoint_url", DCAT.endpointURL, None, URIRefOrLiteral),
-                        ("serves_dataset", DCAT.servesDataset, None, URIRefOrLiteral),
-                    ]
-                    self._add_list_triples_from_dict(
-                        access_service_dict, access_service_node, items
-                    )
+                #  Lists
+                items = [
+                    ("endpoint_url", DCAT.endpointURL, None, URIRefOrLiteral),
+                    ("serves_dataset", DCAT.servesDataset, None, URIRefOrLiteral),
+                ]
+                self._add_list_triples_from_dict(
+                    access_service_dict, access_service_node, items
+                )
 
-                if access_service_list:
-                    resource_dict["access_services"] = json.dumps(access_service_list)
-            except ValueError:
-                pass
+            if access_service_list:
+                resource_dict["access_services"] = json.dumps(access_service_list)
 
     def graph_from_catalog(self, catalog_dict, catalog_ref):
 
diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py
new file mode 100644
index 00000000..6bd570a9
--- /dev/null
+++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py
@@ -0,0 +1,166 @@
+import json
+
+from rdflib import URIRef, BNode
+from .base import RDFProfile, CleanedURIRef, URIRefOrLiteral
+from .base import (
+    RDF,
+    XSD,
+    DCAT,
+    DCT,
+    VCARD,
+    FOAF,
+)
+
+
+class EuropeanDCATAPSchemingProfile(RDFProfile):
+    """
+    This is a compatibilty profile meant to add support for ckanext-scheming to the existing
+    `euro_dcat_ap` and `euro_dcat_ap_2` profiles.
+    It does not add or remove any properties from these profiles, it just transforms the
+    resulting dataset_dict so it is compatible with a ckanext-scheming schema
+    TODO: summarize changes and link to docs
+    """
+
+    def parse_dataset(self, dataset_dict, dataset_ref):
+
+        if not self._dataset_schema:
+            # Not using scheming
+            return dataset_dict
+
+        # Move extras to root
+
+        extras_to_remove = []
+        extras = dataset_dict.get("extras", [])
+        for extra in extras:
+            if self._schema_field(extra["key"]):
+                # This is a field defined in the dataset schema
+                dataset_dict[extra["key"]] = extra["value"]
+                extras_to_remove.append(extra["key"])
+
+        dataset_dict["extras"] = [e for e in extras if e["key"] not in extras_to_remove]
+
+        # Parse lists
+        def _parse_list_value(data_dict, field_name):
+            schema_field = self._schema_field(
+                field_name
+            ) or self._schema_resource_field(field_name)
+
+            if schema_field and "scheming_multiple_text" in schema_field.get(
+                "validators", []
+            ):
+                if isinstance(data_dict[field_name], str):
+                    try:
+                        data_dict[field_name] = json.loads(data_dict[field_name])
+                    except ValueError:
+                        pass
+
+        for field_name in dataset_dict.keys():
+            _parse_list_value(dataset_dict, field_name)
+
+        for resource_dict in dataset_dict.get("resources", []):
+            for field_name in resource_dict.keys():
+                _parse_list_value(resource_dict, field_name)
+
+        # Repeating subfields
+        for schema_field in self._dataset_schema["dataset_fields"]:
+            if "repeating_subfields" in schema_field:
+                # Check if existing extras need to be migrated
+                field_name = schema_field["field_name"]
+                new_extras = []
+                new_dict = {}
+                for extra in dataset_dict.get("extras", []):
+                    if extra["key"].startswith(f"{field_name}_"):
+                        subfield = extra["key"][extra["key"].index("_") + 1 :]
+                        if subfield in [
+                            f["field_name"] for f in schema_field["repeating_subfields"]
+                        ]:
+                            new_dict[subfield] = extra["value"]
+                        else:
+                            new_extras.append(extra)
+                    else:
+                        new_extras.append(extra)
+                if new_dict:
+                    dataset_dict[field_name] = [new_dict]
+                    dataset_dict["extras"] = new_extras
+
+        for schema_field in self._dataset_schema["resource_fields"]:
+            if "repeating_subfields" in schema_field:
+                # Check if value needs to be load from JSON
+                field_name = schema_field["field_name"]
+                for resource_dict in dataset_dict.get("resources", []):
+                    if resource_dict.get(field_name) and isinstance(
+                        resource_dict[field_name], str
+                    ):
+                        try:
+                            # TODO: load only subfields in schema?
+                            resource_dict[field_name] = json.loads(
+                                resource_dict[field_name]
+                            )
+                        except ValueError:
+                            pass
+
+        return dataset_dict
+
+    def graph_from_dataset(self, dataset_dict, dataset_ref):
+
+        contact = dataset_dict.get("contact")
+        if isinstance(contact, list) and len(contact):
+            for item in contact:
+                contact_uri = item.get("uri")
+                if contact_uri:
+                    contact_details = CleanedURIRef(contact_uri)
+                else:
+                    contact_details = BNode()
+
+                self.g.add((contact_details, RDF.type, VCARD.Organization))
+                self.g.add((dataset_ref, DCAT.contactPoint, contact_details))
+
+                self._add_triple_from_dict(item, contact_details, VCARD.fn, "name")
+                # Add mail address as URIRef, and ensure it has a mailto: prefix
+                self._add_triple_from_dict(
+                    item,
+                    contact_details,
+                    VCARD.hasEmail,
+                    "email",
+                    _type=URIRef,
+                    value_modifier=self._add_mailto,
+                )
+
+        publisher = dataset_dict.get("publisher")
+        if isinstance(publisher, list) and len(publisher):
+            publisher = publisher[0]
+            publisher_uri = publisher.get("uri")
+            if publisher_uri:
+                publisher_ref = CleanedURIRef(publisher_uri)
+            else:
+                publisher_ref = BNode()
+
+            self.g.add((publisher_ref, RDF.type, FOAF.Organization))
+            self.g.add((dataset_ref, DCT.publisher, publisher_ref))
+
+            self._add_triple_from_dict(publisher, publisher_ref, FOAF.name, "name")
+            self._add_triple_from_dict(
+                publisher, publisher_ref, FOAF.homepage, "url", URIRef
+            )
+            self._add_triple_from_dict(
+                publisher, publisher_ref, DCT.type, "type", URIRefOrLiteral
+            )
+            self._add_triple_from_dict(
+                publisher,
+                publisher_ref,
+                VCARD.hasEmail,
+                "email",
+                _type=URIRef,
+                value_modifier=self._add_mailto,
+            )
+
+        resources = dataset_dict.get("resources", [])
+        for resource in resources:
+            if resource.get("access_services"):
+                if isinstance(resource["access_services"], str):
+                    try:
+                        resource["access_services"] = json.loads(
+                            resource["access_services"]
+                        )
+                    except ValueError:
+                        pass
diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py
index ffa682b7..e2508e0f 100644
--- a/ckanext/dcat/tests/test_scheming_support.py
+++ b/ckanext/dcat/tests/test_scheming_support.py
@@ -20,9 +20,6 @@
     LOCN,
     GSP,
     OWL,
-    SPDX,
-    GEOJSON_IMT,
-    DISTRIBUTION_LICENSE_FALLBACK_CONFIG,
 )
 from ckanext.dcat.tests.utils import BaseSerializeTest, BaseParseTest