Merge branch 'master' into more-robust-rdf-pagination

ckan · Sep 16, 2024 · cae29ff · cae29ff
2 parents 9396b25 + f18ba7a
commit cae29ff
Show file tree

Hide file tree

Showing 100 changed files with 12,763 additions and 3,268 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -1,13 +1,15 @@
 name: Tests
+env:
+    COLUMNS: 120
 on: [push, pull_request]
 jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
         with:
-          python-version: '3.7'
+          python-version: '3.9'
       - name: Install requirements
         run: pip install flake8 pycodestyle
       - name: Check syntax
@@ -17,16 +19,22 @@ jobs:
     needs: lint
     strategy:
       matrix:
-        ckan-version: ["2.10", 2.9]
+        include:
+          - ckan-version: "2.11"
+            ckan-image: "ckan/ckan-dev:2.11-py3.10"
+          - ckan-version: "2.10"
+            ckan-image: "ckan/ckan-dev:2.10-py3.10"
+          - ckan-version: "2.9"
+            ckan-image: "ckan/ckan-dev:2.9-py3.9"
       fail-fast: false
 
     name: CKAN ${{ matrix.ckan-version }}
     runs-on: ubuntu-latest
     container:
-      image: openknowledge/ckan-dev:${{ matrix.ckan-version }}
+      image: ${{ matrix.ckan-image }}
     services:
       solr:
-        image: ckan/ckan-solr:${{ matrix.ckan-version }}
+        image: ckan/ckan-solr:${{ matrix.ckan-version }}-solr9
       postgres:
         image: ckan/ckan-postgres-dev:${{ matrix.ckan-version }}
         env:
@@ -44,28 +52,28 @@ jobs:
       CKAN_REDIS_URL: redis://redis:6379/1
 
     steps:
-    - uses: actions/checkout@v3
-    - name: Install requirements
+    - uses: actions/checkout@v4
+    - name: Install requirements (common)
       run: |
         pip install -r requirements.txt
         pip install -r dev-requirements.txt
-    - name: Install requirements (common)
-      run: |
         pip install -e .
         # Replace default path to CKAN core config file with the one on the container
         sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini
-    - name: Setup harvest extension
+    - name: Install requirements (2.9)
+      run: |
+        pip install -U pytest-rerunfailures
+      if: ${{ matrix.ckan-version == '2.9' }}
+    - name: Setup other extensions
       run: |
         git clone https://github.com/ckan/ckanext-harvest
         pip install -e ckanext-harvest
-        pip install -r ckanext-harvest/pip-requirements.txt
+        pip install -r ckanext-harvest/requirements.txt
+        git clone https://github.com/ckan/ckanext-scheming
+        pip install -e ckanext-scheming
     - name: Setup extension
       run: |
         ckan -c test.ini db init
-        ckan -c test.ini harvester initdb
+        ckan -c test.ini db pending-migrations --apply
     - name: Run tests
-      run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=xml --cov-append --disable-warnings ckanext/dcat/tests
-    - name: Upload coverage report to codecov
-      uses: codecov/codecov-action@v1
-      with:
-        file: ./coverage.xml
+      run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -0,0 +1,17 @@
+# Read the Docs configuration file for MkDocs projects
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.12"
+
+mkdocs:
+  configuration: mkdocs.yml
+
+python:
+  install:
+  - requirements: docs/requirements.txt
diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/README.md b/README.md
diff --git a/ckanext/dcat/cli.py b/ckanext/dcat/cli.py
@@ -1,25 +1,145 @@
 # -*- coding: utf-8 -*-
+import json
 
 import click
+
 import ckan.plugins.toolkit as tk
+
 import ckanext.dcat.utils as utils
+from ckanext.dcat.processors import (
+    RDFParser,
+    RDFSerializer,
+    DEFAULT_RDF_PROFILES,
+    RDF_PROFILES_CONFIG_OPTION,
+)
 
-@click.group()
-def generate_static():
-    """Generates static files containing all datasets.
 
-    """
+@click.group()
+def dcat():
+    """DCAT utilities for CKAN"""
     pass
 
-@generate_static.command()
-@click.argument('output', type=click.File(mode="w"))
-def json(output):
-    """The generate command will generate a static file containing all of
-    the datasets in the catalog in JSON format.
 
+@dcat.command()
+@click.argument("output", type=click.File(mode="w"))
+def generate_static(output):
+    """[Deprecated] Generate a static datasets file in JSON format
+    (requires the dcat_json_interface plugin).
     """
     utils.generate_static_json(output)
 
 
+def _get_profiles(profiles):
+    # Precedence: explicit CLI value, then the config option, otherwise None.
+    if profiles:
+        return profiles.split()
+
+    if tk.config.get(RDF_PROFILES_CONFIG_OPTION):
+        return tk.aslist(tk.config[RDF_PROFILES_CONFIG_OPTION])
+
+    return None
+
+
+@dcat.command(context_settings={"show_default": True})
+@click.argument("input", type=click.File(mode="r"))
+@click.option(
+    "-o",
+    "--output",
+    type=click.File(mode="w"),
+    default="-",
+    help="By default the command will output the result to stdout, "
+    "alternatively you can provide a file path with this option",
+)
+@click.option(
+    "-f", "--format", default="xml", help="Serialization format (eg ttl, jsonld)"
+)
+@click.option(
+    "-p",
+    "--profiles",
+    help="RDF profiles to use. If not provided will be read from config, "
+    f"if not present there, the default will be used: {DEFAULT_RDF_PROFILES}",
+)
+@click.option(
+    "-P", "--pretty", is_flag=True, help="Make the output more human readable"
+)
+@click.option(
+    "-m", "--compat_mode", is_flag=True, help="Compatibility mode (deprecated)"
+)
+def consume(input, output, format, profiles, pretty, compat_mode):
+    """
+    Parses DCAT RDF graphs into CKAN dataset JSON objects.
+
+    The input serializations can be provided as a path to a file, e.g.:
+
+        ckan dcat consume examples/dcat/dataset.ttl
+
+    Or be read from stdin:
+
+        ckan dcat consume -
+    """
+    contents = input.read()
+
+    profiles = _get_profiles(profiles)  # CLI value, else config, else None
+
+    parser = RDFParser(profiles=profiles, compatibility_mode=compat_mode)
+    parser.parse(contents, _format=format)
+
+    ckan_datasets = list(parser.datasets())
+
+    indent = 4 if pretty else None  # None makes json.dumps emit compact output
+    out = json.dumps(ckan_datasets, indent=indent)
+
+    output.write(out)
+
+
+@dcat.command(context_settings={"show_default": True})
+@click.argument("input", type=click.File(mode="r"))
+@click.option(
+    "-o",
+    "--output",
+    type=click.File(mode="w"),
+    default="-",
+    help="By default the command will output the result to stdout, "
+    "alternatively you can provide a file path with this option",
+)
+@click.option(
+    "-f", "--format", default="xml", help="Serialization format (eg ttl, jsonld)"
+)
+@click.option(
+    "-p",
+    "--profiles",
+    help="RDF profiles to use. If not provided will be read from config, "
+    f"if not present there, the default will be used: {DEFAULT_RDF_PROFILES}",
+)
+@click.option(
+    "-m", "--compat_mode", is_flag=True, help="Compatibility mode (deprecated)"
+)
+def produce(input, output, format, profiles, compat_mode):
+    """
+    Transforms CKAN dataset JSON objects into DCAT RDF serializations.
+
+    The input datasets can be provided as a path to a file, e.g.:
+
+        ckan dcat produce examples/ckan/ckan_dataset.json
+
+    Or be read from stdin:
+
+        ckan dcat produce -
+    """
+    contents = input.read()
+
+    profiles = _get_profiles(profiles)  # CLI value, else config, else None
+
+    serializer = RDFSerializer(profiles=profiles, compatibility_mode=compat_mode)
+
+    dataset = json.loads(contents)
+    if isinstance(dataset, list):  # a JSON array is serialized as several datasets
+        out = serializer.serialize_datasets(dataset, _format=format)
+    else:
+        out = serializer.serialize_dataset(dataset, _format=format)
+
+    output.write(out)
+
+
 def get_commands():
-    return [generate_static]
+    return [dcat]
diff --git a/ckanext/dcat/config_declaration.yml b/ckanext/dcat/config_declaration.yml
@@ -0,0 +1,120 @@
+version: 1
+groups:
+  - annotation: General settings
+    options:
+
+      - key: ckanext.dcat.rdf.profiles
+        default_callable: 'ckanext.dcat.processors:_get_default_rdf_profiles'
+        description: |
+          RDF profiles to use when parsing and serializing. See https://github.com/ckan/ckanext-dcat#profiles
+          for more details.
+        example: 'euro_dcat_ap_2 my_local_ap'
+
+      - key: ckanext.dcat.translate_keys
+        type: bool
+        default: True
+        description: |
+          If set to True, the plugin will automatically translate the keys of the DCAT
+          fields used in the frontend (at least those present in the `ckanext/dcat/i18n`
+          po files).
+
+  - annotation: Parsers / Serializers settings
+    options:
+
+      - key: ckanext.dcat.output_spatial_format
+        type: list
+        default: 
+          - 'wkt'
+        description: |
+          Format to use for geometries when serializing RDF documents. The default is
+          recommended as it is the format expected by GeoDCAT, alternatively you can
+          use `geojson` (or both, which will make SHACL validation fail)
+
+      - key: ckanext.dcat.resource.inherit.license
+        type: bool
+        default: False
+        description: |
+          If there is no license defined for a resource / distribution, inherit it from
+          the dataset.
+
+      - key: ckanext.dcat.normalize_ckan_format
+        type: bool
+        default: True
+        description: |
+          When true, the plugin will try to match the resource label against the standard
+          list of CKAN formats (https://github.com/ckan/ckan/blob/master/ckan/config/resource_formats.json)
+          This allows for instance to populate the CKAN resource format field
+          with a value that view plugins, etc will understand (`csv`, `xml`, etc.)
+
+      - key: ckanext.dcat.clean_tags
+        type: bool
+        default: False
+        description: |
+          Remove special characters from keywords (use the old munge_tag() CKAN function).
+          This is generally not needed.
+
+  - annotation: Endpoints settings
+    options:
+
+      - key: ckanext.dcat.enable_rdf_endpoints
+        default: True
+        description: |
+          Whether to expose the catalog and dataset endpoints with the RDF DCAT
+          serializations.
+        type: bool
+
+      - key: ckanext.dcat.base_uri
+        description: |
+          Base URI to use when generating URIs for all entities. It needs to be a valid URI value.
+        example: 'https://my-site.org/uri/'
+
+      - key: ckanext.dcat.catalog_endpoint
+        default: '/catalog.{_format}'
+        description: |
+          Custom route for the catalog endpoint. It should start with `/` and include the
+          `{_format}` placeholder.
+        example: '/dcat/catalog/{_format}'
+
+      - key: ckanext.dcat.datasets_per_page
+        default: 100
+        type: int
+        description: |
+          Default number of datasets returned by the catalog endpoint.
+
+      - key: ckanext.dcat.enable_content_negotiation
+        default: False
+        type: bool
+        description: |
+          Enable content negotiation in the main catalog and dataset endpoints. Note that
+          setting this to True overrides the core `home.index` and `dataset.read` endpoints.
+
+  - annotation: Harvester settings
+    options:
+
+      - key: ckanext.dcat.max_file_size
+        type: int
+        default: 50
+        description: |
+          Maximum file size that will be downloaded for parsing by the harvesters
+
+      - key: ckanext.dcat.expose_subcatalogs
+        type: bool
+        default: false
+        description: |
+          Store information about the origin catalog when harvesting datasets.
+          See https://github.com/ckan/ckanext-dcat#transitive-harvesting for more details.
+
+  - annotation: Deprecated options (will be removed in future versions)
+    options:
+
+      - key: ckanext.dcat.compatibility_mode
+        type: bool
+        default: False
+        description: |
+          Whether to modify some fields to maintain compatibility with previous versions
+          of the ckanext-dcat parsers.
+
+      - key: ckanext.dcat.json_endpoint
+        default: '/dcat.json'
+        description: |
+          Custom route to expose the legacy JSON endpoint
diff --git a/ckanext/dcat/converters.py b/ckanext/dcat/converters.py
@@ -28,7 +28,15 @@ def dcat_to_ckan(dcat_dict):
         package_dict['extras'].append({'key': 'dcat_publisher_name', 'value': dcat_publisher})
     elif isinstance(dcat_publisher, dict) and dcat_publisher.get('name'):
         package_dict['extras'].append({'key': 'dcat_publisher_name', 'value': dcat_publisher.get('name')})
-        package_dict['extras'].append({'key': 'dcat_publisher_email', 'value': dcat_publisher.get('mbox')})
+
+        if dcat_publisher.get('mbox'):
+            package_dict['extras'].append({'key': 'dcat_publisher_email', 'value': dcat_publisher.get('mbox')})
+
+        if dcat_publisher.get('identifier'):
+            package_dict['extras'].append({
+                'key': 'dcat_publisher_id',
+                'value': dcat_publisher.get('identifier')  # This could be a URI like https://ror.org/05wg1m734
+            })
 
     package_dict['extras'].append({
         'key': 'language',