From e539a33959c1fba1fcb9c85550c80276eb7673bc Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 27 Aug 2024 12:49:27 +0200 Subject: [PATCH 1/8] First draft of the docs --- .readthedocs.yaml | 25 +++ README.md | 58 ------ docs/cli.md | 19 ++ docs/configuration.md | 142 ++++++++++++++ docs/endpoints.md | 155 +++++++++++++++ docs/getting-started.md | 64 ++++++ docs/google-dataset-search.md | 93 +++++++++ docs/harvester.md | 66 +++++++ docs/index.md | 20 ++ docs/mapping.md | 353 ++++++++++++++++++++++++++++++++++ docs/profiles.md | 236 +++++++++++++++++++++++ mkdocs.yml | 20 ++ 12 files changed, 1193 insertions(+), 58 deletions(-) create mode 100644 .readthedocs.yaml create mode 100644 docs/cli.md create mode 100644 docs/configuration.md create mode 100644 docs/endpoints.md create mode 100644 docs/getting-started.md create mode 100644 docs/google-dataset-search.md create mode 100644 docs/harvester.md create mode 100644 docs/index.md create mode 100644 docs/mapping.md create mode 100644 docs/profiles.md create mode 100644 mkdocs.yml diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..a5a3ba50 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,25 @@ +# Read the Docs configuration file for MkDocs projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + + +# Set the version of Python and other tools you might need + +build: + os: ubuntu-22.04 + tools: + python: "3.12" + + +mkdocs: + configuration: mkdocs.yml + + +# Optionally declare the Python requirements required to build your docs + +#python: +# install: +# - requirements: docs/requirements.txt + diff --git a/README.md b/README.md index cd1b51e1..dfec8b50 100644 --- a/README.md +++ b/README.md @@ -960,64 +960,6 @@ Extensions define their available profiles using the `ckan.rdf.profiles` in the euro_dcat_ap_2=ckanext.dcat.profiles:EuropeanDCATAP2Profile schemaorg=ckanext.dcat.profiles:SchemaOrgProfile -### Command line interface 
- -The parser and serializer can also be accessed from the command line via `python ckanext-dcat/ckanext/dcat/processors.py`. - -You can point to RDF files: - - python ckanext-dcat/ckanext/dcat/processors.py consume catalog_pod_2.jsonld -P -f json-ld - - python ckanext/dcat/processors.py produce examples/ckan_dataset.json - -or pipe them to the script: - - http http://localhost/dcat/catalog.rdf | python ckanext-dcat/ckanext/dcat/processors.py consume -P > ckan_datasets.json - - http http://demo.ckan.org/api/action/package_show id=afghanistan-election-data | jq .result | python ckanext/dcat/processors.py produce - - -To see all available options, run the script with the `-h` argument: - - python ckanext-dcat/ckanext/dcat/processors.py -h - usage: processors.py [-h] [-f FORMAT] [-P] [-p [PROFILE [PROFILE ...]]] [-m] - mode [file] - - DCAT RDF - CKAN operations - - positional arguments: - mode Operation mode. `consume` parses DCAT RDF graphs to - CKAN dataset JSON objects. `produce` serializes CKAN - dataset JSON objects into DCAT RDF. - file Input file. If omitted will read from stdin - - optional arguments: - -h, --help show this help message and exit - -f FORMAT, --format FORMAT - Serialization format (as understood by rdflib) eg: - xml, n3 ... Defaults to 'xml'. - -P, --pretty Make the output more human readable - -p [PROFILE [PROFILE ...]], --profile [PROFILE [PROFILE ...]] - RDF Profiles to use, defaults to euro_dcat_ap_2 - -m, --compat-mode Enable compatibility mode - - -### Compatibility mode - -In compatibility mode, some fields are modified to maintain compatibility with previous versions of the ckanext-dcat parsers -(eg adding the `dcat_` prefix or storing comma separated lists instead -of JSON blobs). - -CKAN instances that were using the legacy XML and JSON harvesters (`dcat_json_harvester` and `dcat_xml_harvester`) -and want to move to the RDF based one may want to turn compatibility mode on to ensure that CKAN dataset fields are created as before. 
-Users are encouraged to migrate their applications to support the new DCAT to CKAN mapping. - -To turn compatibility mode on add this to the CKAN configuration file: - - ckanext.dcat.compatibility_mode = True - - - ## XML DCAT harvester (deprecated) The old DCAT XML harvester (`dcat_xml_harvester`) is now deprecated, in favour of the [RDF harvester](#rdf-dcat-harvester). diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 00000000..f9585ede --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,19 @@ +## CLI + +The `ckan dcat` command offers utilities to transform between DCAT RDF Serializations and CKAN datasets (`ckan dcat consume`) and +vice versa (`ckan dcat produce`). In both cases the input can be provided as a path to a file: + + ckan dcat consume -f ttl examples/dcat/dataset.ttl + + ckan dcat produce -f jsonld examples/ckan/ckan_datasets.json + +or be read from stdin: + + ckan dcat consume - + +The latter form allows chaining commands for more complex metadata processing, e.g.: + + curl https://demo.ckan.org/api/action/package_search | jq .result.results | ckan dcat produce -f jsonld - + +For the full list of options check `ckan dcat consume --help` and `ckan dcat produce --help`. + diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 00000000..3c8d3741 --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,142 @@ +## Configuration reference + + + +### General settings + +#### ckanext.dcat.rdf.profiles + +Example: + +``` +ckanext.dcat.rdf.profiles = euro_dcat_ap_2 my_local_ap +``` + +Default value: `euro_dcat_ap_2` + +RDF profiles to use when parsing and serializing. See https://github.com/ckan/ckanext-dcat#profiles +for more details. + + +#### ckanext.dcat.translate_keys + +Default value: `True` + +If set to True, the plugin will automatically translate the keys of the DCAT +fields used in the frontend (at least those present in the `ckanext/dcat/i18n` +po files). 
+ + +### Parsers / Serializers settings + +#### ckanext.dcat.output_spatial_format + +Default value: `wkt` + +Format to use for geometries when serializing RDF documents. The default is +recommended as it is the format expected by GeoDCAT; alternatively you can +use `geojson` (or both, which will make SHACL validation fail). + + +#### ckanext.dcat.resource.inherit.license + +Default value: `False` + +If there is no license defined for a resource / distribution, inherit it from +the dataset. + + +#### ckanext.dcat.normalize_ckan_format + +Default value: `True` + +When true, the extension will try to match the resource label against the standard +list of CKAN formats (https://github.com/ckan/ckan/blob/master/ckan/config/resource_formats.json). +This allows, for instance, populating the CKAN resource format field +with a value that view plugins, etc will understand (`csv`, `xml`, etc.) + + +#### ckanext.dcat.clean_tags + +Default value: `False` + +Remove special characters from keywords (use the old munge_tag() CKAN function). +This is generally not needed. + + +### Endpoints settings + +#### ckanext.dcat.enable_rdf_endpoints + +Default value: `True` + +Whether to expose the catalog and dataset endpoints with the RDF DCAT +serializations. + + +#### ckanext.dcat.catalog_endpoint + +Example: + +``` +ckanext.dcat.catalog_endpoint = /dcat/catalog/{_format} +``` + +Default value: `/catalog.{_format}` + +Custom route for the catalog endpoint. It should start with `/` and include the +`{_format}` placeholder. + + +#### ckanext.dcat.datasets_per_page + +Default value: `100` + +Default number of datasets returned by the catalog endpoint. + + +#### ckanext.dcat.enable_content_negotiation + +Default value: `False` + +Enable content negotiation in the main catalog and dataset endpoints. Note that +setting this to True overrides the core `home.index` and `dataset.read` endpoints. 
+ + +### Harvester settings + +#### ckanext.dcat.max_file_size + +Default value: `50` + +Maximum file size (in MB) that will be downloaded for parsing by the harvesters. + + +#### ckanext.dcat.expose_subcatalogs + +Default value: `False` + +Store information about the origin catalog when harvesting datasets. +See https://github.com/ckan/ckanext-dcat#transitive-harvesting for more details. + + +### Deprecated options (will be removed in future versions) + +#### ckanext.dcat.compatibility_mode + +Default value: `False` + +Whether to modify some fields to maintain compatibility with previous versions +of the ckanext-dcat parsers. + + +#### ckanext.dcat.json_endpoint + +Default value: `/dcat.json` + +Custom route to expose the legacy JSON endpoint. + + + + + diff --git a/docs/endpoints.md b/docs/endpoints.md new file mode 100644 index 00000000..bfdf36d5 --- /dev/null +++ b/docs/endpoints.md @@ -0,0 +1,155 @@ +# RDF DCAT endpoints + +By default when the `dcat` plugin is enabled, the following RDF endpoints are available on your CKAN instance. The schema used on the serializations can be customized using [profiles](#profiles). + +To disable the RDF endpoints, you can set the following config in your ini file: + + ckanext.dcat.enable_rdf_endpoints = False + + +## Dataset endpoints + +RDF representations of a particular dataset can be accessed using the following endpoint: + + https://{ckan-instance-host}/dataset/{dataset-id}.{format} + +The file extension (`{format}`) will determine the RDF serialization format returned. 
The currently supported values are: + +| Extension | Format | Media Type | +|-----------|-------------------------------------------------------------|---------------------| +| `xml` | [RDF/XML](https://en.wikipedia.org/wiki/RDF/XML) | application/rdf+xml | +| `ttl` | [Turtle](https://en.wikipedia.org/wiki/Turtle_%28syntax%29) | text/turtle | +| `n3` | [Notation3](https://en.wikipedia.org/wiki/Notation3) | text/n3 | +| `jsonld` | [JSON-LD](http://json-ld.org/) | application/ld+json | + +The fallback `rdf` format defaults to RDF/XML. + +Here's an example of the different formats: + +* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.rdf +* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.xml +* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.ttl +* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.n3 +* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.jsonld + +RDF representations will be advertised using `<link rel="alternate">` tags on the `<head>` section of the dataset page source code, eg: + + + + + + + + + +Check the [RDF DCAT Serializer](#rdf-dcat-serializer) section for more details about how these are generated and how to customize the output using [profiles](#profiles). + + +You can specify the profile by using the `profiles=<profile1>,<profile2>` query parameter on the dataset endpoint (as a comma-separated list): + +* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.xml?profiles=euro_dcat_ap +* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.jsonld?profiles=schemaorg + +*Note*: When using this plugin, the above endpoints will replace the old deprecated ones that were part of CKAN core. 
+ + +## Catalog endpoint + +In addition to the individual dataset representations, the extension also offers a catalog-wide endpoint for retrieving multiple datasets at the same time (the datasets are paginated, see below for details): + + https://{ckan-instance-host}/catalog.{format}?[page={page}]&[modified_since={date}]&[profiles={profile1},{profile2}]&[q={query}]&[fq={filter query}] + +This endpoint can be customized if necessary using the `ckanext.dcat.catalog_endpoint` configuration option, eg: + + ckanext.dcat.catalog_endpoint = /dcat/catalog/{_format} + +The custom endpoint **must** start with a forward slash (`/`) and contain the `{_format}` placeholder. + +As described previously, the file extension will determine the RDF serialization format returned. + +* http://demo.ckan.org/catalog.rdf +* http://demo.ckan.org/catalog.xml +* http://demo.ckan.org/catalog.ttl + +RDF representations will be advertised using `<link rel="alternate">` tags on the `<head>` section of the homepage and the dataset search page source code, eg: + + + + + + + + + + +The number of datasets returned is limited. The response will include paging info, serialized using the [Hydra](http://www.w3.org/ns/hydra/spec/latest/core/) vocabulary. The different terms are self-explanatory, and can be used by clients to iterate the catalog: + + @prefix hydra: <http://www.w3.org/ns/hydra/core#> . + + <http://example.com/catalog.ttl?page=1> a hydra:PagedCollection ; + hydra:first "http://example.com/catalog.ttl?page=1" ; + hydra:last "http://example.com/catalog.ttl?page=3" ; + hydra:next "http://example.com/catalog.ttl?page=2" ; + hydra:totalItems 283 . + +The default number of datasets returned (100) can be modified by CKAN site maintainers using the following configuration option on your ini file: + + ckanext.dcat.datasets_per_page = 20 + +The catalog endpoint also supports a `modified_since` parameter to restrict datasets to those modified from a certain date. 
The parameter value should be a valid ISO-8601 date: + +http://demo.ckan.org/catalog.xml?modified_since=2015-07-24 + +It's possible to specify the profile(s) to use for the serialization using the `profiles` parameter: + +http://demo.ckan.org/catalog.xml?profiles=euro_dcat_ap,sweden_dcat_ap + +To filter the output, the catalog endpoint supports the `q` and `fq` parameters to specify a [search query](https://lucene.apache.org/solr/guide/6_6/the-dismax-query-parser.html#TheDisMaxQueryParser-TheqParameter) or [filter query](https://lucene.apache.org/solr/guide/6_6/common-query-parameters.html#CommonQueryParameters-Thefq_FilterQuery_Parameter): + +http://demo.ckan.org/catalog.xml?q=budget +http://demo.ckan.org/catalog.xml?fq=tags:economy + + + +## URIs + +Whenever possible, URIs are generated for the relevant entities. To try to generate them, the extension will use the first of the following that is found for each entity: + +* Catalog: + - `ckanext.dcat.base_uri` configuration option value. This is the recommended approach. Value should be a valid URI. + - `ckan.site_url` configuration option value. + - 'http://' + `app_instance_uuid` configuration option value. This is not recommended, and a warning log message will be shown. + +* Dataset: + - The value of the `uri` field (note that this is not included in the default CKAN schema) + - The value of an extra with key `uri` + - Catalog URI (see above) + '/dataset/' + `id` field + +* Resource: + - The value of the `uri` field (note that this is not included in the default CKAN schema) + - Catalog URI (see above) + '/dataset/' + `package_id` field + '/resource/' + `id` field + +Note that if you are using the [RDF DCAT harvester](#rdf-dcat-harvester) to import datasets from other catalogs and these define a proper URI for each dataset or resource, these will be stored as `uri` fields in your instance, and thus used when generating serializations for them. 
+ + +## Content negotiation + +The extension supports returning different representations of the datasets based on the value of the `Accept` header ([Content negotiation](https://en.wikipedia.org/wiki/Content_negotiation)). + +When enabled, client applications can request a particular format via the `Accept` header on requests to the main dataset page, eg: + + curl https://{ckan-instance-host}/dataset/{dataset-id} -H Accept:text/turtle + + curl https://{ckan-instance-host}/dataset/{dataset-id} -H Accept:"application/rdf+xml; q=1.0, application/ld+json; q=0.6" + +This is also supported on the [catalog endpoint](#catalog-endpoint), in this case when making a request to the CKAN root URL (home page). This won't support the pagination and filter parameters: + + curl https://{ckan-instance-host} -H Accept:text/turtle + +Note that this feature overrides the CKAN core home page and dataset page controllers, so you probably don't want to enable it if your own extension is also doing it. + +To enable content negotiation, set the following configuration option on your ini file: + + ckanext.dcat.enable_content_negotiation = True diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 00000000..cbe6e134 --- /dev/null +++ b/docs/getting-started.md @@ -0,0 +1,64 @@ +# Getting started + +## Installation + + +1. Install the extension on your virtualenv: + + (pyenv) $ pip install -e git+https://github.com/ckan/ckanext-dcat.git#egg=ckanext-dcat + +2. Install the extension requirements: + + (pyenv) $ pip install -r ckanext-dcat/requirements.txt + +3. Enable the required plugins in your ini file: + + ckan.plugins = dcat dcat_rdf_harvester structured_data + +4. To use the pre-built schemas, install [ckanext-scheming](https://github.com/ckan/ckanext-scheming): + + pip install -e "git+https://github.com/ckan/ckanext-scheming.git#egg=ckanext-scheming" + +Check the [Schemas](#schemas) section for extra configuration needed. 
+ +Optionally, if you want to use the RDF harvester, install ckanext-harvest as well ([https://github.com/ckan/ckanext-harvest#installation](https://github.com/ckan/ckanext-harvest#installation)). + +For further configuration options available, see [Configuration reference](#configuration-reference). + +## Schemas + +The extension includes ready to use [ckanext-scheming](https://github.com/ckan/ckanext-scheming) schemas that enable DCAT support. These include a schema definition file (located in `ckanext/dcat/schemas`) plus extra validators and other custom logic that integrates the metadata modifications with the RDF DCAT [Parsers](#rdf-dcat-parser) and [Serializers](#rdf-dcat-serializer) and other CKAN features and extensions. + +There are the following schemas currently included with the extension: + +* *dcat_ap_2.1_recommended.yaml*: Includes the recommended properties for `dcat:Dataset` and `dcat:Distribution` according to the [DCAT 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) specification. +* *dcat_ap_2.1_full.yaml*: Includes most of the properties defined for `dcat:Dataset` and `dcat:Distribution` in the [DCAT 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) specification. + +Most sites will want to use these as a base to create their own custom schema to address their own requirements, perhaps alongside a [custom profile](#writing-custom-profiles). Of course site maintainers can add or remove schema fields, as well as change the existing validators. + +In any case, the schema file used should be defined in the configuration file, alongside these configuration options: + + # Make sure to add scheming_datasets after the dcat plugin + ckan.plugins = activity dcat [...] 
scheming_datasets + + # Point to one of the defaults or your own version of the schema file + scheming.dataset_schemas = ckanext.dcat.schemas:dcat_ap_2.1_recommended.yaml + + # Include the dcat presets as well as the standard scheming ones + scheming.presets = ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml + + # Sites using the euro_dcat_ap and euro_dcat_ap_2 profiles must add the + # euro_dcat_ap_scheming profile if they want to use ckanext-scheming schemas (see next section) + ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming + +### Compatibility with existing profiles + +Sites using the existing `euro_dcat_ap` and `euro_dcat_ap_2` profiles should not see any change in their +current parsing and serialization functionalities and these profiles will not change their outputs going +forward (unless a bug is being fixed). Sites willing to migrate to a scheming based metadata schema can do +so by adding the `euro_dcat_ap_scheming` profile at the end of their profile chain (e.g. +`ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming`), which will modify the existing profile +outputs to the expected format by the scheming validators. + +Note that the scheming profile will only affect fields defined in the schema definition file, so sites can start migrating gradually different metadata fields. + diff --git a/docs/google-dataset-search.md b/docs/google-dataset-search.md new file mode 100644 index 00000000..78c20554 --- /dev/null +++ b/docs/google-dataset-search.md @@ -0,0 +1,93 @@ +## Structured data and Google Dataset Search indexing + +There are plugins available to add [structured data](https://developers.google.com/search/docs/guides/intro-structured-data) to dataset pages to provide richer metadata for search engines crawling your site. One of the most well known is [Google Dataset Search](https://toolbox.google.com/datasetsearch). 
The `structured_data` plugin will add the necessary markup in order to get your datasets indexed by Google Dataset Search. This markup is a JSON-LD snippet that uses the [schema.org](https://schema.org) vocabulary to describe the dataset. + +To add [structured data](https://developers.google.com/search/docs/guides/intro-structured-data) to dataset pages, activate the `structured_data` and `dcat` plugins in your ini file: + + ckan.plugins = dcat structured_data + +By default this uses the `schemaorg` profile (see [profiles](#profiles)) to serialize the dataset to JSON-LD, which is then added to the dataset detail page. +To change the schema, you have to override the Jinja template block called `structured_data` in [`templates/package/read_base.html`](https://github.com/ckan/ckanext-dcat/blob/master/ckanext/dcat/templates/package/read_base.html) and call the template helper function with different parameters: + + {% block structured_data %} + + {% endblock %} + +Example output of structured data in JSON-LD: + + < ... > + + + + diff --git a/docs/harvester.md b/docs/harvester.md new file mode 100644 index 00000000..1d9e2a9b --- /dev/null +++ b/docs/harvester.md @@ -0,0 +1,66 @@ +## RDF DCAT harvester + +The RDF parser described in the previous section has been integrated into a harvester, +to allow automatic import of datasets from remote sources. To enable the RDF harvester, add the `dcat_rdf_harvester` plugin to your CKAN configuration file: + + ckan.plugins = ... dcat_rdf_harvester + +The harvester will download the remote file, extract all datasets using the parser and create or update actual CKAN datasets based on that. +It will also handle deletions, i.e. if a dataset is no longer present in the DCAT dump, it will get deleted from CKAN. + +The harvester will look at the `content-type` HTTP header field to determine the RDF format used. 
Any format understood by the [RDFLib](https://rdflib.readthedocs.org/en/stable/plugin_parsers.html) library can be parsed. It is possible to override this functionality and provide a specific format using the harvester configuration. This is useful when the server does not return the correct `content-type` or when harvesting a file on the local file system without a proper extension. The harvester configuration is a JSON object that you enter in the harvester configuration form field. + + {"rdf_format":"text/turtle"} + +*TODO*: configure profiles. + +### Maximum file size + +The default max size of the file (for each HTTP response) to harvest is currently 50 MB. The size can be customised by setting the configuration option `ckanext.dcat.max_file_size` in your CKAN configuration file. +Here's an example of setting the max file size to 100 MB: + +`ckanext.dcat.max_file_size = 100` + +### Transitive harvesting + +In transitive harvesting (i.e., when you harvest a catalog A, and a catalog X harvests your catalog), you may want to provide the original catalog info for each harvested dataset. + +By setting the configuration option `ckanext.dcat.expose_subcatalogs = True` in your ini file, you'll enable the storing and publication of the source catalog for each harvested dataset. + +The information contained in the harvested `dcat:Catalog` node will be stored as extras in the harvested datasets. +When serializing, your Catalog will expose the harvested Catalog using the `dct:hasPart` relation. This means that your catalog will have this structure: +- `dcat:Catalog` (represents your current catalog) + - `dcat:dataset` (1..n, the datasets created within your catalog) + - `dct:hasPart` + - `dcat:Catalog` (info of one of the harvested catalogs) + - `dcat:dataset` (dataset in the harvested catalog) + - `dct:hasPart` + - `dcat:Catalog` (info of another harvested catalog) + ... 
+ + +### Extending the RDF harvester + +The DCAT RDF harvester has extension points that allow to modify its behaviour from other extensions. These can be used by extensions implementing +the `IDCATRDFHarvester` interface. Right now it provides the following methods: + +* `before_download` and `after_download`: called just before and after retrieving the remote file, and can be used for instance to validate the contents. +* `update_session`: called before making the remote requests to update the `requests` session object, useful to add additional headers or for setting client certificates. Check the [`requests` documentation](http://docs.python-requests.org/en/master/user/advanced/#session-objects) for details. +* `before_create` / `after_create`: called before and after the `package_create` action has been performed +* `before_update` / `after_update`: called before and after the `package_update` action has been performed +* `after_parsing`: Called just after the content from the remote RDF file has been parsed + +To know more about these methods, please check the source of [`ckanext-dcat/ckanext/dcat/interfaces.py`](https://github.com/ckan/ckanext-dcat/blob/master/ckanext/dcat/interfaces.py). + +## JSON DCAT harvester + +The DCAT JSON harvester supports importing JSON objects that are based on DCAT terms but are not defined as JSON-LD. The exact format for these JSON files +is the one described in the [spec.dataportals.org](http://spec.dataportals.org/#datasets-serialization-format) site. There are [example files](https://github.com/ckan/ckanext-dcat/blob/master/examples/dataset.json) in the `examples` folder. + +To enable the JSON harvester, add the `dcat_json_harvester` plugin to your CKAN configuration file: + + ckan.plugins = ... dcat_json_harvester + +*TODO*: align the fields created by this harvester with the base mapping (ie the ones created by the RDF harvester). 
+ + diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..aaf6ce03 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,20 @@ +# ckanext-dcat + + +[![Tests](https://github.com/ckan/ckanext-dcat/workflows/Tests/badge.svg?branch=master)](https://github.com/ckan/ckanext-dcat/actions) +[![Code Coverage](http://codecov.io/github/ckan/ckanext-dcat/coverage.svg?branch=master)](http://codecov.io/github/ckan/ckanext-dcat?branch=master) + + +This extension provides plugins that allow CKAN to expose its metadata and consume metadata from other catalogs using RDF documents serialized using DCAT. The Data Catalog Vocabulary (DCAT) is "an RDF vocabulary designed to facilitate interoperability between data catalogs published on the Web". More information can be found on the following W3C page: + +[http://www.w3.org/TR/vocab-dcat](http://www.w3.org/TR/vocab-dcat) + +It also offers other features related to Semantic Data like exposing the necessary markup to get your datasets indexed in [Google Dataset Search](https://toolbox.google.com/datasetsearch). + +!!! todo + Overview with standards supported and high level concepts + + +Check the [overview](#overview) section for a summary of the available features. + + diff --git a/docs/mapping.md b/docs/mapping.md new file mode 100644 index 00000000..86b671c0 --- /dev/null +++ b/docs/mapping.md @@ -0,0 +1,353 @@ +## RDF DCAT to CKAN dataset mapping + +The following table provides a generic mapping between the fields of the `dcat:Dataset` and `dcat:Distribution` classes and +their equivalents in the CKAN model. In most cases this mapping is deliberately a loose one. For instance, it does not try to link +the DCAT publisher property with a CKAN dataset author, maintainer or organization, as the link between them is not straight-forward +and may depend on a particular instance needs. 
When mapping from CKAN metadata to DCAT though, there are in some cases fallback fields +that are used if the default field is not present (see [RDF Serializer](#rdf-dcat-serializer) for more details on this). + +This mapping is compatible with the [DCAT-AP v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11) and [DCAT-AP v2.1](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210). Which DCAT properties are mapped depends on the active profile(s) (see [Profiles](#profiles)). + +Sites are encouraged to use ckanext-scheming to manage their metadata schema (see [Schemas](#schemas) for all details). This changes in +some cases the way metadata is stored internally and presented at the CKAN API level, but should not affect the RDF DCAT output. + +| DCAT class | DCAT property | CKAN dataset field | CKAN fallback fields | Stored as | | +|-------------------|------------------------|-------------------------------------------|--------------------------------|-----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| +| dcat:Dataset | - | extra:uri | | text | See [URIs](#uris-1) | +| dcat:Dataset | dct:title | title | | text | | +| dcat:Dataset | dct:description | notes | | text | | +| dcat:Dataset | dcat:keyword | tags | | text | | +| dcat:Dataset | dcat:theme | extra:theme | | list | See [Lists](#lists) | +| dcat:Dataset | dct:identifier | extra:identifier | extra:guid, id | text | | +| dcat:Dataset | adms:identifier | extra:alternate_identifier | | text | | +| dcat:Dataset | dct:issued | extra:issued | metadata_created | text | | +| dcat:Dataset | dct:modified | extra:modified | metadata_modified | text | | +| dcat:Dataset | owl:versionInfo | version | extra:dcat_version | text | | +| dcat:Dataset | adms:versionNotes | 
extra:version_notes | | text | | +| dcat:Dataset | dct:language | extra:language | | list | See [Lists](#lists) | +| dcat:Dataset | dcat:landingPage | url | | text | | +| dcat:Dataset | dct:accrualPeriodicity | extra:frequency | | text | | +| dcat:Dataset | dct:conformsTo | extra:conforms_to | | list | See [Lists](#lists) | +| dcat:Dataset | dct:accessRights | extra:access_rights | | text | | +| dcat:Dataset | foaf:page | extra:documentation | | list | See [Lists](#lists) | +| dcat:Dataset | dct:provenance | extra:provenance | | text | | +| dcat:Dataset | dct:type | extra:dcat_type | | text | As of DCAT-AP v1.1 there's no controlled vocabulary for this field | +| dcat:Dataset | dct:hasVersion | extra:has_version | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | +| dcat:Dataset | dct:isVersionOf | extra:is_version_of | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | +| dcat:Dataset | dct:source | extra:source | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | +| dcat:Dataset | adms:sample | extra:sample | | list | See [Lists](#lists). 
It is assumed that these are one or more URIs referring to dcat:Distribution instances | +| dcat:Dataset | dct:spatial | extra:spatial_uri | | text | See [Spatial coverage](#spatial-coverage) | +| dcat:Dataset | dct:temporal | extra:temporal_start + extra:temporal_end | | text | None, one or both extras can be present | +| dcat:Dataset | dcat:temporalResolution| extra:temporal_resolution | | list | | +| dcat:Dataset | dcat:spatialResolutionInMeters| extra:spatial_resolution_in_meters | | list | | +| dcat:Dataset | dct:isReferencedBy | extra:is_referenced_by | | list | | +| dcat:Dataset | dct:publisher | extra:publisher_uri | | text | See [URIs](#uris-1) and [Publisher](#contact-points-and-publisher) | +| foaf:Agent | foaf:name | extra:publisher_name | | text | | +| foaf:Agent | foaf:mbox | extra:publisher_email | organization:title | text | | +| foaf:Agent | foaf:homepage | extra:publisher_url | | text | | +| foaf:Agent | dct:type | extra:publisher_type | | text | | +| dcat:Dataset | dcat:contactPoint | extra:contact_uri | | text | See [URIs](#uris-1) and [Contact points](#contact-points-and-publisher) | +| vcard:Kind | vcard:fn | extra:contact_name | maintainer, author | text | | +| vcard:Kind | vcard:hasEmail | extra:contact_email | maintainer_email, author_email | text | | +| dcat:Dataset | dcat:distribution | resources | | text | | +| dcat:Distribution | - | resource:uri | | text | See [URIs](#uris-1) | +| dcat:Distribution | dct:title | resource:name | | text | | +| dcat:Distribution | dcat:accessURL | resource:access_url | resource:url | text | If downloadURL is not present, accessURL will be used as resource url | +| dcat:Distribution | dcat:downloadURL | resource:download_url | | text | If present, downloadURL will be used as resource url | +| dcat:Distribution | dct:description | resource:description | | text | | +| dcat:Distribution | dcat:mediaType | resource:mimetype | | text | | +| dcat:Distribution | dct:format | resource:format | | text | | +| 
dcat:Distribution | dct:license | resource:license | | text | See [Licenses](#licenses) | +| dcat:Distribution | adms:status | resource:status | | text | | +| dcat:Distribution | dcat:byteSize | resource:size | | number | | +| dcat:Distribution | dct:issued | resource:issued | created | text | | +| dcat:Distribution | dct:modified | resource:modified | metadata_modified | text | | +| dcat:Distribution | dct:rights | resource:rights | | text | | +| dcat:Distribution | foaf:page | resource:documentation | | list | See [Lists](#lists) | +| dcat:Distribution | dct:language | resource:language | | list | See [Lists](#lists) | +| dcat:Distribution | dct:conformsTo | resource:conforms_to | | list | See [Lists](#lists) | +| dcat:Distribution | dcatap:availability | resource:availability | | text | | +| dcat:Distribution | dcat:compressFormat | resource:compress_format | | text | | +| dcat:Distribution | dcat:packageFormat | resource:package_format | | text | | +| dcat:Distribution | dcat:accessService | resource:access_services | | text | | +| dcat:DataService | dct:title | access_service:title | | text | | +| dcat:DataService | dcat:endpointURL | access_service:endpoint_url | | list | | +| dcat:DataService | dcat:endpointDescription| access_service:endpoint_description | | text | | +| dcat:DataService | dcatap:availability | access_service:availability | | text | | +| dcat:DataService | dcat:servesDataset | access_service:serves_dataset | | list | | +| dcat:DataService | dct:description | access_service:description | | text | | +| dcat:DataService | dct:license | access_service:license | | text | | +| dcat:DataService | dct:accessRights | access_service:access_rights | | text | | +| spdx:Checksum | spdx:checksumValue | resource:hash | | text | | +| spdx:Checksum | spdx:algorithm | resource:hash_algorithm | | text | | + +*Notes* + +### Custom fields + +Fields marked as `extra:` are stored as free form extras in the `euro_dcat_ap` and `euro_dcat_ap_2` profiles, +but stored 
as first level custom fields when using the scheming based profile (`euro_dcat_ap_scheming`), i.e: + + ```json + { + "name": "test_dataset_dcat", + "extras": [ + {"key": "version_notes", "value": "Some version notes"} + ] + } + ``` + + vs: + + ```json + { + "name": "test_dataset_dcat", + "version_notes": "Some version notes" + } + ``` + +### URIs + +Whenever possible, URIs are extracted and stored so there is a clear reference to the original RDF resource. +For instance: + + ```xml + + + + + Dataset 1 + + + Publishing Organization for dataset 1 + + + + + + ``` + + ```json + { + "title": "Dataset 1", + "extras": [ + {"key": "uri", "value": "http://data.some.org/catalog/datasets/1"}, + {"key": "publisher_uri", "value": "http://orgs.vocab.org/some-org"}, + {"key": "publisher_name", "value": "Publishing Organization for dataset 1"} + ] + } + ``` + + Another example: + + ``` + @prefix dcat: . + @prefix dct: . + @prefix rdf: . + + + a dcat:Dataset ; + dct:title "Dataset 1" ; + dcat:distribution + . + + + + a dcat:Distribution ; + dct:title "Distribution for dataset 1" ; + dcat:accessURL . + ``` + + ```json + { + "title": "Dataset 1", + "extras": [ + {"key": "uri", "value": "http://data.some.org/catalog/datasets/1"} + ], + "resources": [{ + "name": "Distribution for dataset 1", + "url": "http://data.some.org/catalog/datasets/1/downloads/1.csv", + "uri": "http://data.some.org/catalog/datasets/1/d/1" + }] + } + ``` + +### Lists + +On the legacy profiles, lists are stored as a JSON string, eg: + + ``` + @prefix dcat: . + @prefix dct: . + @prefix rdf: . 
+ + + a dcat:Dataset ; + dct:title "Dataset 1" ; + dct:language "ca" , "en" , "es" ; + dcat:theme "http://eurovoc.europa.eu/100142" , "http://eurovoc.europa.eu/209065", "Earth Sciences" ; + ``` + + ```json + { + "title": "Dataset 1", + "extras": [ + {"key": "uri", "value": "http://data.some.org/catalog/datasets/1"} + {"key": "language", "value": "[\"ca\", \"en\", \"es\"]"} + {"key": "theme", "value": "[\"Earth Sciences\", \"http://eurovoc.europa.eu/209065\", \"http://eurovoc.europa.eu/100142\"]"} + ], + } + ``` + +On the scheming-based ones, these are shown as actual lists: + + ```json + { + "title": "Dataset 1", + "uri": "http://data.some.org/catalog/datasets/1"}, + "language": ["ca", "en", "es"] + "theme": ["Earth Sciences", "http://eurovoc.europa.eu/209065", "http://eurovoc.europa.eu/100142"] + } + ``` +### Contact points and Publisher + +Properties for `dcat:contactPoint` and `dct:publisher` are stored as namespaced extras in the legacy profiles. When using +a scheming-based profile, these are stored as proper objects (and multiple instances are allowed for contact point): + +```json +{ + "name": "test_dataset_dcat", + "title": "Test dataset DCAT", + "extras": [ + {"key":"contact_name","value":"PointofContact"}, + {"key":"contact_email","value":"contact@some.org"} + ], +} +``` + +vs: + +```json +{ + "name": "test_dataset_dcat", + "title": "Test dataset DCAT", + "contact": [ + { + "name": "Point of Contact 1", + "email": "contact1@some.org" + }, + { + "name": "Point of Contact 2", + "email": "contact2@some.org" + }, + ] +} +``` + +If no `publisher` or `publisher_*` fields are found, the serializers will fall back to getting the publisher properties from the organization the CKAN dataset belongs to. The organization schema can be customized with the schema located in `ckanext/dcat/schemas/publisher_organization.yaml` to provide the extra properties supported (this will additionally require loading the `scheming_organizations` plugin in `ckan.plugins`). 
+ + +### Spatial coverage + + +The following formats for `dct:spatial` are supported by the default [parser](#rdf-dcat-parser). Note that the default [serializer](#rdf-dcat-serializer) will return the single `dct:spatial` instance form by default. + + - One `dct:spatial` instance, URI only + + ```xml + + ``` + + - One `dct:spatial` instance with text (this should not be used anyway) + + ```xml + Newark + ``` + + - One `dct:spatial` instance with label and/or geometry + + ```xml + + + + {"type": "Polygon", "coordinates": [[[175.0, 17.5], [-65.5, 17.5], [-65.5, 72.0], [175.0, 72.0], [175.0, 17.5]]]} + + + POLYGON ((175.0000 17.5000, -65.5000 17.5000, -65.5000 72.0000, 175.0000 72.0000, 175.0000 17.5000)) + + Newark + + + ``` + + - Multiple `dct:spatial` instances (as in GeoDCAT-AP) + + ```xml + + + + + {"type": "Polygon", "coordinates": [[[175.0, 17.5], [-65.5, 17.5], [-65.5, 72.0], [175.0, 72.0], [175.0, 17.5]]]} + + + POLYGON ((175.0000 17.5000, -65.5000 17.5000, -65.5000 72.0000, 175.0000 72.0000, 175.0000 17.5000)) + + + + + + Newark + + + ``` +If the RDF provides them, profiles should store the textual and geometric representation of the location in: + +* For legacy profiles in `spatial_text`, `spatial_bbox`, `spatial_centroid` or `spatial` (for any other geometries) extra fields +* For scheming-based profiles in objects in the `spatial_coverage` field, for instance: + +```json +{ + "name": "test_dataset_dcat", + "title": "Test dataset DCAT", + "spatial_coverage": [ + { + "geom": { + "type": "Polygon", + "coordinates": [...] 
+ }, + "text": "Tarragona", + "uri": "https://sws.geonames.org/6361390/", + "bbox": { + "type": "Polygon", + "coordinates": [ + [ + [-2.1604, 42.7611], + [-2.0938, 42.7611], + [-2.0938, 42.7931], + [-2.1604, 42.7931], + [-2.1604, 42.7611], + ] + ], + }, + "centroid": {"type": "Point", "coordinates": [1.26639, 41.12386]}, + } + ] +} +``` + + +### Licenses + +On the CKAN model, license is at the dataset level whereas in DCAT model it + is at distributions level. By default the RDF parser will try to find a + distribution with a license that matches one of those registered in CKAN + and attach this license to the dataset. The first matching distribution's + license is used, meaning that any discrepancy across distribution licenses + will not be accounted for. This behavior can be customized by overriding the + `_license` method on a custom profile. + + diff --git a/docs/profiles.md b/docs/profiles.md new file mode 100644 index 00000000..dfdadb7c --- /dev/null +++ b/docs/profiles.md @@ -0,0 +1,236 @@ +## RDF DCAT Parser + +The `ckanext.dcat.processors.RDFParser` class allows to read RDF serializations in different +formats and extract CKAN dataset dicts. It will look for DCAT datasets and distributions +and create CKAN datasets and resources, as dictionaries that can be passed to [`package_create`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.create.package_create) or [`package_update`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.update.package_update). 
+ +Here is a quick overview of how it works: + +```python + + from ckanext.dcat.processors import RDFParser, RDFParserException + + parser = RDFParser() + + # Parsing a local RDF/XML file + + with open('datasets.rdf', 'r') as f: + try: + parser.parse(f.read()) + + for dataset in parser.datasets(): + print('Got dataset with title {0}'.format(dataset['title'])) + + except RDFParserException as e: + print ('Error parsing the RDF file: {0}'.format(e)) + + # Parsing a remote JSON-LD file + + import requests + + parser = RDFParser() + + content = requests.get('https://some.catalog.org/datasets.jsonld').content + + try: + parser.parse(content, _format='json-ld') + + for dataset in parser.datasets(): + print('Got dataset with title {0}'.format(dataset['title'])) + + except RDFParserException as e: + print ('Error parsing the RDF file: {0}'.format(e)) + +``` + +The parser is implemented using [RDFLib](https://rdflib.readthedocs.org/), a Python library for working with RDF. Any +RDF serialization format supported by RDFLib can be parsed into CKAN datasets. The `examples` folder contains +serializations in different formats including RDF/XML, Turtle or JSON-LD. + +## RDF DCAT Serializer + +The `ckanext.dcat.processors.RDFSerializer` class generates RDF serializations in different +formats from CKAN dataset dicts, like the ones returned by [`package_show`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_show) or [`package_search`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_search). 
+ +Here is an example of how to use it: + +```python + + from ckanext.dcat.processors import RDFSerializer + + # Serializing a single dataset + + dataset = get_action('package_show')({}, {'id': 'my-dataset'}) + + serializer = RDFSerializer() + + dataset_ttl = serializer.serialize_dataset(dataset, _format='turtle') + + + # Serializing the whole catalog (or rather part of it) + + datasets = get_action('package_search')({}, {'q': '*:*', 'rows': 50}) + + serializer = RDFSerializer() + + catalog_xml = serializer.serialize_catalog({'title': 'My catalog'}, + dataset_dicts=datasets, + _format='xml') + + # Creating an RDFLib graph from a single dataset + + dataset = get_action('package_show')({}, {'id': 'my-dataset'}) + + serializer = RDFSerializer() + + dataset_reference = serializer.graph_from_dataset(dataset) + + # serializer.g now contains the full dataset graph, an RDFLib Graph class + +``` + +The serializer uses customizable [profiles](#profiles) to generate an RDF graph (an [RDFLib Graph class](https://rdflib.readthedocs.org/en/latest/apidocs/rdflib.html#rdflib.graph.Graph)). +By default these use the [mapping](#rdf-dcat-to-ckan-dataset-mapping) described in the previous section. + +In some cases, if the default CKAN field that maps to a DCAT property is not present, some other fallback +values will be used instead. For instance, if the `contact_email` field is not found, `maintainer_email` +and `author_email` will be used (if present) for the email property of the `adms:contactPoint` property. + +Note that the serializer will look both for a first level field or an extra field with the same key, ie both +the following values will be used for `dct:accrualPeriodicity`: + + { + "name": "my-dataset", + "frequency": "monthly", + ... + } + + { + "name": "my-dataset", + "extras": [ + {"key": "frequency", "value": "monthly"}, + ] + ... 
+ } + +Once the dataset graph has been obtained, this is serialized into a text format using [RDFLib](https://rdflib.readthedocs.org/), +so any format it supports can be obtained (common formats are 'xml', 'turtle' or 'json-ld'). + +### Inherit license from the dataset as fallback in distributions +It is possible to inherit the license from the dataset to the distributions, but only if there is no license defined in the resource yet. By default the license is not inherited from the dataset. This can be activated by setting the following parameter in the CKAN config file: + + ckanext.dcat.resource.inherit.license = True + + +## Profiles + +Both the parser and the serializer use profiles to allow customization of how the values defined in the RDF graph are mapped to CKAN and vice versa. + +Profiles define: + +* How the RDF graph values map into CKAN fields, ie how the RDF is parsed into CKAN datasets +* How CKAN fields map to an RDF graph, which can then be serialized +* How the CKAN catalog metadata maps to an RDF graph, which can then be serialized + +They essentially define the mapping between DCAT and CKAN. + +In most cases the default profile will provide a good mapping that will cover most properties described in the DCAT standard. If you want to extract extra fields defined in the RDF, are using a custom schema or +need custom logic, you can write a custom profile that extends or replaces the default one. + +The default profile is mostly based on the +[DCAT application profile for data portals in Europe](https://joinup.ec.europa.eu/asset/dcat_application_profile/description). It is actually fully compatible with [DCAT-AP v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11), and partially compatible with [DCAT-AP v2.1.0](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210). 
As mentioned before though, it should be generic enough for most DCAT based representations. + +Sites that want to support a particular version of the DCAT-AP can enable a specific profile using one of the methods below: + +* DCAT-AP v2.1.0 (default): `euro_dcat_ap_2` +* DCAT-AP v1.1.1: `euro_dcat_ap` + +This plugin also contains a profile to serialize a CKAN dataset to a [schema.org Dataset](http://schema.org/Dataset) called `schemaorg`. This is especially useful to provide [JSON-LD structured data](#structured-data). + +To define which profiles to use you can: + +1. Set the `ckanext.dcat.rdf.profiles` configuration option on your CKAN configuration file: + + ckanext.dcat.rdf.profiles = euro_dcat_ap sweden_dcat_ap + +2. When initializing a parser or serializer class, pass the profiles to be used as a parameter, eg: + +```python + + parser = RDFParser(profiles=['euro_dcat_ap', 'sweden_dcat_ap']) + + serializer = RDFSerializer(profiles=['euro_dcat_ap', 'sweden_dcat_ap']) +``` + +Note that in both cases the order in which you define them is important, as it is the order in which the profiles will be run. + + +### Writing custom profiles + +Internally, profiles are classes that define a particular set of methods called during the parsing process. +For instance, the `parse_dataset` method is called on each DCAT dataset found when parsing an RDF file, and should return a CKAN dataset. +Conversely, the `graph_from_dataset` method will be called when requesting an RDF representation for a dataset, and will need to generate the necessary RDF graph. + +Custom profiles should always extend the `ckanext.dcat.profiles.RDFProfile` class. This class has several helper +functions to make getting metadata from the RDF graph easier. These include helpers for getting fields for FOAF and VCard entities like the ones +used to define publishers or contact points. Check the source code of the `ckanext.dcat.profiles` module to see what is available. 
+ +Profiles can extend other profiles to avoid repeating rules, or can be completely independent. + +The following example shows a complete example of a profile built on top of the European DCAT-AP profile (`euro_dcat_ap`): + +```python + + from rdflib.namespace import Namespace + from ckanext.dcat.profiles import RDFProfile + + DCT = Namespace("http://purl.org/dc/terms/") + + + class SwedishDCATAPProfile(RDFProfile): + ''' + An RDF profile for the Swedish DCAT-AP recommendation for data portals + + It requires the European DCAT-AP profile (`euro_dcat_ap`) + ''' + + def parse_dataset(self, dataset_dict, dataset_ref): + + # Spatial label + spatial = self._object(dataset_ref, DCT.spatial) + if spatial: + spatial_label = self.g.label(spatial) + if spatial_label: + dataset_dict['extras'].append({'key': 'spatial_text', + 'value': str(spatial_label)}) + + return dataset_dict + + def graph_from_dataset(self, dataset_dict, dataset_ref): + + g = self.g + + spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri') + spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text') + + if spatial_uri: + spatial_ref = URIRef(spatial_uri) + else: + spatial_ref = BNode() + + if spatial_text: + g.add((dataset_ref, DCT.spatial, spatial_ref)) + g.add((spatial_ref, RDF.type, DCT.Location)) + g.add((spatial_ref, RDFS.label, Literal(spatial_text))) +``` + +Note how the dataset dict is passed between profiles so it can be further tweaked. 
+ +Extensions define their available profiles using the `ckan.rdf.profiles` in the `setup.py` file, as in this [example](https://github.com/ckan/ckanext-dcat/blob/cc5fcc7be0be62491301db719ce597aec7c684b0/setup.py#L37:L38) from this same extension: + + [ckan.rdf.profiles] + euro_dcat_ap=ckanext.dcat.profiles:EuropeanDCATAPProfile + euro_dcat_ap_2=ckanext.dcat.profiles:EuropeanDCATAP2Profile + schemaorg=ckanext.dcat.profiles:SchemaOrgProfile + + diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..49cb6597 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,20 @@ +site_name: ckanext-dcat +site_url: https://docs.ckan.org/projects/ckanext-dcat +theme: readthedocs + #theme: material + #theme: + # name: material +markdown_extensions: + - admonition + + +nav: + - 'index.md' + - 'getting-started.md' + - 'endpoints.md' + - DCAT CKAN mapping: 'mapping.md' + - 'profiles.md' + - 'harvester.md' + - Google Dataset Search: 'google-dataset-search.md' + - CLI: 'cli.md' + - Configuration reference: 'configuration.md' From 7fdc265492522d8b61d18ac6025497be9fcc13d0 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 28 Aug 2024 14:24:05 +0200 Subject: [PATCH 2/8] Switch to Material for Mkdocs, setup --- docs/_assets/ckan.ico | Bin 0 -> 1150 bytes docs/_assets/logo.png | Bin 0 -> 824 bytes docs/_css/extra.css | 27 ++++++++++++++++ mkdocs.yml | 71 +++++++++++++++++++++++++++++++++--------- 4 files changed, 84 insertions(+), 14 deletions(-) create mode 100644 docs/_assets/ckan.ico create mode 100644 docs/_assets/logo.png create mode 100644 docs/_css/extra.css diff --git a/docs/_assets/ckan.ico b/docs/_assets/ckan.ico new file mode 100644 index 0000000000000000000000000000000000000000..0d9295c77aa301fcd6751a4a499873e80a063388 GIT binary patch literal 1150 zcmZQzU<5(|0R|wcz>vYhz#zuJz@P!dKp~(AL>x#lFaYJQ@j@_|40NH1M@YFxUH$VS}FI-Nt8OZvIdqzNLUBf2h$HRV>yO9F!Y1$O<0Se7iJE+ zJ5cn4^v18jsTZ3WaQz^?ajOaHMK=Sa-)`^mf0l8ph}MfRW9k3?{z(t?Jxl*Nc^>-b W?0tkJ?BsRmpM_Q0PgprXE)4)>797+7 literal 0 
HcmV?d00001 diff --git a/docs/_assets/logo.png b/docs/_assets/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..3f631bfe1a6c2f76f19501bfcf22f8536b5bd93b GIT binary patch literal 824 zcmV-81IPS{P)Px%@<~KNRCr$P+D&T{K^O<{nP}SLD}iWYvkY+kgq=rzSVvNR38%_j~7O_z=x&GqK#YEz4b9eVAURgoM zW@pjf&UDVgWTcKTx?~OvY40IiIJf%zsbyK?Cx(VQs%#*3ZmKP`6C+Ab|+LbkpYLa3Mc zcXjh`M1?t?Q|>_gZejJ z&yk)Kgh<=AQ6Zga>>rqs4HAN+gbzeAN;p9zqJ$Sj5=yv1B%p*J1Q^aA04q8~^=6g@!#py&(Y6Gd+jA1L~RFrt_N!hm89h({E&Ks=zB z2QmP~Ob{0ob3yu`m<{595`p}OGSTW(|5e;VWZiOfyZ&&G_WPHK>-0E;=^$N6p`8fi zAU}smp&;jz$<0!!l#R#ZJkzM$y`F7se15XFHueXUo`94iwOf4v0000v literal 0 HcmV?d00001 diff --git a/docs/_css/extra.css b/docs/_css/extra.css new file mode 100644 index 00000000..aa607b5a --- /dev/null +++ b/docs/_css/extra.css @@ -0,0 +1,27 @@ +[data-md-color-scheme="ckan"] { + --md-primary-fg-color: #2980b9; + --md-primary-fg-color--light: #ECB7B7; + --md-primary-fg-color--dark: #90030C; +} + +[data-md-color-scheme="slate"] { + --md-primary-fg-color: #2980b9; + --md-primary-fg-color--light: #ECB7B7; + --md-primary-fg-color--dark: #90030C; + --md-hue: 210; +} + + +[data-md-toggle="search"]:not(:checked) ~ .md-header .md-search__form::after { + position: absolute; + top: .3rem; + right: .3rem; + display: block; + padding: .1rem .4rem; + color: var(--md-default-bg-color); + font-weight: bold; + font-size: .8rem; + border: .05rem solid var(--md-default-bg-color--lighter); + border-radius: .1rem; + content: "/"; +} diff --git a/mkdocs.yml b/mkdocs.yml index 49cb6597..2daa443a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,20 +1,63 @@ site_name: ckanext-dcat -site_url: https://docs.ckan.org/projects/ckanext-dcat -theme: readthedocs - #theme: material - #theme: - # name: material +#site_url: https://docs.ckan.org/projects/ckanext-dcat +site_url: http://localhost:8000 + +site_description: >- + The documentation of ckanext-dcat, a CKAN extension that allows to expose + and consume metadata semantically using the DCAT standard. 
+ + +repo_name: ckan/ckanext-dcat +repo_url: https://github.com/ckan/ckanext-dcat + +theme: + name: material + palette: + + # Palette toggle for light mode + - media: "(prefers-color-scheme: light)" + scheme: ckan + toggle: + icon: material/toggle-switch + name: Switch to dark mode + + # Palette toggle for dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + toggle: + icon: material/toggle-switch-off + name: Switch to light mode + + logo: _assets/logo.png + favicon: _assets/ckan.ico + features: + - navigation.expand + - navigation.footer + - navigation.instant + - navigation.instant.progress + - search.suggest + - search.highlight + - toc.integrate + +plugins: + - search + markdown_extensions: - admonition +extra_css: + - _css/extra.css nav: - - 'index.md' - - 'getting-started.md' - - 'endpoints.md' - - DCAT CKAN mapping: 'mapping.md' - - 'profiles.md' - - 'harvester.md' - - Google Dataset Search: 'google-dataset-search.md' - - CLI: 'cli.md' - - Configuration reference: 'configuration.md' + - Home: 'index.md' + - Getting started: 'getting-started.md' + - DCAT support: + - 'endpoints.md' + - DCAT CKAN mapping: 'mapping.md' + - 'profiles.md' + - Other features: + - 'harvester.md' + - Google Dataset Search: 'google-dataset-search.md' + - CLI: 'cli.md' + - Configuration reference: 'configuration.md' + - CHANGELOG: 'https://github.com/ckan/ckanext-dcat/blob/master/CHANGELOG.md' From ecb9679f0b7a7b2b038c52d6c52f6bdbe8c474ad Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 28 Aug 2024 14:28:54 +0200 Subject: [PATCH 3/8] Install docs requirements --- .readthedocs.yaml | 14 +++----------- docs/requirements.txt | 1 + 2 files changed, 4 insertions(+), 11 deletions(-) create mode 100644 docs/requirements.txt diff --git a/.readthedocs.yaml b/.readthedocs.yaml index a5a3ba50..35b023e7 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -4,22 +4,14 @@ # Required version: 2 - -# Set the version of Python and other tools you might need - build: os: 
ubuntu-22.04 tools: python: "3.12" - mkdocs: configuration: mkdocs.yml - -# Optionally declare the Python requirements required to build your docs - -#python: -# install: -# - requirements: docs/requirements.txt - +python: + install: + - requirements: docs/requirements.txt diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..4c8f017d --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1 @@ +mkdocs-material From f91f92b1228858d7c2d52feec51c369d4a160507 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 29 Aug 2024 13:08:49 +0200 Subject: [PATCH 4/8] Full docs review --- README.md | 13 +- ckanext/dcat/config_declaration.yml | 7 +- docs/cli.md | 3 - docs/configuration.md | 13 +- docs/endpoints.md | 111 ++++---- docs/getting-started.md | 47 ++-- docs/google-dataset-search.md | 148 +++++----- docs/harvester.md | 27 +- docs/index.md | 113 +++++++- docs/mapping.md | 419 ++++++++++++++-------------- docs/profiles.md | 293 +++++++++---------- mkdocs.yml | 16 +- 12 files changed, 681 insertions(+), 529 deletions(-) diff --git a/README.md b/README.md index c8a595c0..b8d9cce5 100644 --- a/README.md +++ b/README.md @@ -877,7 +877,7 @@ This plugin also contains a profile to serialize a CKAN dataset to a [schema.org To define which profiles to use you can: -1. Set the `ckanext.dcat.rdf.profiles` configuration option on your CKAN configuration file: +1. Set the [`ckanext.dcat.rdf.profiles`](configuration.md#ckanextdcatrdfprofiles) configuration option on your CKAN configuration file: ckanext.dcat.rdf.profiles = euro_dcat_ap sweden_dcat_ap @@ -1166,6 +1166,15 @@ Default value: `True` Whether to expose the catalog and dataset endpoints with the RDF DCAT serializations. +#### ckanext.dcat.base_uri + +Example: + +``` +https://my-site.org/uris/ +``` + +Base URI to use when generating URIs for all entities. It needs to be a valid URI value. #### ckanext.dcat.catalog_endpoint @@ -1181,7 +1190,7 @@ Custom route for the catalog endpoint. 
It should start with `/` and include the `{_format}` placeholder. -#### ckanext.dcat.dataset_per_page +#### ckanext.dcat.datasets_per_page Default value: `100` diff --git a/ckanext/dcat/config_declaration.yml b/ckanext/dcat/config_declaration.yml index bdda415f..9b21384b 100644 --- a/ckanext/dcat/config_declaration.yml +++ b/ckanext/dcat/config_declaration.yml @@ -63,6 +63,11 @@ groups: serializations. type: bool + - key: ckanext.dcat.base_uri + description: | + Base URI to use when generating URIs for all entities. It needs to be a valid URI value. + example: 'https://my-site.org/uri/' + - key: ckanext.dcat.catalog_endpoint default: '/catalog.{_format}' description: | @@ -70,7 +75,7 @@ groups: `{_format}` placeholder. example: '/dcat/catalog/{_format}' - - key: ckanext.dcat.dataset_per_page + - key: ckanext.dcat.datasets_per_page default: 100 type: int description: | diff --git a/docs/cli.md b/docs/cli.md index f9585ede..46ce256c 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -1,5 +1,3 @@ -## CLI - The `ckan dcat` command offers utilites to transform between DCAT RDF Serializations and CKAN datasets (`ckan dcat consume`) and viceversa (`ckan dcat produce`). In both cases the input can be provided as a path to a file: @@ -16,4 +14,3 @@ The latter form allows chaininig commands for more complex metadata processing, curl https://demo.ckan.org/api/action/package_search | jq .result.results | ckan dcat produce -f jsonld - For the full list of options check `ckan dcat consume --help` and `ckan dcat produce --help`. - diff --git a/docs/configuration.md b/docs/configuration.md index 3c8d3741..e9032aa9 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1,5 +1,3 @@ -## Configuration reference - ### General settings @@ -73,6 +71,15 @@ Default value: `True` Whether to expose the catalog and dataset endpoints with the RDF DCAT serializations. 
+#### ckanext.dcat.base_uri + +Example: + +``` +https://my-site.org/uris/ +``` + +Base URI to use when generating URIs for all entities. It needs to be a valid URI value. #### ckanext.dcat.catalog_endpoint @@ -88,7 +95,7 @@ Custom route for the catalog endpoint. It should start with `/` and include the `{_format}` placeholder. -#### ckanext.dcat.dataset_per_page +#### ckanext.dcat.datasets_per_page Default value: `100` diff --git a/docs/endpoints.md b/docs/endpoints.md index bfdf36d5..f3fd6585 100644 --- a/docs/endpoints.md +++ b/docs/endpoints.md @@ -1,15 +1,13 @@ # RDF DCAT endpoints -By default when the `dcat` plugin is enabled, the following RDF endpoints are available on your CKAN instance. The schema used on the serializations can be customized using [profiles](#profiles). +By default, when the `dcat` plugin is enabled, the following RDF endpoints are available on your CKAN instance. The schema used on the serializations can be customized using [profiles](profiles.md#profiles). -To disable the RDF endpoints, you can set the following config in your ini file: - - ckanext.dcat.enable_rdf_endpoints = False +To disable the RDF endpoints, you can set the [`ckanext.dcat.enable_rdf_endpoints`](configuration.md#ckanextdcatenable_rdf_endpoints) option in your ini file. ## Dataset endpoints -RDF representations of a particular dataset can accessed using the following endpoint: +RDF representations of a particular dataset can be accessed using the following endpoint: https://{ckan-instance-host}/dataset/{dataset-id}.{format} @@ -26,32 +24,32 @@ The fallback `rdf` format defaults to RDF/XML. 
Here's an example of the different formats: -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.rdf -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.xml -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.ttl -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.n3 -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.jsonld - -RDF representations will be advertised using `` tags on the `` sectionon the dataset page source code, eg: +* [https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.rdf](https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.rdf) +* [https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.xml](https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.xml) +* [https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.ttl](https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.ttl) +* [https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.n3](https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.n3) +* [https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.jsonld](https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.jsonld) - +RDF representations will be advertised using `` tags on the `` section of the dataset page source code, e.g.: - - - +```html + - + + + + +``` -Check the [RDF DCAT Serializer](#rdf-dcat-serializer) section for more details about how these are generated and how to customize the output using [profiles](#profiles). +Check the [RDF DCAT Serializer](profiles.md#rdf-dcat-serializer) section for more details about how these are generated and how to customize the output using [profiles](profiles.md#profiles). 
You can specify the profile by using the `profiles=,` query parameter on the dataset endpoint (as a comma-separated list): -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.xml?profiles=euro_dcat_ap -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.jsonld?profiles=schemaorg +* [https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.xml?profiles=euro_dcat_ap](https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.xml?profiles=euro_dcat_ap) +* [https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.jsonld?profiles=schemaorg](https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.jsonld?profiles=schemaorg) -*Note*: When using this plugin, the above endpoints will replace the old deprecated ones that were part of CKAN core. ## Catalog endpoint @@ -60,7 +58,7 @@ Additionally to the individual dataset representations, the extension also offer https://{ckan-instance-host}/catalog.{format}?[page={page}]&[modified_since={date}]&[profiles={profile1},{profile2}]&[q={query}]&[fq={filter query}] -This endpoint can be customized if necessary using the `ckanext.dcat.catalog_endpoint` configuration option, eg: +This endpoint base path can be customized if necessary using the [`ckanext.dcat.catalog_endpoint`](configuration.md#ckanextdcatcatalog_endpoint) configuration option, eg: ckanext.dcat.catalog_endpoint = /dcat/catalog/{_format} @@ -72,44 +70,47 @@ As described previously, the extension will determine the RDF serialization form * http://demo.ckan.org/catalog.xml * http://demo.ckan.org/catalog.ttl -RDF representations will be advertised using `` tags on the `` sectionon the homepage and the dataset search page source code, eg: +RDF representations will be advertised using `` tags on the `` section of the catalog homepage and the dataset search page source code, eg: - +```html + + + + + - - - - + +``` - +The number of datasets returned is limited. 
The response will include paging info, serialized using the [Hydra](http://www.w3.org/ns/hydra/spec/latest/core/) vocabulary. The different properties are self-explanatory, and can be used by clients to iterate the catalog: -The number of datasets returned is limited. The response will include paging info, serialized using the [Hydra](http://www.w3.org/ns/hydra/spec/latest/core/) vocabulary. The different terms are self-explanatory, and can be used by clients to iterate the catalog: +```turtle +@prefix hydra: . - @prefix hydra: . + a hydra:PagedCollection ; + hydra:first "http://example.com/catalog.ttl?page=1" ; + hydra:last "http://example.com/catalog.ttl?page=3" ; + hydra:next "http://example.com/catalog.ttl?page=2" ; + hydra:totalItems 283 . +``` - a hydra:PagedCollection ; - hydra:first "http://example.com/catalog.ttl?page=1" ; - hydra:last "http://example.com/catalog.ttl?page=3" ; - hydra:next "http://example.com/catalog.ttl?page=2" ; - hydra:totalItems 283 . +The default number of datasets returned (100) can be modified by CKAN site maintainers using the [`ckanext.dcat.datasets_per_page`](configuration.md#ckanextdcatdatasets_per_page) configuration option. -The default number of datasets returned (100) can be modified by CKAN site maintainers using the following configuration option on your ini file: +The catalog endpoint also supports a `modified_since` parameter to restrict datasets to those modified from a certain date. The parameter value should be a valid ISO-8601 date: - ckanext.dcat.datasets_per_page = 20 + http://demo.ckan.org/catalog.xml?modified_since=2015-07-24 -The catalog endpoint also supports a `modified_since` parameter to restrict datasets to those modified from a certain date. 
The parameter value should be a valid ISO-8601 date: +It is possible to specify the profile(s) to use for the serialization using the `profiles` parameter: -http://demo.ckan.org/catalog.xml?modified_since=2015-07-24 + http://demo.ckan.org/catalog.xml?profiles=euro_dcat_ap,sweden_dcat_ap -It's possible to specify the profile(s) to use for the serialization using the `profiles` parameter: +To filter the output, the catalog endpoint supports the `q` and `fq` parameters to specify a [search query](https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#q-parameter) or [filter query](https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html#fq-filter-query-parameter): -http://demo.ckan.org/catalog.xml?profiles=euro_dcat_ap,sweden_dcat_ap -To filter the output, the catalog endpoint supports the `q` and `fq` parameters to specify a [search query](https://lucene.apache.org/solr/guide/6_6/the-dismax-query-parser.html#TheDisMaxQueryParser-TheqParameter) or [filter query](https://lucene.apache.org/solr/guide/6_6/common-query-parameters.html#CommonQueryParameters-Thefq_FilterQuery_Parameter): -http://demo.ckan.org/catalog.xml?q=budget -http://demo.ckan.org/catalog.xml?fq=tags:economy + http://demo.ckan.org/catalog.xml?q=budget + http://demo.ckan.org/catalog.xml?fq=tags:economy @@ -118,8 +119,8 @@ http://demo.ckan.org/catalog.xml?fq=tags:economy Whenever possible, URIs are generated for the relevant entities. To try to generate them, the extension will use the first found of the following for each entity: * Catalog: - - `ckanext.dcat.base_uri` configuration option value. This is the recommended approach. Value should be a valid URI - - `ckan.site_url` configuration option value. + - [`ckanext.dcat.base_uri`](configuration.md#ckanextdcatbase_uri) configuration option value. This is the recommended approach. Value should be a valid URI. 
+ - [`ckan.site_url`](https://docs.ckan.org/en/latest/maintaining/configuration.html#ckan-site-url) configuration option value. - 'http://' + `app_instance_uuid` configuration option value. This is not recommended, and a warning log message will be shown. * Dataset: @@ -131,12 +132,18 @@ Whenever possible, URIs are generated for the relevant entities. To try to gener - The value of the `uri` field (note that this is not included in the default CKAN schema) - Catalog URI (see above) + '/dataset/' + `package_id` field + '/resource/ + `id` field -Note that if you are using the [RDF DCAT harvester](#rdf-dcat-harvester) to import datasets from other catalogs and these define a proper URI for each dataset or resource, these will be stored as `uri` fields in your instance, and thus used when generating serializations for them. +Note that if you are using the [RDF DCAT harvester](harvester.md) to import datasets from other catalogs and these define a proper URI for each dataset or resource, these will be stored as `uri` fields in your instance, and so used when generating serializations for them. ## Content negotiation -The extension supports returning different representations of the datasets based on the value of the `Accept` header ([Content negotiation](https://en.wikipedia.org/wiki/Content_negotiation)). +The extension supports returning different representations of the datasets based on the value of the `Accept` header ([Content negotiation](https://en.wikipedia.org/wiki/Content_negotiation)). This is turned off by default; to enable it, set [`ckanext.dcat.enable_content_negotiation`](configuration.md#ckanextdcatenable_content_negotiation) to `True`. + +!!! Note + + This feature overrides the CKAN core home page and dataset page view routes, + so you probably don't want to enable it if your own extension is also doing it. 
+ When enabled, client applications can request a particular format via the `Accept` header on requests to the main dataset page, eg: @@ -147,9 +154,3 @@ When enabled, client applications can request a particular format via the `Accep This is also supported on the [catalog endpoint](#catalog-endpoint), in this case when making a request to the CKAN root URL (home page). This won't support the pagination and filter parameters: curl https://{ckan-instance-host} -H Accept:text/turtle - -Note that this feature overrides the CKAN core home page and dataset page controllers, so you probably don't want to enable it if your own extension is also doing it. - -To enable content negotiation, set the following configuration option on your ini file: - - ckanext.dcat.enable_content_negotiation = True diff --git a/docs/getting-started.md b/docs/getting-started.md index cbe6e134..5310b4ad 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -3,7 +3,7 @@ ## Installation -1. Install the extension on your virtualenv: +1. Install the extension in your virtualenv: (pyenv) $ pip install -e git+https://github.com/ckan/ckanext-dcat.git#egg=ckanext-dcat @@ -21,44 +21,51 @@ Check the [Schemas](#schemas) section for extra configuration needed. -Optionally, if you want to use the RDF harvester, install ckanext-harvest as well ([https://github.com/ckan/ckanext-harvest#installation](https://github.com/ckan/ckanext-harvest#installation)). +Optionally, if you want to use the RDF harvester, install [ckanext-harvest](https://github.com/ckan/ckanext-harvest#installation) as well. -For further configuration options available, see [Configuration reference](#configuration-reference). +For further configuration options available, see [Configuration reference](configuration.md). ## Schemas -The extension includes ready to use [ckanext-scheming](https://github.com/ckan/ckanext-scheming) schemas that enable DCAT support. 
These include a schema definition file (located in `ckanext/dcat/schemas`) plus extra validators and other custom logic that integrates the metadata modifications with the RDF DCAT [Parsers](#rdf-dcat-parser) and [Serializers](#rdf-dcat-serializer) and other CKAN features and extensions. +The extension includes ready to use [ckanext-scheming](https://github.com/ckan/ckanext-scheming) schemas +that enable DCAT support. These include a schema definition file (located +in [`ckanext/dcat/schemas`](https://github.com/ckan/ckanext-dcat/tree/master/ckanext/dcat/schemas)) +plus extra validators and other custom logic that integrates the metadata modifications with the +RDF DCAT [Parsers](profiles.md#rdf-dcat-parser) and [Serializers](profiles.md#rdf-dcat-serializer) and other CKAN features and extensions. There are the following schemas currently included with the extension: -* *dcat_ap_2.1_recommended.yaml*: Includes the recommended properties for `dcat:Dataset` and `dcat:Distribution` according to the [DCAT 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) specification. -* *dcat_ap_2.1_full.yaml*: Includes most of the properties defined for `dcat:Dataset` and `dcat:Distribution` in the [DCAT 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) specification. +* *dcat_ap_recommended.yaml*: Includes the recommended properties for `dcat:Dataset` and `dcat:Distribution` according to the DCAT AP specification. You can use this schema with the `euro_dcat_ap_2` (+ `euro_dcat_ap_scheming`) and `euro_dcat_ap_3` profiles. +* *dcat_ap_full.yaml*: Includes most of the properties defined for `dcat:Dataset` and `dcat:Distribution` in the [DCAT AP v2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) and [DCAT AP v3](https://semiceu.github.io/DCAT-AP/releases/3.0.0/) specification. You can use this schema with the `euro_dcat_ap_2` (+ `euro_dcat_ap_scheming`) and `euro_dcat_ap_3` profiles. 
-Most sites will want to use these as a base to create their own custom schema to address their own requirements, perhaps alongside a [custom profile](#writing-custom-profiles). Of course site maintainers can add or remove schema fields, as well as change the existing validators. +Most sites will want to use these as a base to create their own custom schema to address their own requirements, perhaps alongside a [custom profile](profiles.md#profiles). Of course site maintainers can add or remove schema fields, as well as change the existing validators. In any case, the schema file used should be defined in the configuration file, alongside these configuration options: +```ini +# Make sure to add scheming_datasets after the dcat plugin +ckan.plugins = activity dcat [...] scheming_datasets - # Make sure to add scheming_datasets after the dcat plugin - ckan.plugins = activity dcat [...] scheming_datasets +# Point to one of the defaults or your own version of the schema file +scheming.dataset_schemas = ckanext.dcat.schemas:dcat_ap_recommended.yaml - # Point to one of the defaults or your own version of the schema file - scheming.dataset_schemas = ckanext.dcat.schemas:dcat_ap_2.1_recommended.yaml +# Include the dcat presets as well as the standard scheming ones +scheming.presets = ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml - # Include the dcat presets as well as the standard scheming ones - scheming.presets = ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml - - # Sites using the euro_dcat_ap and euro_dcat_ap_2 profiles must add the - # euro_dcat_ap_scheming profile if they want to use ckanext-scheming schemas (see next section) - ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming +# Sites using the euro_dcat_ap and euro_dcat_ap_2 profiles must add the +# euro_dcat_ap_scheming profile if they want to use ckanext-scheming schemas (see next section) +ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming +``` ### 
Compatibility with existing profiles Sites using the existing `euro_dcat_ap` and `euro_dcat_ap_2` profiles should not see any change in their -current parsing and serialization functionalities and these profiles will not change their outputs going -forward (unless a bug is being fixed). Sites willing to migrate to a scheming based metadata schema can do +current parsing and serialization functionalities, and these profiles will not change their outputs going +forward (unless a bug is being fixed). + +Sites willing to migrate to a ckanext-scheming based metadata schema can do so by adding the `euro_dcat_ap_scheming` profile at the end of their profile chain (e.g. `ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming`), which will modify the existing profile -outputs to the expected format by the scheming validators. +outputs to the format expected by the scheming validators. Note that the scheming profile will only affect fields defined in the schema definition file, so sites can start migrating gradually different metadata fields. diff --git a/docs/google-dataset-search.md b/docs/google-dataset-search.md index 78c20554..9710e45b 100644 --- a/docs/google-dataset-search.md +++ b/docs/google-dataset-search.md @@ -1,12 +1,10 @@ ## Structured data and Google Dataset Search indexing -There are plugins available to add [structured data](https://developers.google.com/search/docs/guides/intro-structured-data) to dataset pages to provide richer metadata for search engines crawling your site. One of the most well known is [Google Dataset Search](https://toolbox.google.com/datasetsearch). The `structured_data` plugin will add the necessary markup in order to get your datasets indexed by Google Dataset Search. This markup is a JSON-LD snippet that uses the [schema.org](https://schema.org) vocabulary to describe the dataset. 
+The `structured_data` plugin will add the necessary markup to dataset pages in order to get your datasets indexed by [Google Dataset Search](https://toolbox.google.com/datasetsearch). This markup is a [structured data](https://developers.google.com/search/docs/guides/intro-structured-data) JSON-LD snippet that uses the [schema.org](https://schema.org) vocabulary to describe the dataset. -To add [structured data](https://developers.google.com/search/docs/guides/intro-structured-data) to dataset pages, activate the `structured_data` and `dcat` plugins in your ini file: + ckan.plugins = dcat structured_data - ckan.plugins = dcat structured_data - -By default this uses the `schemaorg` profile (see [profiles](#profiles)) to serialize the dataset to JSON-LD, which is then added to the dataset detail page. +By default this uses the `schemaorg` profile (see [Profiles](profiles.md#profiles)) to serialize the dataset to JSON-LD, which is then added to the dataset detail page. To change the schema, you have to override the Jinja template block called `structured_data` in [`templates/package/read_base.html`](https://github.com/ckan/ckanext-dcat/blob/master/ckanext/dcat/templates/package/read_base.html) and call the template helper function with different parameters: {% block structured_data %} @@ -17,77 +15,77 @@ To change the schema, you have to override the Jinja template block called `stru Example output of structured data in JSON-LD: - < ... 
> - - - - + }, + { + "@id": "_:Nb9677036512840e1a00c9fec2818abe4", + "@type": "schema:ContactPoint", + "schema:contactType": "customer service", + "schema:email": "contact@example.com", + "schema:name": "Public Transport Support", + "schema:url": "https://public-transport.example.com" + } + ] + } + + + +``` diff --git a/docs/harvester.md b/docs/harvester.md index 1d9e2a9b..0a0b2aa2 100644 --- a/docs/harvester.md +++ b/docs/harvester.md @@ -1,12 +1,12 @@ ## RDF DCAT harvester -The RDF parser described in the previous section has been integrated into a harvester, -to allow automatic import of datasets from remote sources. To enable the RDF harvester, add the `dcat_rdf_harvester` plugin to your CKAN configuration file: +The [RDF parser](profiles.md#rdf-dcat-parser) described in the previous section has been integrated into a harvester, +to allow automatic import of datasets from remote sources. To enable the RDF harvester, add the `harvest` and `dcat_rdf_harvester` plugins to your CKAN configuration file (you will also need to install [ckanext-harvest](https://github.com/ckan/ckanext-harvest)): - ckan.plugins = ... dcat_rdf_harvester + ckan.plugins = ... harvest dcat_rdf_harvester The harvester will download the remote file, extract all datasets using the parser and create or update actual CKAN datasets based on that. -It will also handle deletions, ie if a dataset is not present any more in the DCAT dump anymore it will get deleted from CKAN. +It will also handle deletions, i.e. if a dataset is not present any more in the DCAT dump anymore it will get deleted from CKAN. The harvester will look at the `content-type` HTTP header field to determine the used RDF format. Any format understood by the [RDFLib](https://rdflib.readthedocs.org/en/stable/plugin_parsers.html) library can be parsed. It is possible to override this functionality and provide a specific format using the harvester configuration. 
This is useful when the server does not return the correct `content-type` or when harvesting a file on the local file system without a proper extension. The harvester configuration is a JSON object that you fill into the harvester configuration form field. @@ -16,19 +16,17 @@ The harvester will look at the `content-type` HTTP header field to determine the ### Maximum file size -The default max size of the file (for each HTTP response) to harvest is actually 50 MB. The size can be customised by setting the configuration option `ckanext.dcat.max_file_size` to your CKAN configuration file. -Here‘s an example of setting the max file size to 100 MB: - -`ckanext.dcat.max_file_size = 100` +The default max size of the file (for each HTTP response) to harvest is 50 MB. The size can be customised by setting the configuration option [`ckanext.dcat.max_file_size`](configuration.md#ckanextdcatmax_file_size) in your CKAN configuration file. ### Transitive harvesting In transitive harvesting (i.e., when you harvest a catalog A, and a catalog X harvests your catalog), you may want to provide the original catalog info for each harvested dataset. -By setting the configuration option `ckanext.dcat.expose_subcatalogs = True` in your ini file, you'll enable the storing and publication of the source catalog for each harvested dataset. +By setting the configuration option [`ckanext.dcat.expose_subcatalogs`](configuration.md#ckanextdcatexpose_subcatalogs) to true in your ini file, you'll enable the storing and publication of the source catalog for each harvested dataset. The information contained in the harvested `dcat:Catalog` node will be stored as extras into the harvested datasets. When serializing, your Catalog will expose the harvested Catalog using the `dct:hasPart` relation. 
This means that your catalog will have this structure: + - `dcat:Catalog` (represents your current catalog) - `dcat:dataset` (1..n, the dataset created withing your catalog) - `dct:hasPart` @@ -41,7 +39,7 @@ When serializing, your Catalog will expose the harvested Catalog using the `dct: ### Extending the RDF harvester -The DCAT RDF harvester has extension points that allow to modify its behaviour from other extensions. These can be used by extensions implementing +The DCAT RDF harvester has plugin hooks that allow other extensions to modify its behaviour. These can be used by extensions implementing the `IDCATRDFHarvester` interface. Right now it provides the following methods: * `before_download` and `after_download`: called just before and after retrieving the remote file, and can be used for instance to validate the contents. @@ -52,7 +50,10 @@ the `IDCATRDFHarvester` interface. Right now it provides the following methods: To know more about these methods, please check the source of [`ckanext-dcat/ckanext/dcat/interfaces.py`](https://github.com/ckan/ckanext-dcat/blob/master/ckanext/dcat/interfaces.py). -## JSON DCAT harvester +## JSON DCAT harvester (deprecated) + +!!! Warning + The DCAT JSON harvester is not maintained and will be removed in future versions The DCAT JSON harvester supports importing JSON objects that are based on DCAT terms but are not defined as JSON-LD. The exact format for these JSON files is the one described in the [spec.dataportals.org](http://spec.dataportals.org/#datasets-serialization-format) site. There are [example files](https://github.com/ckan/ckanext-dcat/blob/master/examples/dataset.json) in the `examples` folder. @@ -60,7 +61,3 @@ is the one described in the [spec.dataportals.org](http://spec.dataportals.org/# To enable the JSON harvester, add the `dcat_json_harvester` plugin to your CKAN configuration file: ckan.plugins = ... 
dcat_json_harvester - -*TODO*: align the fields created by this harvester with the base mapping (ie the ones created by the RDF harvester). - - diff --git a/docs/index.md b/docs/index.md index aaf6ce03..ff7c7e3d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -5,16 +5,117 @@ [![Code Coverage](http://codecov.io/github/ckan/ckanext-dcat/coverage.svg?branch=master)](http://codecov.io/github/ckan/ckanext-dcat?branch=master) -This extension provides plugins that allow CKAN to expose its metadata and consume metadata from other catalogs using RDF documents serialized using DCAT. The Data Catalog Vocabulary (DCAT) is "an RDF vocabulary designed to facilitate interoperability between data catalogs published on the Web". More information can be found on the following W3C page: +Ckanext-dcat is a [CKAN](https://github.com/ckan/ckan) extension that helps data publishers expose and consume metadata as serialized RDF documents using [DCAT](http://www.w3.org/TR/vocab-dcat). -[http://www.w3.org/TR/vocab-dcat](http://www.w3.org/TR/vocab-dcat) -It also offers other features related to Semantic Data like exposing the necessary markup to get your datasets indexed in [Google Dataset Search](https://toolbox.google.com/datasetsearch). +=== "CKAN dataset" -!!! todo - Overview with standards supported and high level concepts + ``` json + { + "id": "425e361b-bad9-4a8f-8cc4-2e147c4e8c18", + "name": "my-ckan-dataset", + "title": "An example CKAN dataset", + "description": "Some notes about the data", + "temporal_coverage": [ + { + "start": "2024-01-01", + "end": "2024-12-31" + } + ], + "resources": [ + { + "id": "df0fc449-fddf-41af-910a-f972b458956c", + "name": "Some data in CSV format", + "url": "http://my-ckan-site.org/dataset/425e361b-bad9-4a8f-8cc4-2e147c4e8c18/resource/df0fc449-fddf-41af-910a-f972b458956c/download/data.csv", + "format": "CSV" + } + ] + } + ``` +=== "DCAT representation (Turtle)" -Check the [overview](#overview) section for a summary of the available features. 
+ ```turtle + @prefix dcat: <http://www.w3.org/ns/dcat#> . + @prefix dct: <http://purl.org/dc/terms/> . + @prefix xsd: <http://www.w3.org/2001/XMLSchema#> . + <http://my-ckan-site.org/dataset/425e361b-bad9-4a8f-8cc4-2e147c4e8c18> a dcat:Dataset ; + dct:identifier "425e361b-bad9-4a8f-8cc4-2e147c4e8c18" ; + dct:temporal [ a dct:PeriodOfTime ; + dcat:endDate "2024-12-31"^^xsd:date ; + dcat:startDate "2024-01-01"^^xsd:date ] ; + dct:title "An example CKAN dataset" ; + dcat:distribution <http://my-ckan-site.org/dataset/425e361b-bad9-4a8f-8cc4-2e147c4e8c18/resource/df0fc449-fddf-41af-910a-f972b458956c> . + <http://my-ckan-site.org/dataset/425e361b-bad9-4a8f-8cc4-2e147c4e8c18/resource/df0fc449-fddf-41af-910a-f972b458956c> a dcat:Distribution ; + dct:format "CSV" ; + dct:title "Some data in CSV format" ; + dcat:accessURL <http://my-ckan-site.org/dataset/425e361b-bad9-4a8f-8cc4-2e147c4e8c18/resource/df0fc449-fddf-41af-910a-f972b458956c/download/data.csv> . + ``` + +=== "DCAT representation (JSON-LD)" + + ``` json + { + "@context": { + "dcat": "http://www.w3.org/ns/dcat#", + "dct": "http://purl.org/dc/terms/", + "xsd": "http://www.w3.org/2001/XMLSchema#" + }, + "@graph": [ + { + "@id": "http://my-ckan-site.org/dataset/425e361b-bad9-4a8f-8cc4-2e147c4e8c18", + "@type": "dcat:Dataset", + "dcat:distribution": { + "@id": "http://my-ckan-site.org/dataset/425e361b-bad9-4a8f-8cc4-2e147c4e8c18/resource/df0fc449-fddf-41af-910a-f972b458956c" + }, + "dct:identifier": "425e361b-bad9-4a8f-8cc4-2e147c4e8c18", + "dct:temporal": { + "@id": "_:N1c32ba52ad1641d086101a4a4bcbe8a5" + }, + "dct:title": "An example CKAN dataset" + }, + { + "@id": "_:N1c32ba52ad1641d086101a4a4bcbe8a5", + "@type": "dct:PeriodOfTime", + "dcat:endDate": { + "@type": "xsd:date", + "@value": "2024-12-31" + }, + "dcat:startDate": { + "@type": "xsd:date", + "@value": "2024-01-01" + } + }, + { + "@id": "http://my-ckan-site.org/dataset/425e361b-bad9-4a8f-8cc4-2e147c4e8c18/resource/df0fc449-fddf-41af-910a-f972b458956c", + "@type": "dcat:Distribution", + "dcat:accessURL": { + "@id": "http://my-ckan-site.org/dataset/425e361b-bad9-4a8f-8cc4-2e147c4e8c18/resource/df0fc449-fddf-41af-910a-f972b458956c/download/data.csv" + }, + "dct:format": "CSV", + "dct:title": "Some data in CSV format" + } + ] + } + ``` + +In terms of CKAN features, this extension offers: + +* [Pre-built CKAN schemas](getting-started.md#schemas) for common Application Profiles that can be adapted to each site requirement to provide out-of-the-box DCAT support in data portals. 
+ +* [DCAT Endpoints](endpoints.md) that expose the catalog datasets in different RDF serializations (`dcat` plugin). + +* An [RDF Harvester](harvester.md) that allows importing RDF serializations from other catalogs to create CKAN datasets (`dcat_rdf_harvester` plugin). + +* Other features like [Command Line Interface](cli.md) or support for indexing in [Google Dataset Search](google-dataset-search.md). + + +These are implemented internally using: + +* A base [mapping](mapping.md) between DCAT and CKAN datasets and vice versa (compatible with [DCAT-AP v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11) and [DCAT-AP v2.1](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210)). + +* An [RDF Parser](profiles.md#rdf-dcat-parser) that allows reading RDF serializations in different formats and extracting CKAN dataset dicts, using customizable [profiles](profiles.md#profiles). + +* An [RDF Serializer](profiles.md#rdf-dcat-serializer) that allows transforming CKAN datasets metadata to different semantic formats, also allowing customizable [profiles](profiles.md#profiles). diff --git a/docs/mapping.md b/docs/mapping.md index 86b671c0..823635a3 100644 --- a/docs/mapping.md +++ b/docs/mapping.md @@ -4,53 +4,60 @@ The following table provides a generic mapping between the fields of the `dcat:D their equivalents in the CKAN model. In most cases this mapping is deliberately a loose one. For instance, it does not try to link the DCAT publisher property with a CKAN dataset author, maintainer or organization, as the link between them is not straight-forward and may depend on a particular instance needs. When mapping from CKAN metadata to DCAT though, there are in some cases fallback fields -that are used if the default field is not present (see [RDF Serializer](#rdf-dcat-serializer) for more details on this. 
+that are used if the default field is not present (see [RDF Serializer](profiles.md#rdf-dcat-serializer) for more details on this). -This mapping is compatible with the [DCAT-AP v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11) and [DCAT-AP v2.1](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210). It depends on the active profile(s) (see [Profiles](#profiles)) which DCAT properties are mapped. +This mapping is compatible with [DCAT-AP v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11), [DCAT-AP v2.1](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210) and [DCAT-AP v3](https://semiceu.github.io/DCAT-AP/releases/3.0.0/). It depends on the active profile(s) (see [Profiles](profiles.md#profiles)) which DCAT properties are mapped. -Sites are encouraged to use ckanext-scheming to manage their metadata schema (see [Schemas](#schemas) for all details). This changes in +Sites are encouraged to use ckanext-scheming to manage their metadata schema (see [Schemas](getting-started.md#schemas) for all details). This changes in some cases the way metadata is stored internally and presented at the CKAN API level, but should not affect the RDF DCAT output. +!!! Note + Fields prefixed with `custom:` are custom metadata fields defined via ckanext-scheming. When using `euro_dcat_ap` + and `euro_dcat_ap_2` based profiles, these could also be actual extra fields (e.g. `extras=[{"key": "issued", "value": "2024"}]`). + It is recommended that site maintainers start to migrate to custom fields by using the `euro_dcat_ap_scheming` profile as these + fields are properly validated, can use the scheming snippets etc. See [Schemas](getting-started.md#schemas) for more details. 
+ + | DCAT class | DCAT property | CKAN dataset field | CKAN fallback fields | Stored as | | |-------------------|------------------------|-------------------------------------------|--------------------------------|-----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| dcat:Dataset | - | extra:uri | | text | See [URIs](#uris-1) | +| dcat:Dataset | - | custom:uri | | text | See [URIs](mapping.md#uris) | | dcat:Dataset | dct:title | title | | text | | | dcat:Dataset | dct:description | notes | | text | | | dcat:Dataset | dcat:keyword | tags | | text | | -| dcat:Dataset | dcat:theme | extra:theme | | list | See [Lists](#lists) | -| dcat:Dataset | dct:identifier | extra:identifier | extra:guid, id | text | | -| dcat:Dataset | adms:identifier | extra:alternate_identifier | | text | | -| dcat:Dataset | dct:issued | extra:issued | metadata_created | text | | -| dcat:Dataset | dct:modified | extra:modified | metadata_modified | text | | -| dcat:Dataset | owl:versionInfo | version | extra:dcat_version | text | | -| dcat:Dataset | adms:versionNotes | extra:version_notes | | text | | -| dcat:Dataset | dct:language | extra:language | | list | See [Lists](#lists) | +| dcat:Dataset | dcat:theme | custom:theme | | list | See [Lists](#lists) | +| dcat:Dataset | dct:identifier | custom:identifier | custom:guid, id | text | | +| dcat:Dataset | adms:identifier | custom:alternate_identifier | | text | | +| dcat:Dataset | dct:issued | custom:issued | metadata_created | text | | +| dcat:Dataset | dct:modified | custom:modified | metadata_modified | text | | +| dcat:Dataset | owl:versionInfo | version | custom:dcat_version | text | | +| dcat:Dataset | adms:versionNotes | custom:version_notes | | text | | +| dcat:Dataset | dct:language | custom:language | | list | See [Lists](#lists) | | dcat:Dataset | dcat:landingPage | url | | text | | -| dcat:Dataset | 
dct:accrualPeriodicity | extra:frequency | | text | | -| dcat:Dataset | dct:conformsTo | extra:conforms_to | | list | See [Lists](#lists) | -| dcat:Dataset | dct:accessRights | extra:access_rights | | text | | -| dcat:Dataset | foaf:page | extra:documentation | | list | See [Lists](#lists) | -| dcat:Dataset | dct:provenance | extra:provenance | | text | | -| dcat:Dataset | dct:type | extra:dcat_type | | text | As of DCAT-AP v1.1 there's no controlled vocabulary for this field | -| dcat:Dataset | dct:hasVersion | extra:has_version | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | -| dcat:Dataset | dct:isVersionOf | extra:is_version_of | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | -| dcat:Dataset | dct:source | extra:source | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | -| dcat:Dataset | adms:sample | extra:sample | | list | See [Lists](#lists). 
It is assumed that these are one or more URIs referring to dcat:Distribution instances | -| dcat:Dataset | dct:spatial | extra:spatial_uri | | text | See [Spatial coverage](#spatial-coverage) | -| dcat:Dataset | dct:temporal | extra:temporal_start + extra:temporal_end | | text | None, one or both extras can be present | -| dcat:Dataset | dcat:temporalResolution| extra:temporal_resolution | | list | | -| dcat:Dataset | dcat:spatialResolutionInMeters| extra:spatial_resolution_in_meters | | list | | -| dcat:Dataset | dct:isReferencedBy | extra:is_referenced_by | | list | | -| dcat:Dataset | dct:publisher | extra:publisher_uri | | text | See [URIs](#uris-1) and [Publisher](#contact-points-and-publisher) | -| foaf:Agent | foaf:name | extra:publisher_name | | text | | -| foaf:Agent | foaf:mbox | extra:publisher_email | organization:title | text | | -| foaf:Agent | foaf:homepage | extra:publisher_url | | text | | -| foaf:Agent | dct:type | extra:publisher_type | | text | | -| dcat:Dataset | dcat:contactPoint | extra:contact_uri | | text | See [URIs](#uris-1) and [Contact points](#contact-points-and-publisher) | -| vcard:Kind | vcard:fn | extra:contact_name | maintainer, author | text | | -| vcard:Kind | vcard:hasEmail | extra:contact_email | maintainer_email, author_email | text | | +| dcat:Dataset | dct:accrualPeriodicity | custom:frequency | | text | | +| dcat:Dataset | dct:conformsTo | custom:conforms_to | | list | See [Lists](#lists) | +| dcat:Dataset | dct:accessRights | custom:access_rights | | text | | +| dcat:Dataset | foaf:page | custom:documentation | | list | See [Lists](#lists) | +| dcat:Dataset | dct:provenance | custom:provenance | | text | | +| dcat:Dataset | dct:type | custom:dcat_type | | text | | +| dcat:Dataset | dct:hasVersion | custom:has_version | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | +| dcat:Dataset | dct:isVersionOf | custom:is_version_of | | list | See [Lists](#lists). 
It is assumed that these are one or more URIs referring to another dcat:Dataset | +| dcat:Dataset | dct:source | custom:source | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | +| dcat:Dataset | adms:sample | custom:sample | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to dcat:Distribution instances | +| dcat:Dataset | dct:spatial | custom:spatial_uri | | text | See [Spatial coverage](#spatial-coverage) | +| dcat:Dataset | dct:temporal | custom:temporal_start + custom:temporal_end | | text | None, one or both extras can be present | +| dcat:Dataset | dcat:temporalResolution| custom:temporal_resolution | | list | | +| dcat:Dataset | dcat:spatialResolutionInMeters| custom:spatial_resolution_in_meters | | list | | +| dcat:Dataset | dct:isReferencedBy | custom:is_referenced_by | | list | | +| dcat:Dataset | dct:publisher | custom:publisher_uri | | text | See [URIs](mapping.md#uris) and [Publisher](#contact-points-and-publisher) | +| foaf:Agent | foaf:name | custom:publisher_name | | text | | +| foaf:Agent | foaf:mbox | custom:publisher_email | organization:title | text | | +| foaf:Agent | foaf:homepage | custom:publisher_url | | text | | +| foaf:Agent | dct:type | custom:publisher_type | | text | | +| dcat:Dataset | dcat:contactPoint | custom:contact_uri | | text | See [URIs](mapping.md#uris) and [Contact points](#contact-points-and-publisher) | +| vcard:Kind | vcard:fn | custom:contact_name | maintainer, author | text | | +| vcard:Kind | vcard:hasEmail | custom:contact_email | maintainer_email, author_email | text | | | dcat:Dataset | dcat:distribution | resources | | text | | -| dcat:Distribution | - | resource:uri | | text | See [URIs](#uris-1) | +| dcat:Distribution | - | resource:uri | | text | See [URIs](mapping.md#uris) | | dcat:Distribution | dct:title | resource:name | | text | | | dcat:Distribution | dcat:accessURL | resource:access_url | resource:url | 
text | If downloadURL is not present, accessURL will be used as resource url | | dcat:Distribution | dcat:downloadURL | resource:download_url | | text | If present, downloadURL will be used as resource url | @@ -81,138 +88,136 @@ some cases the way metadata is stored internally and presented at the CKAN API l | spdx:Checksum | spdx:checksumValue | resource:hash | | text | | | spdx:Checksum | spdx:algorithm | resource:hash_algorithm | | text | | -*Notes* - ### Custom fields -Fields marked as `extra:` are stored as free form extras in the `euro_dcat_ap` and `euro_dcat_ap_2` profiles, +Fields marked as `custom:` are stored as free form extras in the `euro_dcat_ap` and `euro_dcat_ap_2` profiles, but stored as first level custom fields when using the scheming based profile (`euro_dcat_ap_scheming`), i.e: - ```json - { - "name": "test_dataset_dcat", - "extras": [ - {"key": "version_notes", "value": "Some version notes"} - ] - } - ``` +```json +{ + "name": "test_dataset_dcat", + "extras": [ + {"key": "version_notes", "value": "Some version notes"} + ] +} +``` vs: - ```json - { - "name": "test_dataset_dcat", - "version_notes": "Some version notes" - } - ``` +```json +{ + "name": "test_dataset_dcat", + "version_notes": "Some version notes" +} +``` ### URIs Whenever possible, URIs are extracted and stored so there is a clear reference to the original RDF resource. For instance: - ```xml - - - - - Dataset 1 - - - Publishing Organization for dataset 1 - - - - - - ``` - - ```json - { - "title": "Dataset 1", - "extras": [ - {"key": "uri", "value": "http://data.some.org/catalog/datasets/1"}, - {"key": "publisher_uri", "value": "http://orgs.vocab.org/some-org"}, - {"key": "publisher_name", "value": "Publishing Organization for dataset 1"} - ] - } - ``` - - Another example: - - ``` - @prefix dcat: . - @prefix dct: . - @prefix rdf: . - - - a dcat:Dataset ; - dct:title "Dataset 1" ; - dcat:distribution - . 
- - - - a dcat:Distribution ; - dct:title "Distribution for dataset 1" ; - dcat:accessURL . - ``` - - ```json - { - "title": "Dataset 1", - "extras": [ - {"key": "uri", "value": "http://data.some.org/catalog/datasets/1"} - ], - "resources": [{ - "name": "Distribution for dataset 1", - "url": "http://data.some.org/catalog/datasets/1/downloads/1.csv", - "uri": "http://data.some.org/catalog/datasets/1/d/1" - }] - } - ``` +```xml + + + + + Dataset 1 + + + Publishing Organization for dataset 1 + + + + + +``` + +```json +{ + "title": "Dataset 1", + "extras": [ + {"key": "uri", "value": "http://data.some.org/catalog/datasets/1"}, + {"key": "publisher_uri", "value": "http://orgs.vocab.org/some-org"}, + {"key": "publisher_name", "value": "Publishing Organization for dataset 1"} + ] +} +``` + +Another example: + +```turtle +@prefix dcat: . +@prefix dct: . +@prefix rdf: . + + + a dcat:Dataset ; + dct:title "Dataset 1" ; + dcat:distribution + . + + + + a dcat:Distribution ; + dct:title "Distribution for dataset 1" ; + dcat:accessURL . +``` + +```json +{ + "title": "Dataset 1", + "extras": [ + {"key": "uri", "value": "http://data.some.org/catalog/datasets/1"} + ], + "resources": [{ + "name": "Distribution for dataset 1", + "url": "http://data.some.org/catalog/datasets/1/downloads/1.csv", + "uri": "http://data.some.org/catalog/datasets/1/d/1" + }] +} +``` ### Lists On the legacy profiles, lists are stored as a JSON string, eg: - ``` - @prefix dcat: . - @prefix dct: . - @prefix rdf: . 
- - - a dcat:Dataset ; - dct:title "Dataset 1" ; - dct:language "ca" , "en" , "es" ; - dcat:theme "http://eurovoc.europa.eu/100142" , "http://eurovoc.europa.eu/209065", "Earth Sciences" ; - ``` - - ```json - { - "title": "Dataset 1", - "extras": [ - {"key": "uri", "value": "http://data.some.org/catalog/datasets/1"} - {"key": "language", "value": "[\"ca\", \"en\", \"es\"]"} - {"key": "theme", "value": "[\"Earth Sciences\", \"http://eurovoc.europa.eu/209065\", \"http://eurovoc.europa.eu/100142\"]"} - ], - } - ``` +```turtle +@prefix dcat: . +@prefix dct: . +@prefix rdf: . + + + a dcat:Dataset ; + dct:title "Dataset 1" ; + dct:language "ca" , "en" , "es" ; + dcat:theme "http://eurovoc.europa.eu/100142" , "http://eurovoc.europa.eu/209065", "Earth Sciences" ; +``` + +```json +{ + "title": "Dataset 1", + "extras": [ + {"key": "uri", "value": "http://data.some.org/catalog/datasets/1"}, + {"key": "language", "value": "[\"ca\", \"en\", \"es\"]"}, + {"key": "theme", "value": "[\"Earth Sciences\", \"http://eurovoc.europa.eu/209065\", \"http://eurovoc.europa.eu/100142\"]"} + ] +} +``` On the scheming-based ones, these are shown as actual lists: - ```json - { - "title": "Dataset 1", - "uri": "http://data.some.org/catalog/datasets/1"}, - "language": ["ca", "en", "es"] - "theme": ["Earth Sciences", "http://eurovoc.europa.eu/209065", "http://eurovoc.europa.eu/100142"] - } - ``` +```json +{ + "title": "Dataset 1", + "uri": "http://data.some.org/catalog/datasets/1"}, + "language": ["ca", "en", "es"], + "theme": ["Earth Sciences", "http://eurovoc.europa.eu/209065", "http://eurovoc.europa.eu/100142"] +} +``` ### Contact points and Publisher Properties for `dcat:contactPoint` and `dct:publisher` are stored as namespaced extras in the legacy profiles. When using @@ -254,56 +259,58 @@ If no `publisher` or `publisher_*` fields are found, the serializers will fall b ### Spatial coverage -The following formats for `dct:spatial` are supported by the default [parser](#rdf-dcat-parser). 
Note that the default [serializer](#rdf-dcat-serializer) will return the single `dct:spatial` instance form by default. - - - One `dct:spatial` instance, URI only - - ```xml - - ``` - - - One `dct:spatial` instance with text (this should not be used anyway) - - ```xml - Newark - ``` - - - One `dct:spatial` instance with label and/or geometry - - ```xml - - - - {"type": "Polygon", "coordinates": [[[175.0, 17.5], [-65.5, 17.5], [-65.5, 72.0], [175.0, 72.0], [175.0, 17.5]]]} - - - POLYGON ((175.0000 17.5000, -65.5000 17.5000, -65.5000 72.0000, 175.0000 72.0000, 175.0000 17.5000)) - - Newark - - - ``` - - - Multiple `dct:spatial` instances (as in GeoDCAT-AP) - - ```xml - - - - - {"type": "Polygon", "coordinates": [[[175.0, 17.5], [-65.5, 17.5], [-65.5, 72.0], [175.0, 72.0], [175.0, 17.5]]]} - - - POLYGON ((175.0000 17.5000, -65.5000 17.5000, -65.5000 72.0000, 175.0000 72.0000, 175.0000 17.5000)) - - - - - - Newark - - - ``` +The following formats for `dct:spatial` are supported by the default [parser](profiles.md#rdf-dcat-parser). Note that the default [serializer](profiles.md#rdf-dcat-serializer) will return the single `dct:spatial` instance form by default. 
+## RDF DCAT Parser + + - One `dct:spatial` instance, URI only + + ```xml + + ``` + + - One `dct:spatial` instance with text (this should not be used anyway) + + ```xml + Newark + ``` + + - One `dct:spatial` instance with label and/or geometry + + ```xml + + + + {"type": "Polygon", "coordinates": [[[175.0, 17.5], [-65.5, 17.5], [-65.5, 72.0], [175.0, 72.0], [175.0, 17.5]]]} + + + POLYGON ((175.0000 17.5000, -65.5000 17.5000, -65.5000 72.0000, 175.0000 72.0000, 175.0000 17.5000)) + + Newark + + + ``` + + - Multiple `dct:spatial` instances (as in GeoDCAT-AP) + + ```xml + + + + + {"type": "Polygon", "coordinates": [[[175.0, 17.5], [-65.5, 17.5], [-65.5, 72.0], [175.0, 72.0], [175.0, 17.5]]]} + + + POLYGON ((175.0000 17.5000, -65.5000 17.5000, -65.5000 72.0000, 175.0000 72.0000, 175.0000 17.5000)) + + + + + + Newark + + + ``` + If the RDF provides them, profiles should store the textual and geometric representation of the location in: * For legacy profiles in `spatial_text`, `spatial_bbox`, `spatial_centroid` or `spatial` (for any other geometries) extra fields @@ -342,12 +349,16 @@ If the RDF provides them, profiles should store the textual and geometric repres ### Licenses -On the CKAN model, license is at the dataset level whereas in DCAT model it - is at distributions level. By default the RDF parser will try to find a - distribution with a license that matches one of those registered in CKAN - and attach this license to the dataset. The first matching distribution's - license is used, meaning that any discrepancy accross distributions license - will not be accounted for. This behavior can be customized by overridding the - `_license` method on a custom profile. +In the CKAN model, the license field is stored at the dataset level whereas in the DCAT model it +is stored at Distributions level. By default, the RDF parser will try to find a +distribution with a license that matches one of those registered in CKAN +and attach this license to the dataset. 
The first matching distribution's +license is used, meaning that any discrepancy across distributions license +will not be accounted for. This behavior can be customized by overriding the +`_license()` method on a custom profile. + +When serializing, distributions can inherit the license from the dataset +if [`ckanext.dcat.resource.inherit.license`](configuration.md#ckanextdcatresourceinheritlicense) is set to true. + diff --git a/docs/profiles.md b/docs/profiles.md index dfdadb7c..37deae25 100644 --- a/docs/profiles.md +++ b/docs/profiles.md @@ -1,236 +1,243 @@ -## RDF DCAT Parser +## Profiles -The `ckanext.dcat.processors.RDFParser` class allows to read RDF serializations in different -formats and extract CKAN dataset dicts. It will look for DCAT datasets and distributions -and create CKAN datasets and resources, as dictionaries that can be passed to [`package_create`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.create.package_create) or [`package_update`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.update.package_update). +Both the parsers and the serializers use profile classes to allow customization of how the values defined in the RDF graph are mapped to CKAN and vice versa. -Here is a quick overview of how it works: +Profiles define: -```python +* How the RDF graph values map to CKAN fields, i.e. how the RDF is parsed into CKAN datasets +* How CKAN fields map to an RDF graph, which can be then serialized +* How the CKAN catalog metadata maps to an RDF graph, which can be then serialized - from ckanext.dcat.processors import RDFParser, RDFParserException +They essentially define the mapping between DCAT and CKAN. - parser = RDFParser() +In most cases the default profile will provide a good mapping that will cover most properties described in the DCAT standard.
If you want to extract extra fields defined in the RDF, are using a custom schema or +need custom logic, you can write a [custom profile](#writing-custom-profiles) that extends or replaces one of the default ones. - # Parsing a local RDF/XML file +The profiles currently shipped with the extension are mostly based in the +[DCAT application profile for data portals in Europe](https://joinup.ec.europa.eu/asset/dcat_application_profile/description). As mentioned before though, they should be generic enough for most DCAT based representations. - with open('datasets.rdf', 'r') as f: - try: - parser.parse(f.read()) +Sites that want to support a particular version of the DCAT-AP can enable a specific profile using one of the profiles below: - for dataset in parser.datasets(): - print('Got dataset with title {0}'.format(dataset['title']) +* [DCAT-AP v3](https://semiceu.github.io/DCAT-AP/releases/3.0.0) (default): `euro_dcat_ap_3` +* [DCAT-AP v2.1.0](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210): `euro_dcat_ap_2` +* [DCAT-AP v1.1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11): `euro_dcat_ap` - except RDFParserException, e: - print ('Error parsing the RDF file: {0}'.format(e)) +This plugin also contains a profile to serialize a CKAN dataset to a [schema.org Dataset](http://schema.org/Dataset) called `schemaorg`. This is especially useful to provide [JSON-LD structured data](google-dataset-search.md). - # Parsing a remote JSON-LD file +To define which profiles to use you can: - import requests - parser = RDFParser() +1. 
Set the [`ckanext.dcat.rdf.profiles`](configuration.md#ckanextdcatrdfprofiles) configuration option on your CKAN configuration file: - content = requests.get('https://some.catalog.org/datasets.jsonld').content +```ini +ckanext.dcat.rdf.profiles = euro_dcat_ap sweden_dcat_ap +``` - try: - parser.parse(content, _format='json-ld') +2. When initializing a parser or serializer class, pass the profiles to be used as a parameter, eg: - for dataset in parser.datasets(): - print('Got dataset with title {0}'.format(dataset['title']) +```python - except RDFParserException, e: - print ('Error parsing the RDF file: {0}'.format(e)) +parser = RDFParser(profiles=['euro_dcat_ap', 'sweden_dcat_ap']) +serializer = RDFSerializer(profiles=['euro_dcat_ap', 'sweden_dcat_ap']) ``` -The parser is implemented using [RDFLib](https://rdflib.readthedocs.org/), a Python library for working with RDF. Any -RDF serialization format supported by RDFLib can be parsed into CKAN datasets. The `examples` folder contains -serializations in different formats including RDF/XML, Turtle or JSON-LD. +Note that in both cases the order in which you define them is important, as it will be the one that the profiles will be run on. -## RDF DCAT Serializer -The `ckanext.dcat.processors.RDFSerializer` class generates RDF serializations in different -formats from CKAN dataset dicts, like the ones returned by [`package_show`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_show) or [`package_search`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_search). +### Writing custom profiles -Here is an example of how to use it: +Internally, profiles are classes that define a particular set of methods called during the parsing process. +For instance, the `parse_dataset()` method is called on each DCAT dataset found when parsing an RDF file, and should return a CKAN dataset. 
+Conversely, the `graph_from_dataset()` will be called when requesting an RDF representation for a dataset, and will need to generate the necessary RDF graph. -```python +Custom profiles should always extend the `ckanext.dcat.profiles.RDFProfile` class. This class has several helper +functions to make getting metadata from the RDF graph easier. These include helpers for getting fields for FOAF and VCard entities like the ones +used to define publishers or contact points. Check the source code of `ckanext.dcat.profiles.base.py` to see what is available. - from ckanext.dcat.processors import RDFSerializer +Profiles can extend other profiles to avoid repeating rules, or can be completely independent. - # Serializing a single dataset +The following is a complete example of a profile built on top of the European DCAT-AP profile (`euro_dcat_ap`): - dataset = get_action('package_show')({}, {'id': 'my-dataset'}) +```python - serializer = RDFserializer() +from rdflib.namespace import Namespace +from ckanext.dcat.profiles import RDFProfile - dataset_ttl = serializer.serialize_dataset(dataset, _format='turtle') +DCT = Namespace("http://purl.org/dc/terms/") - # Serializing the whole catalog (or rather part of it) +class SwedishDCATAPProfile(RDFProfile): + ''' + An RDF profile for the Swedish DCAT-AP recommendation for data portals - datasets = get_action('package_search')({}, {'q': '*:*', 'rows': 50}) + It requires the European DCAT-AP profile (`euro_dcat_ap`) + ''' - serializer = RDFserializer() + def parse_dataset(self, dataset_dict, dataset_ref): - catalog_xml = serializer.serialize_catalog({'title': 'My catalog'}, - dataset_dicts=datasets, - _format='xml') + # Spatial label + spatial = self._object(dataset_ref, DCT.spatial) + if spatial: + spatial_label = self.g.label(spatial) + if spatial_label: + dataset_dict['extras'].append({'key': 'spatial_text', + 'value': str(spatial_label)}) - # Creating and RDFLib graph from a single dataset + return dataset_dict -
dataset = get_action('package_show')({}, {'id': 'my-dataset'}) + def graph_from_dataset(self, dataset_dict, dataset_ref): - serializer = RDFserializer() + g = self.g - dataset_reference = serializer.graph_from_dataset(dataset) + spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri') + spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text') - # serializer.g now contains the full dataset graph, an RDFLib Graph class + if spatial_uri: + spatial_ref = URIRef(spatial_uri) + else: + spatial_ref = BNode() + if spatial_text: + g.add((dataset_ref, DCT.spatial, spatial_ref)) + g.add((spatial_ref, RDF.type, DCT.Location)) + g.add((spatial_ref, RDFS.label, Literal(spatial_text))) ``` -The serializer uses customizable [profiles](#profiles) to generate an RDF graph (an [RDFLib Graph class](https://rdflib.readthedocs.org/en/latest/apidocs/rdflib.html#rdflib.graph.Graph)). -By default these use the [mapping](#rdf-dcat-to-ckan-dataset-mapping) described in the previous section. +Note how the dataset dict is passed between profiles so it can be further tweaked. -In some cases, if the default CKAN field that maps to a DCAT property is not present, some other fallback -values will be used instead. For instance, if the `contact_email` field is not found, `maintainer_email` -and `author_email` will be used (if present) for the email property of the `adms:contactPoint` property. 
+Extensions define their available profiles using the `ckan.rdf.profiles` entrypoint in the `setup.py` file, as in this [example](https://github.com/ckan/ckanext-dcat/blob/cc5fcc7be0be62491301db719ce597aec7c684b0/setup.py#L37:L38) from this same extension: -Note that the serializer will look both for a first level field or an extra field with the same key, ie both -the following values will be used for `dct:accrualPeriodicity`: + [ckan.rdf.profiles] + euro_dcat_ap=ckanext.dcat.profiles:EuropeanDCATAPProfile + euro_dcat_ap_2=ckanext.dcat.profiles:EuropeanDCATAP2Profile + euro_dcat_ap_3=ckanext.dcat.profiles:EuropeanDCATAP3Profile + euro_dcat_ap_scheming=ckanext.dcat.profiles:EuropeanDCATAPSchemingProfile + schemaorg=ckanext.dcat.profiles:SchemaOrgProfile - { - "name": "my-dataset", - "frequency": "monthly", - ... - } +## RDF DCAT Parser - { - "name": "my-dataset", - "extras": [ - {"key": "frequency", "value": "monthly"}, - ] - ... - } +The `ckanext.dcat.processors.RDFParser` class allows to read RDF serializations in different +formats and extract CKAN dataset dicts. It will look for DCAT datasets and distributions +and create CKAN datasets and resources, as dictionaries that can be passed to [`package_create`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.create.package_create) or [`package_update`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.update.package_update). -Once the dataset graph has been obtained, this is serialized into a text format using [RDFLib](https://rdflib.readthedocs.org/), -so any format it supports can be obtained (common formats are 'xml', 'turtle' or 'json-ld'). +Here is a quick overview of how it works: -### Inherit license from the dataset as fallback in distributions -It is possible to inherit the license from the dataset to the distributions, but only if there is no license defined in the resource yet. By default the license is not inherited from the dataset. 
This can be activated by setting the following parameter in the CKAN config file: +```python - ckanext.dcat.resource.inherit.license = True +from ckanext.dcat.processors import RDFParser, RDFParserException +parser = RDFParser() -## Profiles +# Parsing a local RDF/XML file -Both the parser and the serializer use profiles to allow customization of how the values defined in the RDF graph are mapped to CKAN and viceversa. +with open('datasets.rdf', 'r') as f: + try: + parser.parse(f.read()) -Profiles define : + for dataset in parser.datasets(): + print('Got dataset with title {0}'.format(dataset['title']) -* How the RDF graph values map into CKAN fields, ie how the RDF is parsed into CKAN datasets -* How CKAN fields map to an RDF graph, which can be then serialized -* How the CKAN catalog metadata maps to an RDF graph, which can be then serialized + except RDFParserException, e: + print ('Error parsing the RDF file: {0}'.format(e)) -They essentially define the mapping between DCAT and CKAN. +# Parsing a remote JSON-LD file -In most cases the default profile will provide a good mapping that will cover most properties described in the DCAT standard. If you want to extract extra fields defined in the RDF, are using a custom schema or -need custom logic, you can write a custom to profile that extends or replaces the default one. +import requests -The default profile is mostly based in the -[DCAT application profile for data portals in Europe](https://joinup.ec.europa.eu/asset/dcat_application_profile/description). It is actually fully-compatible with [DCAT-AP v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11), and partially compatible with [DCAT-AP v2.1.0](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210). As mentioned before though, it should be generic enough for most DCAT based representations. 
+parser = RDFParser() -Sites that want to support a particular version of the DCAT-AP can enable a specific profile using one of the methods below: +content = requests.get('https://some.catalog.org/datasets.jsonld').content -* DCAT-AP v2.1.0 (default): `euro_dcat_ap_2` -* DCAT-AP v1.1.1: `euro_dcat_ap` +try: + parser.parse(content, _format='json-ld') -This plugin also contains a profile to serialize a CKAN dataset to a [schema.org Dataset](http://schema.org/Dataset) called `schemaorg`. This is especially useful to provide [JSON-LD structured data](#structured-data). + for dataset in parser.datasets(): + print('Got dataset with title {0}'.format(dataset['title']) -To define which profiles to use you can: +except RDFParserException, e: + print ('Error parsing the RDF file: {0}'.format(e)) -1. Set the `ckanext.dcat.rdf.profiles` configuration option on your CKAN configuration file: +``` - ckanext.dcat.rdf.profiles = euro_dcat_ap sweden_dcat_ap +The parser is implemented using [RDFLib](https://rdflib.readthedocs.org/), a Python library for working with RDF. Any +RDF serialization format supported by RDFLib can be parsed into CKAN datasets. The `examples` folder contains +serializations in different formats including RDF/XML, Turtle or JSON-LD. -2. When initializing a parser or serializer class, pass the profiles to be used as a parameter, eg: +## RDF DCAT Serializer + +The `ckanext.dcat.processors.RDFSerializer` class generates RDF serializations in different +formats from CKAN dataset dicts, like the ones returned by [`package_show`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_show) or [`package_search`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_search). 
+ +Here is an example of how to use it: ```python - parser = RDFParser(profiles=['euro_dcat_ap', 'sweden_dcat_ap']) +from ckanext.dcat.processors import RDFSerializer - serializer = RDFSerializer(profiles=['euro_dcat_ap', 'sweden_dcat_ap']) -``` +# Serializing a single dataset -Note that in both cases the order in which you define them is important, as it will be the one that the profiles will be run on. +dataset = get_action('package_show')({}, {'id': 'my-dataset'}) +serializer = RDFSerializer() -### Writing custom profiles +dataset_ttl = serializer.serialize_dataset(dataset, _format='turtle') -Internally, profiles are classes that define a particular set of methods called during the parsing process. -For instance, the `parse_dataset` method is called on each DCAT dataset found when parsing an RDF file, and should return a CKAN dataset. -Conversely, the `graph_from_dataset` will be called when requesting an RDF representation for a dataset, and will need to generate the necessary RDF graph. -Custom profiles should always extend the `ckanext.dcat.profiles.RDFProfile` class. This class has several helper -functions to make getting metadata from the RDF graph easier. These include helpers for getting fields for FOAF and VCard entities like the ones -used to define publishers or contact points. Check the source code of `ckanex.dcat.profiles.py` to see what is available. +# Serializing the whole catalog (or rather part of it) -Profiles can extend other profiles to avoid repeating rules, or can be completely independent.
+datasets = get_action('package_search')({}, {'q': '*:*', 'rows': 50}) -The following example shows a complete example of a profile built on top of the European DCAT-AP profile (`euro_dcat_ap`): +serializer = RDFSerializer() -```python +catalog_xml = serializer.serialize_catalog({'title': 'My catalog'}, + dataset_dicts=datasets, + _format='xml') - from rdflib.namespace import Namespace - from ckanext.dcat.profiles import RDFProfile +# Creating an RDFLib graph from a single dataset - DCT = Namespace("http://purl.org/dc/terms/") +dataset = get_action('package_show')({}, {'id': 'my-dataset'}) +serializer = RDFSerializer() - class SwedishDCATAPProfile(RDFProfile): - ''' - An RDF profile for the Swedish DCAT-AP recommendation for data portals +dataset_reference = serializer.graph_from_dataset(dataset) - It requires the European DCAT-AP profile (`euro_dcat_ap`) - ''' +# serializer.g now contains the full dataset graph, an RDFLib Graph class - def parse_dataset(self, dataset_dict, dataset_ref): +``` - # Spatial label - spatial = self._object(dataset_ref, DCT.spatial) - if spatial: - spatial_label = self.g.label(spatial) - if spatial_label: - dataset_dict['extras'].append({'key': 'spatial_text', - 'value': str(spatial_label)}) +The serializer uses customizable [profiles](#profiles) to generate an RDF graph (an [RDFLib Graph class](https://rdflib.readthedocs.org/en/latest/apidocs/rdflib.html#rdflib.graph.Graph)). +By default these use the [mapping](mapping.md) described in the previous section. - return dataset_dict +In some cases, if the default CKAN field that maps to a DCAT property is not present, some other fallback +values will be used instead. For instance, if the `contact_email` field is not found, `maintainer_email` +and `author_email` will be used (if present) for the email property of the `dcat:contactPoint` property.
- def graph_from_dataset(self, dataset_dict, dataset_ref): +Note that the serializer will look both for a first level field or an extra field with the same key, ie both +the following values will be used for `dct:accrualPeriodicity`: - g = self.g + { + "name": "my-dataset", + "frequency": "monthly", + ... + } - spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri') - spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text') + { + "name": "my-dataset", + "extras": [ + {"key": "frequency", "value": "monthly"}, + ] + ... + } - if spatial_uri: - spatial_ref = URIRef(spatial_uri) - else: - spatial_ref = BNode() +Once the dataset graph has been obtained, this is serialized into a text format using [RDFLib](https://rdflib.readthedocs.org/), +so any format it supports can be obtained (common formats are 'xml', 'turtle' or 'json-ld'). - if spatial_text: - g.add((dataset_ref, DCT.spatial, spatial_ref)) - g.add((spatial_ref, RDF.type, DCT.Location)) - g.add((spatial_ref, RDFS.label, Literal(spatial_text))) -``` +### Inherit license from the dataset as fallback in distributions +It is possible to inherit the license from the dataset to the distributions, but only if there is no license defined in the resource yet. By default the license is not inherited from the dataset. This can be activated by setting the following parameter in the CKAN config file: -Note how the dataset dict is passed between profiles so it can be further tweaked. 
+ ckanext.dcat.resource.inherit.license = True -Extensions define their available profiles using the `ckan.rdf.profiles` in the `setup.py` file, as in this [example](https://github.com/ckan/ckanext-dcat/blob/cc5fcc7be0be62491301db719ce597aec7c684b0/setup.py#L37:L38) from this same extension: - [ckan.rdf.profiles] - euro_dcat_ap=ckanext.dcat.profiles:EuropeanDCATAPProfile - euro_dcat_ap_2=ckanext.dcat.profiles:EuropeanDCATAP2Profile - schemaorg=ckanext.dcat.profiles:SchemaOrgProfile diff --git a/mkdocs.yml b/mkdocs.yml index 2daa443a..918baea8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -38,12 +38,24 @@ theme: - search.suggest - search.highlight - toc.integrate + - content.code.copy plugins: - search markdown_extensions: - - admonition + - toc: + permalink: true + - admonition + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.superfences + - pymdownx.tabbed: + alternate_style: true + extra_css: - _css/extra.css @@ -53,7 +65,7 @@ nav: - Getting started: 'getting-started.md' - DCAT support: - 'endpoints.md' - - DCAT CKAN mapping: 'mapping.md' + - DCAT ↔ CKAN mapping: 'mapping.md' - 'profiles.md' - Other features: - 'harvester.md' From 7ad5239ec49ebbbbbd71072c549e003ddd7a68d8 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 29 Aug 2024 13:25:23 +0200 Subject: [PATCH 5/8] More dcat ap 3 mentions in landing page --- docs/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index ff7c7e3d..11130028 100644 --- a/docs/index.md +++ b/docs/index.md @@ -103,7 +103,7 @@ Ckanext-dcat is a [CKAN](https://github.com/ckan/ckan) extension that helps data In terms of CKAN features, this extension offers: -* [Pre-built CKAN schemas](getting-started.md#schemas) for common Application Profiles that can be adapted to each site requirement to provide out-of-the -box DCAT support in data portals. 
+* [Pre-built CKAN schemas](getting-started.md#schemas) for common Application Profiles that can be adapted to each site requirement to provide out-of-the-box DCAT support in data portals (currently supporting DCAT AP v1, v2, and v3). * [DCAT Endpoints](endpoints.md) that expose the catalog datasets in different RDF serializations (`dcat` plugin). @@ -114,7 +114,7 @@ In terms of CKAN features, this extension offers: These are implemented internally using: -* A base [mapping](mapping.md) between DCAT and CKAN datasets and viceversa (compatible with [DCAT-AP v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11) and [DCAT-AP v2.1](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210)). +* A base [mapping](mapping.md) between DCAT and CKAN datasets and viceversa (compatible with [DCAT-AP v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11), [DCAT-AP v2.1](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210) and [DCAT-AP v3](https://semiceu.github.io/DCAT-AP/releases/3.0.0/)). * An [RDF Parser](profiles.md#rdf-dcat-parser) that allows to read RDF serializations in different formats and extract CKAN dataset dicts, using customizable [profiles](profiles.md#profiles). 
From 35df80f1aa3746dae0210da109354ddbb86afbc3 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 29 Aug 2024 13:30:29 +0200 Subject: [PATCH 6/8] Tweak schemas point --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 11130028..de9d0ad5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -103,7 +103,7 @@ Ckanext-dcat is a [CKAN](https://github.com/ckan/ckan) extension that helps data In terms of CKAN features, this extension offers: -* [Pre-built CKAN schemas](getting-started.md#schemas) for common Application Profiles that can be adapted to each site requirement to provide out-of-the-box DCAT support in data portals (currently supporting DCAT AP v1, v2, and v3). +* [Pre-built CKAN schemas](getting-started.md#schemas) for common Application Profiles that can be adapted to each site's requirements to provide out-of-the-box DCAT support in data portals, including tailored form fields, validation etc. (currently supporting DCAT AP v1, v2, and v3). * [DCAT Endpoints](endpoints.md) that expose the catalog datasets in different RDF serializations (`dcat` plugin). From 0484af84f08ae40f2f24f3307452afe0a9ee5d65 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 29 Aug 2024 13:58:45 +0200 Subject: [PATCH 7/8] Update README --- README.md | 1229 +---------------------------------------------------- 1 file changed, 12 insertions(+), 1217 deletions(-) diff --git a/README.md b/README.md index b8d9cce5..02a0aa13 100644 --- a/README.md +++ b/README.md @@ -5,1242 +5,37 @@ [![Code Coverage](http://codecov.io/github/ckan/ckanext-dcat/coverage.svg?branch=master)](http://codecov.io/github/ckan/ckanext-dcat?branch=master) -This extension provides plugins that allow CKAN to expose its metadata and consume metadata from other catalogs using RDF documents serialized using DCAT. The Data Catalog Vocabulary (DCAT) is "an RDF vocabulary designed to facilitate interoperability between data catalogs published on the Web".
More information can be found on the following W3C page: +Ckanext-dcat is a [CKAN](https://github.com/ckan/ckan) extension that helps data publishers expose and consume metadata as serialized RDF documents using [DCAT](https://www.w3.org/TR/vocab-dcat/). -[http://www.w3.org/TR/vocab-dcat](http://www.w3.org/TR/vocab-dcat) -It also offers other features related to Semantic Data like exposing the necessary markup to get your datasets indexed in [Google Dataset Search](https://toolbox.google.com/datasetsearch). +> [!IMPORTANT] +> Read the documentation for a full user guide: +> https://docs.ckan.org/projects/ckanext-dcat -Check the [overview](#overview) section for a summary of the available features. - - -## Contents - - - -- [Overview](#overview) -- [Installation](#installation) -- [Schemas](#schemas) - * [Compatibility with existing profiles](#compatibility-with-existing-profiles) -- [RDF DCAT endpoints](#rdf-dcat-endpoints) - * [Dataset endpoints](#dataset-endpoints) - * [Catalog endpoint](#catalog-endpoint) - * [URIs](#uris) - * [Content negotiation](#content-negotiation) -- [RDF DCAT harvester](#rdf-dcat-harvester) - * [Maximum file size](#maximum-file-size) - * [Transitive harvesting](#transitive-harvesting) - * [Extending the RDF harvester](#extending-the-rdf-harvester) -- [JSON DCAT harvester](#json-dcat-harvester) -- [RDF DCAT to CKAN dataset mapping](#rdf-dcat-to-ckan-dataset-mapping) - * [Custom fields](#custom-fields) - * [URIs](#uris-1) - * [Lists](#lists) - * [Contact points and Publisher](#contact-points-and-publisher) - * [Spatial coverage](#spatial-coverage) - * [Licenses](#licenses) -- [RDF DCAT Parser](#rdf-dcat-parser) -- [RDF DCAT Serializer](#rdf-dcat-serializer) - * [Inherit license from the dataset as fallback in distributions](#inherit-license-from-the-dataset-as-fallback-in-distributions) -- [Profiles](#profiles) - * [Writing custom profiles](#writing-custom-profiles) - * [Command line interface](#command-line-interface) - * [Compatibility 
mode](#compatibility-mode) -- [XML DCAT harvester (deprecated)](#xml-dcat-harvester-deprecated) -- [Translation of fields](#translation-of-fields) -- [Structured data and Google Dataset Search indexing](#structured-data-and-google-dataset-search-indexing) -- [CLI](#cli) -- [Configuration reference](#configuration-reference) -- [Running the Tests](#running-the-tests) -- [Releases](#releases) -- [Acknowledgements](#acknowledgements) -- [Copying and License](#copying-and-license) - - ## Overview -[DCAT](http://www.w3.org/TR/vocab-dcat) has become the basis for many metadata sharing standards, like DCAT-AP and DCAT-US for data portals in Europe and the USA respectively. This extension aims to provide tools and guidance to allow publishers to publish and share DCAT based metadata easily. - In terms of CKAN features, this extension offers: -* [Pre-built CKAN schemas](#schemas) for common Application Profiles that can be adapted to each site requirement to provide out-of-the -box DCAT support in data portals. +* [Pre-built CKAN schemas](https://docs.ckan.org/projects/ckanext-dcat/en/latest/getting-started#schemas) for common Application Profiles that can be adapted to each site requirements to provide out-of-the-box DCAT support in data portals, including tailored form fields, validation etc. (currently supporting DCAT AP v1, v2, and v3). -* [RDF DCAT Endpoints](#rdf-dcat-endpoints) that expose the catalog's datasets in different RDF serializations (`dcat` plugin). +* [DCAT Endpoints](https://docs.ckan.org/projects/ckanext-dcat/en/latest/endpoints) that expose the catalog datasets in different RDF serializations (`dcat` plugin). -* An [RDF Harvester](#rdf-dcat-harvester) that allows importing RDF serializations from other catalogs to create CKAN datasets (`dcat_rdf_harvester` plugin). 
+* An [RDF Harvester](https://docs.ckan.org/projects/ckanext-dcat/en/latest/harvester) that allows importing RDF serializations from other catalogs to create CKAN datasets (`dcat_rdf_harvester` plugin). -* An [JSON DCAT Harvester](#json-dcat-harvester) that allows importing JSON objects that are based on DCAT terms but are not defined as JSON-LD, using the serialization described in the [spec.dataportals.org](http://spec.dataportals.org/#datasets-serialization-format) site (`dcat_json_harvester` plugin).. +* Other features like [Command Line Interface](https://docs.ckan.org/projects/ckanext-dcat/en/latest/cli) or support for indexing in [Google Dataset Search](https://docs.ckan.org/projects/ckanext-dcat/en/latest/google-dataset-search). These are implemented internally using: -* A base [mapping](#rdf-dcat-to-ckan-dataset-mapping) between DCAT and CKAN datasets and viceversa (compatible with [DCAT-AP v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11) and [DCAT-AP v2.1](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210)). - -* An [RDF Parser](#rdf-dcat-parser) that allows to read RDF serializations in different formats and extract CKAN dataset dicts, using customizable [profiles](#profiles). - -* An [RDF Serializer](#rdf-dcat-serializer) that allows to transform CKAN datasets metadata to different semantic formats, also allowing customizable [profiles](#profiles). - - - -## Installation - - -1. Install the extension on your virtualenv: - - (pyenv) $ pip install -e git+https://github.com/ckan/ckanext-dcat.git#egg=ckanext-dcat - -2. Install the extension requirements: - - (pyenv) $ pip install -r ckanext-dcat/requirements.txt - -3. Enable the required plugins in your ini file: - - ckan.plugins = dcat dcat_rdf_harvester structured_data - -4. 
To use the pre-built schemas, install [ckanext-scheming](https://github.com/ckan/ckanext-scheming): - - pip install -e "git+https://github.com/ckan/ckanext-scheming.git#egg=ckanext-scheming" - -Check the [Schemas](#schemas) section for extra configuration needed. - -Optionally, if you want to use the RDF harvester, install ckanext-harvest as well ([https://github.com/ckan/ckanext-harvest#installation](https://github.com/ckan/ckanext-harvest#installation)). - -For further configuration options available, see [Configuration reference](#configuration-reference). - - - - -## Schemas - -The extension includes ready to use [ckanext-scheming](https://github.com/ckan/ckanext-scheming) schemas that enable DCAT support. These include a schema definition file (located in `ckanext/dcat/schemas`) plus extra validators and other custom logic that integrates the metadata modifications with the RDF DCAT [Parsers](#rdf-dcat-parser) and [Serializers](#rdf-dcat-serializer) and other CKAN features and extensions. - -There are the following schemas currently included with the extension: - -* *dcat_ap_recommended.yaml*: Includes the recommended properties for `dcat:Dataset` and `dcat:Distribution` according to the DCAT AP specification. You can use this schema with the `euro_dcat_ap_2` (+ `euro_dcat_ap_scheming`) and `euro_dcat_ap_3` profiles. -* *dcat_ap_full.yaml*: Includes most of the properties defined for `dcat:Dataset` and `dcat:Distribution` in the [DCAT AP 2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) and [DCAT AP v3](https://semiceu.github.io/DCAT-AP/releases/3.0.0/) specification. You can use this schema with the `euro_dcat_ap_2` (+ `euro_dcat_ap_scheming`) and `euro_dcat_ap_3` profiles. - -Most sites will want to use these as a base to create their own custom schema to address their own requirements, perhaps alongside a [custom profile](#writing-custom-profiles). Of course site maintainers can add or remove schema fields, as well as change the existing validators. 
- -In any case, the schema file used should be defined in the configuration file, alongside these configuration options: - - # Make sure to add scheming_datasets after the dcat plugin - ckan.plugins = activity dcat [...] scheming_datasets - - # Point to one of the defaults or your own version of the schema file - scheming.dataset_schemas = ckanext.dcat.schemas:dcat_ap_2.1_recommended.yaml - - # Include the dcat presets as well as the standard scheming ones - scheming.presets = ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml - - # Sites using the euro_dcat_ap and euro_dcat_ap_2 profiles must add the - # euro_dcat_ap_scheming profile if they want to use ckanext-scheming schemas (see next section) - ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming - -### Compatibility with existing profiles - -Sites using the existing `euro_dcat_ap` and `euro_dcat_ap_2` profiles should not see any change in their -current parsing and serialization functionalities and these profiles will not change their outputs going -forward (unless a bug is being fixed). Sites willing to migrate to a scheming based metadata schema can do -so by adding the `euro_dcat_ap_scheming` profile at the end of their profile chain (e.g. -`ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming`), which will modify the existing profile -outputs to the expected format by the scheming validators. - -Note that the scheming profile will only affect fields defined in the schema definition file, so sites can start migrating gradually different metadata fields. - - - -## RDF DCAT endpoints - -By default when the `dcat` plugin is enabled, the following RDF endpoints are available on your CKAN instance. The schema used on the serializations can be customized using [profiles](#profiles). 
- -To disable the RDF endpoints, you can set the following config in your ini file: - - ckanext.dcat.enable_rdf_endpoints = False - - -### Dataset endpoints - -RDF representations of a particular dataset can accessed using the following endpoint: - - https://{ckan-instance-host}/dataset/{dataset-id}.{format} - -The extension will determine the RDF serialization format returned. The currently supported values are: - -| Extension | Format | Media Type | -|-----------|-------------------------------------------------------------|---------------------| -| `xml` | [RDF/XML](https://en.wikipedia.org/wiki/RDF/XML) | application/rdf+xml | -| `ttl` | [Turtle](https://en.wikipedia.org/wiki/Turtle_%28syntax%29) | text/turtle | -| `n3` | [Notation3](https://en.wikipedia.org/wiki/Notation3) | text/n3 | -| `jsonld` | [JSON-LD](http://json-ld.org/) | application/ld+json | - -The fallback `rdf` format defaults to RDF/XML. - -Here's an example of the different formats: - -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.rdf -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.xml -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.ttl -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.n3 -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.jsonld - -RDF representations will be advertised using `` tags on the `` sectionon the dataset page source code, eg: - - - - - - - - - - -Check the [RDF DCAT Serializer](#rdf-dcat-serializer) section for more details about how these are generated and how to customize the output using [profiles](#profiles). 
- - -You can specify the profile by using the `profiles=,` query parameter on the dataset endpoint (as a comma-separated list): - -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.xml?profiles=euro_dcat_ap -* https://opendata.swiss/en/dataset/verbreitung-der-steinbockkolonien.jsonld?profiles=schemaorg - -*Note*: When using this plugin, the above endpoints will replace the old deprecated ones that were part of CKAN core. - - -### Catalog endpoint - -Additionally to the individual dataset representations, the extension also offers a catalog-wide endpoint for retrieving multiple datasets at the same time (the datasets are paginated, see below for details): - - https://{ckan-instance-host}/catalog.{format}?[page={page}]&[modified_since={date}]&[profiles={profile1},{profile2}]&[q={query}]&[fq={filter query}] - -This endpoint can be customized if necessary using the `ckanext.dcat.catalog_endpoint` configuration option, eg: - - ckanext.dcat.catalog_endpoint = /dcat/catalog/{_format} - -The custom endpoint **must** start with a forward slash (`/`) and contain the `{_format}` placeholder. - -As described previously, the extension will determine the RDF serialization format returned. - -* http://demo.ckan.org/catalog.rdf -* http://demo.ckan.org/catalog.xml -* http://demo.ckan.org/catalog.ttl - -RDF representations will be advertised using `` tags on the `` sectionon the homepage and the dataset search page source code, eg: - - - - - - - - - - - -The number of datasets returned is limited. The response will include paging info, serialized using the [Hydra](http://www.w3.org/ns/hydra/spec/latest/core/) vocabulary. The different terms are self-explanatory, and can be used by clients to iterate the catalog: - - @prefix hydra: . - - a hydra:PagedCollection ; - hydra:first "http://example.com/catalog.ttl?page=1" ; - hydra:last "http://example.com/catalog.ttl?page=3" ; - hydra:next "http://example.com/catalog.ttl?page=2" ; - hydra:totalItems 283 . 
- -The default number of datasets returned (100) can be modified by CKAN site maintainers using the following configuration option on your ini file: - - ckanext.dcat.datasets_per_page = 20 - -The catalog endpoint also supports a `modified_since` parameter to restrict datasets to those modified from a certain date. The parameter value should be a valid ISO-8601 date: - -http://demo.ckan.org/catalog.xml?modified_since=2015-07-24 - -It's possible to specify the profile(s) to use for the serialization using the `profiles` parameter: - -http://demo.ckan.org/catalog.xml?profiles=euro_dcat_ap,sweden_dcat_ap - -To filter the output, the catalog endpoint supports the `q` and `fq` parameters to specify a [search query](https://lucene.apache.org/solr/guide/6_6/the-dismax-query-parser.html#TheDisMaxQueryParser-TheqParameter) or [filter query](https://lucene.apache.org/solr/guide/6_6/common-query-parameters.html#CommonQueryParameters-Thefq_FilterQuery_Parameter): - -http://demo.ckan.org/catalog.xml?q=budget -http://demo.ckan.org/catalog.xml?fq=tags:economy - - - -### URIs - -Whenever possible, URIs are generated for the relevant entities. To try to generate them, the extension will use the first found of the following for each entity: - -* Catalog: - - `ckanext.dcat.base_uri` configuration option value. This is the recommended approach. Value should be a valid URI - - `ckan.site_url` configuration option value. - - 'http://' + `app_instance_uuid` configuration option value. This is not recommended, and a warning log message will be shown. 
- -* Dataset: - - The value of the `uri` field (note that this is not included in the default CKAN schema) - - The value of an extra with key `uri` - - Catalog URI (see above) + '/dataset/' + `id` field - -* Resource: - - The value of the `uri` field (note that this is not included in the default CKAN schema) - - Catalog URI (see above) + '/dataset/' + `package_id` field + '/resource/ + `id` field - -Note that if you are using the [RDF DCAT harvester](#rdf-dcat-harvester) to import datasets from other catalogs and these define a proper URI for each dataset or resource, these will be stored as `uri` fields in your instance, and thus used when generating serializations for them. - - -### Content negotiation - -The extension supports returning different representations of the datasets based on the value of the `Accept` header ([Content negotiation](https://en.wikipedia.org/wiki/Content_negotiation)). - -When enabled, client applications can request a particular format via the `Accept` header on requests to the main dataset page, eg: - - curl https://{ckan-instance-host}/dataset/{dataset-id} -H Accept:text/turtle - - curl https://{ckan-instance-host}/dataset/{dataset-id} -H Accept:"application/rdf+xml; q=1.0, application/ld+json; q=0.6" - -This is also supported on the [catalog endpoint](#catalog-endpoint), in this case when making a request to the CKAN root URL (home page). This won't support the pagination and filter parameters: - - curl https://{ckan-instance-host} -H Accept:text/turtle - -Note that this feature overrides the CKAN core home page and dataset page controllers, so you probably don't want to enable it if your own extension is also doing it. - -To enable content negotiation, set the following configuration option on your ini file: - - ckanext.dcat.enable_content_negotiation = True - - -## RDF DCAT harvester - -The RDF parser described in the previous section has been integrated into a harvester, -to allow automatic import of datasets from remote sources. 
To enable the RDF harvester, add the `dcat_rdf_harvester` plugin to your CKAN configuration file: - - ckan.plugins = ... dcat_rdf_harvester - -The harvester will download the remote file, extract all datasets using the parser and create or update actual CKAN datasets based on that. -It will also handle deletions, ie if a dataset is not present any more in the DCAT dump anymore it will get deleted from CKAN. - -The harvester will look at the `content-type` HTTP header field to determine the used RDF format. Any format understood by the [RDFLib](https://rdflib.readthedocs.org/en/stable/plugin_parsers.html) library can be parsed. It is possible to override this functionality and provide a specific format using the harvester configuration. This is useful when the server does not return the correct `content-type` or when harvesting a file on the local file system without a proper extension. The harvester configuration is a JSON object that you fill into the harvester configuration form field. - - {"rdf_format":"text/turtle"} - -*TODO*: configure profiles. - -### Maximum file size - -The default max size of the file (for each HTTP response) to harvest is actually 50 MB. The size can be customised by setting the configuration option `ckanext.dcat.max_file_size` to your CKAN configuration file. -Here‘s an example of setting the max file size to 100 MB: - -`ckanext.dcat.max_file_size = 100` - -### Transitive harvesting - -In transitive harvesting (i.e., when you harvest a catalog A, and a catalog X harvests your catalog), you may want to provide the original catalog info for each harvested dataset. - -By setting the configuration option `ckanext.dcat.expose_subcatalogs = True` in your ini file, you'll enable the storing and publication of the source catalog for each harvested dataset. - -The information contained in the harvested `dcat:Catalog` node will be stored as extras into the harvested datasets. 
-When serializing, your Catalog will expose the harvested Catalog using the `dct:hasPart` relation. This means that your catalog will have this structure: -- `dcat:Catalog` (represents your current catalog) - - `dcat:dataset` (1..n, the dataset created withing your catalog) - - `dct:hasPart` - - `dcat:Catalog` (info of one of the harvested catalogs) - - `dcat:dataset` (dataset in the harvested catalog) - - `dct:hasPart` - - `dcat:Catalog` (info of one of another harvester catalog) - ... - - -### Extending the RDF harvester - -The DCAT RDF harvester has extension points that allow to modify its behaviour from other extensions. These can be used by extensions implementing -the `IDCATRDFHarvester` interface. Right now it provides the following methods: - -* `before_download` and `after_download`: called just before and after retrieving the remote file, and can be used for instance to validate the contents. -* `update_session`: called before making the remote requests to update the `requests` session object, useful to add additional headers or for setting client certificates. Check the [`requests` documentation](http://docs.python-requests.org/en/master/user/advanced/#session-objects) for details. -* `before_create` / `after_create`: called before and after the `package_create` action has been performed -* `before_update` / `after_update`: called before and after the `package_update` action has been performed -* `after_parsing`: Called just after the content from the remote RDF file has been parsed - -To know more about these methods, please check the source of [`ckanext-dcat/ckanext/dcat/interfaces.py`](https://github.com/ckan/ckanext-dcat/blob/master/ckanext/dcat/interfaces.py). - -## JSON DCAT harvester - -The DCAT JSON harvester supports importing JSON objects that are based on DCAT terms but are not defined as JSON-LD. 
The exact format for these JSON files -is the one described in the [spec.dataportals.org](http://spec.dataportals.org/#datasets-serialization-format) site. There are [example files](https://github.com/ckan/ckanext-dcat/blob/master/examples/dataset.json) in the `examples` folder. - -To enable the JSON harvester, add the `dcat_json_harvester` plugin to your CKAN configuration file: - - ckan.plugins = ... dcat_json_harvester - -*TODO*: align the fields created by this harvester with the base mapping (ie the ones created by the RDF harvester). - -## RDF DCAT to CKAN dataset mapping - -The following table provides a generic mapping between the fields of the `dcat:Dataset` and `dcat:Distribution` classes and -their equivalents in the CKAN model. In most cases this mapping is deliberately a loose one. For instance, it does not try to link -the DCAT publisher property with a CKAN dataset author, maintainer or organization, as the link between them is not straight-forward -and may depend on a particular instance needs. When mapping from CKAN metadata to DCAT though, there are in some cases fallback fields -that are used if the default field is not present (see [RDF Serializer](#rdf-dcat-serializer) for more details on this. - -This mapping is compatible with the [DCAT-AP v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11) and [DCAT-AP v2.1](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210). It depends on the active profile(s) (see [Profiles](#profiles)) which DCAT properties are mapped. - -Sites are encouraged to use ckanext-scheming to manage their metadata schema (see [Schemas](#schemas) for all details). This changes in -some cases the way metadata is stored internally and presented at the CKAN API level, but should not affect the RDF DCAT output. 
- -| DCAT class | DCAT property | CKAN dataset field | CKAN fallback fields | Stored as | | -|-------------------|------------------------|-------------------------------------------|--------------------------------|-----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| dcat:Dataset | - | extra:uri | | text | See [URIs](#uris-1) | -| dcat:Dataset | dct:title | title | | text | | -| dcat:Dataset | dct:description | notes | | text | | -| dcat:Dataset | dcat:keyword | tags | | text | | -| dcat:Dataset | dcat:theme | extra:theme | | list | See [Lists](#lists) | -| dcat:Dataset | dct:identifier | extra:identifier | extra:guid, id | text | | -| dcat:Dataset | adms:identifier | extra:alternate_identifier | | text | | -| dcat:Dataset | dct:issued | extra:issued | metadata_created | text | | -| dcat:Dataset | dct:modified | extra:modified | metadata_modified | text | | -| dcat:Dataset | owl:versionInfo | version | extra:dcat_version | text | | -| dcat:Dataset | adms:versionNotes | extra:version_notes | | text | | -| dcat:Dataset | dct:language | extra:language | | list | See [Lists](#lists) | -| dcat:Dataset | dcat:landingPage | url | | text | | -| dcat:Dataset | dct:accrualPeriodicity | extra:frequency | | text | | -| dcat:Dataset | dct:conformsTo | extra:conforms_to | | list | See [Lists](#lists) | -| dcat:Dataset | dct:accessRights | extra:access_rights | | text | | -| dcat:Dataset | foaf:page | extra:documentation | | list | See [Lists](#lists) | -| dcat:Dataset | dct:provenance | extra:provenance | | text | | -| dcat:Dataset | dct:type | extra:dcat_type | | text | As of DCAT-AP v1.1 there's no controlled vocabulary for this field | -| dcat:Dataset | dct:hasVersion | extra:has_version | | list | See [Lists](#lists). 
It is assumed that these are one or more URIs referring to another dcat:Dataset | -| dcat:Dataset | dct:isVersionOf | extra:is_version_of | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | -| dcat:Dataset | dct:source | extra:source | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to another dcat:Dataset | -| dcat:Dataset | adms:sample | extra:sample | | list | See [Lists](#lists). It is assumed that these are one or more URIs referring to dcat:Distribution instances | -| dcat:Dataset | dct:spatial | extra:spatial_uri | | text | See [Spatial coverage](#spatial-coverage) | -| dcat:Dataset | dct:temporal | extra:temporal_start + extra:temporal_end | | text | None, one or both extras can be present | -| dcat:Dataset | dcat:temporalResolution| extra:temporal_resolution | | list | | -| dcat:Dataset | dcat:spatialResolutionInMeters| extra:spatial_resolution_in_meters | | list | | -| dcat:Dataset | dct:isReferencedBy | extra:is_referenced_by | | list | | -| dcat:Dataset | dct:publisher | extra:publisher_uri | | text | See [URIs](#uris-1) and [Publisher](#contact-points-and-publisher) | -| foaf:Agent | foaf:name | extra:publisher_name | | text | | -| foaf:Agent | foaf:mbox | extra:publisher_email | organization:title | text | | -| foaf:Agent | foaf:homepage | extra:publisher_url | | text | | -| foaf:Agent | dct:type | extra:publisher_type | | text | | -| dcat:Dataset | dcat:contactPoint | extra:contact_uri | | text | See [URIs](#uris-1) and [Contact points](#contact-points-and-publisher) | -| vcard:Kind | vcard:fn | extra:contact_name | maintainer, author | text | | -| vcard:Kind | vcard:hasEmail | extra:contact_email | maintainer_email, author_email | text | | -| dcat:Dataset | dcat:distribution | resources | | text | | -| dcat:Distribution | - | resource:uri | | text | See [URIs](#uris-1) | -| dcat:Distribution | dct:title | resource:name | | text | | -| 
dcat:Distribution | dcat:accessURL | resource:access_url | resource:url | text | If downloadURL is not present, accessURL will be used as resource url | -| dcat:Distribution | dcat:downloadURL | resource:download_url | | text | If present, downloadURL will be used as resource url | -| dcat:Distribution | dct:description | resource:description | | text | | -| dcat:Distribution | dcat:mediaType | resource:mimetype | | text | | -| dcat:Distribution | dct:format | resource:format | | text | | -| dcat:Distribution | dct:license | resource:license | | text | See [Licenses](#licenses) | -| dcat:Distribution | adms:status | resource:status | | text | | -| dcat:Distribution | dcat:byteSize | resource:size | | number | | -| dcat:Distribution | dct:issued | resource:issued | created | text | | -| dcat:Distribution | dct:modified | resource:modified | metadata_modified | text | | -| dcat:Distribution | dct:rights | resource:rights | | text | | -| dcat:Distribution | foaf:page | resource:documentation | | list | See [Lists](#lists) | -| dcat:Distribution | dct:language | resource:language | | list | See [Lists](#lists) | -| dcat:Distribution | dct:conformsTo | resource:conforms_to | | list | See [Lists](#lists) | -| dcat:Distribution | dcatap:availability | resource:availability | | text | | -| dcat:Distribution | dcat:compressFormat | resource:compress_format | | text | | -| dcat:Distribution | dcat:packageFormat | resource:package_format | | text | | -| dcat:Distribution | dcat:accessService | resource:access_services | | text | | -| dcat:DataService | dct:title | access_service:title | | text | | -| dcat:DataService | dcat:endpointURL | access_service:endpoint_url | | list | | -| dcat:DataService | dcat:endpointDescription| access_service:endpoint_description | | text | | -| dcat:DataService | dcatap:availability | access_service:availability | | text | | -| dcat:DataService | dcat:servesDataset | access_service:serves_dataset | | list | | -| dcat:DataService | 
dct:description | access_service:description | | text | | -| dcat:DataService | dct:license | access_service:license | | text | | -| dcat:DataService | dct:accessRights | access_service:access_rights | | text | | -| spdx:Checksum | spdx:checksumValue | resource:hash | | text | | -| spdx:Checksum | spdx:algorithm | resource:hash_algorithm | | text | | - -*Notes* - -### Custom fields - -Fields marked as `extra:` are stored as free form extras in the `euro_dcat_ap` and `euro_dcat_ap_2` profiles, -but stored as first level custom fields when using the scheming based profile (`euro_dcat_ap_scheming`), i.e: - - ```json - { - "name": "test_dataset_dcat", - "extras": [ - {"key": "version_notes", "value": "Some version notes"} - ] - } - ``` - - vs: - - ```json - { - "name": "test_dataset_dcat", - "version_notes": "Some version notes" - } - ``` - -### URIs - -Whenever possible, URIs are extracted and stored so there is a clear reference to the original RDF resource. -For instance: - - ```xml - - - - - Dataset 1 - - - Publishing Organization for dataset 1 - - - - - - ``` - - ```json - { - "title": "Dataset 1", - "extras": [ - {"key": "uri", "value": "http://data.some.org/catalog/datasets/1"}, - {"key": "publisher_uri", "value": "http://orgs.vocab.org/some-org"}, - {"key": "publisher_name", "value": "Publishing Organization for dataset 1"} - ] - } - ``` - - Another example: - - ``` - @prefix dcat: . - @prefix dct: . - @prefix rdf: . - - - a dcat:Dataset ; - dct:title "Dataset 1" ; - dcat:distribution - . - - - - a dcat:Distribution ; - dct:title "Distribution for dataset 1" ; - dcat:accessURL . 
- ``` - - ```json - { - "title": "Dataset 1", - "extras": [ - {"key": "uri", "value": "http://data.some.org/catalog/datasets/1"} - ], - "resources": [{ - "name": "Distribution for dataset 1", - "url": "http://data.some.org/catalog/datasets/1/downloads/1.csv", - "uri": "http://data.some.org/catalog/datasets/1/d/1" - }] - } - ``` - -### Lists - -On the legacy profiles, lists are stored as a JSON string, eg: - - ``` - @prefix dcat: . - @prefix dct: . - @prefix rdf: . - - - a dcat:Dataset ; - dct:title "Dataset 1" ; - dct:language "ca" , "en" , "es" ; - dcat:theme "http://eurovoc.europa.eu/100142" , "http://eurovoc.europa.eu/209065", "Earth Sciences" ; - ``` - - ```json - { - "title": "Dataset 1", - "extras": [ - {"key": "uri", "value": "http://data.some.org/catalog/datasets/1"} - {"key": "language", "value": "[\"ca\", \"en\", \"es\"]"} - {"key": "theme", "value": "[\"Earth Sciences\", \"http://eurovoc.europa.eu/209065\", \"http://eurovoc.europa.eu/100142\"]"} - ], - } - ``` - -On the scheming-based ones, these are shown as actual lists: - - ```json - { - "title": "Dataset 1", - "uri": "http://data.some.org/catalog/datasets/1"}, - "language": ["ca", "en", "es"] - "theme": ["Earth Sciences", "http://eurovoc.europa.eu/209065", "http://eurovoc.europa.eu/100142"] - } - ``` -### Contact points and Publisher - -Properties for `dcat:contactPoint` and `dct:publisher` are stored as namespaced extras in the legacy profiles. 
When using -a scheming-based profile, these are stored as proper objects (and multiple instances are allowed for contact point): - -```json -{ - "name": "test_dataset_dcat", - "title": "Test dataset DCAT", - "extras": [ - {"key":"contact_name","value":"PointofContact"}, - {"key":"contact_email","value":"contact@some.org"} - ], -} -``` - -vs: - -```json -{ - "name": "test_dataset_dcat", - "title": "Test dataset DCAT", - "contact": [ - { - "name": "Point of Contact 1", - "email": "contact1@some.org" - }, - { - "name": "Point of Contact 2", - "email": "contact2@some.org" - }, - ] -} -``` - -If no `publisher` or `publisher_*` fields are found, the serializers will fall back to getting the publisher properties from the organization the CKAN dataset belongs to. The organization schema can be customized with the schema located in `ckanext/dcat/schemas/publisher_organization.yaml` to provide the extra properties supported (this will additionally require loading the `scheming_organizations` plugin in `ckan.plugins`). - - -### Spatial coverage - - -The following formats for `dct:spatial` are supported by the default [parser](#rdf-dcat-parser). Note that the default [serializer](#rdf-dcat-serializer) will return the single `dct:spatial` instance form by default. 
- - - One `dct:spatial` instance, URI only - - ```xml - - ``` - - - One `dct:spatial` instance with text (this should not be used anyway) - - ```xml - Newark - ``` - - - One `dct:spatial` instance with label and/or geometry - - ```xml - - - - {"type": "Polygon", "coordinates": [[[175.0, 17.5], [-65.5, 17.5], [-65.5, 72.0], [175.0, 72.0], [175.0, 17.5]]]} - - - POLYGON ((175.0000 17.5000, -65.5000 17.5000, -65.5000 72.0000, 175.0000 72.0000, 175.0000 17.5000)) - - Newark - - - ``` - - - Multiple `dct:spatial` instances (as in GeoDCAT-AP) - - ```xml - - - - - {"type": "Polygon", "coordinates": [[[175.0, 17.5], [-65.5, 17.5], [-65.5, 72.0], [175.0, 72.0], [175.0, 17.5]]]} - - - POLYGON ((175.0000 17.5000, -65.5000 17.5000, -65.5000 72.0000, 175.0000 72.0000, 175.0000 17.5000)) - - - - - - Newark - - - ``` -If the RDF provides them, profiles should store the textual and geometric representation of the location in: - -* For legacy profiles in `spatial_text`, `spatial_bbox`, `spatial_centroid` or `spatial` (for any other geometries) extra fields -* For scheming-based profiles in objects in the `spatial_coverage` field, for instance: - -```json -{ - "name": "test_dataset_dcat", - "title": "Test dataset DCAT", - "spatial_coverage": [ - { - "geom": { - "type": "Polygon", - "coordinates": [...] - }, - "text": "Tarragona", - "uri": "https://sws.geonames.org/6361390/", - "bbox": { - "type": "Polygon", - "coordinates": [ - [ - [-2.1604, 42.7611], - [-2.0938, 42.7611], - [-2.0938, 42.7931], - [-2.1604, 42.7931], - [-2.1604, 42.7611], - ] - ], - }, - "centroid": {"type": "Point", "coordinates": [1.26639, 41.12386]}, - } - ] -} -``` - - -### Licenses - -On the CKAN model, license is at the dataset level whereas in DCAT model it - is at distributions level. By default the RDF parser will try to find a - distribution with a license that matches one of those registered in CKAN - and attach this license to the dataset. 
The first matching distribution's - license is used, meaning that any discrepancy accross distributions license - will not be accounted for. This behavior can be customized by overridding the - `_license` method on a custom profile. - - -## RDF DCAT Parser - -The `ckanext.dcat.processors.RDFParser` class allows to read RDF serializations in different -formats and extract CKAN dataset dicts. It will look for DCAT datasets and distributions -and create CKAN datasets and resources, as dictionaries that can be passed to [`package_create`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.create.package_create) or [`package_update`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.update.package_update). - -Here is a quick overview of how it works: - -```python - - from ckanext.dcat.processors import RDFParser, RDFParserException - - parser = RDFParser() - - # Parsing a local RDF/XML file - - with open('datasets.rdf', 'r') as f: - try: - parser.parse(f.read()) - - for dataset in parser.datasets(): - print('Got dataset with title {0}'.format(dataset['title']) - - except RDFParserException, e: - print ('Error parsing the RDF file: {0}'.format(e)) - - # Parsing a remote JSON-LD file - - import requests - - parser = RDFParser() - - content = requests.get('https://some.catalog.org/datasets.jsonld').content - - try: - parser.parse(content, _format='json-ld') - - for dataset in parser.datasets(): - print('Got dataset with title {0}'.format(dataset['title']) - - except RDFParserException, e: - print ('Error parsing the RDF file: {0}'.format(e)) - -``` - -The parser is implemented using [RDFLib](https://rdflib.readthedocs.org/), a Python library for working with RDF. Any -RDF serialization format supported by RDFLib can be parsed into CKAN datasets. The `examples` folder contains -serializations in different formats including RDF/XML, Turtle or JSON-LD. 
- -## RDF DCAT Serializer - -The `ckanext.dcat.processors.RDFSerializer` class generates RDF serializations in different -formats from CKAN dataset dicts, like the ones returned by [`package_show`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_show) or [`package_search`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_search). - -Here is an example of how to use it: - -```python - - from ckanext.dcat.processors import RDFSerializer - - # Serializing a single dataset - - dataset = get_action('package_show')({}, {'id': 'my-dataset'}) - - serializer = RDFserializer() - - dataset_ttl = serializer.serialize_dataset(dataset, _format='turtle') - - - # Serializing the whole catalog (or rather part of it) - - datasets = get_action('package_search')({}, {'q': '*:*', 'rows': 50}) - - serializer = RDFserializer() - - catalog_xml = serializer.serialize_catalog({'title': 'My catalog'}, - dataset_dicts=datasets, - _format='xml') - - # Creating and RDFLib graph from a single dataset - - dataset = get_action('package_show')({}, {'id': 'my-dataset'}) - - serializer = RDFserializer() - - dataset_reference = serializer.graph_from_dataset(dataset) - - # serializer.g now contains the full dataset graph, an RDFLib Graph class - -``` - -The serializer uses customizable [profiles](#profiles) to generate an RDF graph (an [RDFLib Graph class](https://rdflib.readthedocs.org/en/latest/apidocs/rdflib.html#rdflib.graph.Graph)). -By default these use the [mapping](#rdf-dcat-to-ckan-dataset-mapping) described in the previous section. - -In some cases, if the default CKAN field that maps to a DCAT property is not present, some other fallback -values will be used instead. For instance, if the `contact_email` field is not found, `maintainer_email` -and `author_email` will be used (if present) for the email property of the `adms:contactPoint` property. 
- -Note that the serializer will look both for a first level field or an extra field with the same key, ie both -the following values will be used for `dct:accrualPeriodicity`: - - { - "name": "my-dataset", - "frequency": "monthly", - ... - } - - { - "name": "my-dataset", - "extras": [ - {"key": "frequency", "value": "monthly"}, - ] - ... - } - -Once the dataset graph has been obtained, this is serialized into a text format using [RDFLib](https://rdflib.readthedocs.org/), -so any format it supports can be obtained (common formats are 'xml', 'turtle' or 'json-ld'). - -### Inherit license from the dataset as fallback in distributions -It is possible to inherit the license from the dataset to the distributions, but only if there is no license defined in the resource yet. By default the license is not inherited from the dataset. This can be activated by setting the following parameter in the CKAN config file: - - ckanext.dcat.resource.inherit.license = True - - -## Profiles - -Both the parser and the serializer use profiles to allow customization of how the values defined in the RDF graph are mapped to CKAN and viceversa. - -Profiles define : - -* How the RDF graph values map into CKAN fields, ie how the RDF is parsed into CKAN datasets -* How CKAN fields map to an RDF graph, which can be then serialized -* How the CKAN catalog metadata maps to an RDF graph, which can be then serialized - -They essentially define the mapping between DCAT and CKAN. - -In most cases the default profile will provide a good mapping that will cover most properties described in the DCAT standard. If you want to extract extra fields defined in the RDF, are using a custom schema or -need custom logic, you can write a custom to profile that extends or replaces the default one. - -The profiles currently shipped with the extension are mostly based in the -[DCAT application profile for data portals in Europe](https://joinup.ec.europa.eu/asset/dcat_application_profile/description). 
As mentioned before though, they should be generic enough for most DCAT based representations. - -Sites that want to support a particular version of the DCAT-AP can enable a specific profile using one of the profiles below: - -* [DCAT-AP v3](https://semiceu.github.io/DCAT-AP/releases/3.0.0) (default): `euro_dcat_ap_3` -* [DCAT-AP v2.1.0](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210): `euro_dcat_ap_2` -* [DCAT-AP v1.1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11): `euro_dcat_ap` - -This plugin also contains a profile to serialize a CKAN dataset to a [schema.org Dataset](http://schema.org/Dataset) called `schemaorg`. This is especially useful to provide [JSON-LD structured data](#structured-data). - -To define which profiles to use you can: - -1. Set the [`ckanext.dcat.rdf.profiles`](configuration.md#ckanextdcatrdfprofiles) configuration option on your CKAN configuration file: - - ckanext.dcat.rdf.profiles = euro_dcat_ap sweden_dcat_ap - -2. When initializing a parser or serializer class, pass the profiles to be used as a parameter, eg: - -```python - - parser = RDFParser(profiles=['euro_dcat_ap', 'sweden_dcat_ap']) - - serializer = RDFSerializer(profiles=['euro_dcat_ap', 'sweden_dcat_ap']) -``` - -Note that in both cases the order in which you define them is important, as it will be the one that the profiles will be run on. - - -### Writing custom profiles - -Internally, profiles are classes that define a particular set of methods called during the parsing process. -For instance, the `parse_dataset` method is called on each DCAT dataset found when parsing an RDF file, and should return a CKAN dataset. -Conversely, the `graph_from_dataset` will be called when requesting an RDF representation for a dataset, and will need to generate the necessary RDF graph. 
- -Custom profiles should always extend the `ckanext.dcat.profiles.RDFProfile` class. This class has several helper -functions to make getting metadata from the RDF graph easier. These include helpers for getting fields for FOAF and VCard entities like the ones -used to define publishers or contact points. Check the source code of `ckanex.dcat.profiles.py` to see what is available. - -Profiles can extend other profiles to avoid repeating rules, or can be completely independent. - -The following example shows a complete example of a profile built on top of the European DCAT-AP profile (`euro_dcat_ap`): - -```python - - from rdflib.namespace import Namespace - from ckanext.dcat.profiles import RDFProfile - - DCT = Namespace("http://purl.org/dc/terms/") - - - class SwedishDCATAPProfile(RDFProfile): - ''' - An RDF profile for the Swedish DCAT-AP recommendation for data portals - - It requires the European DCAT-AP profile (`euro_dcat_ap`) - ''' - - def parse_dataset(self, dataset_dict, dataset_ref): - - # Spatial label - spatial = self._object(dataset_ref, DCT.spatial) - if spatial: - spatial_label = self.g.label(spatial) - if spatial_label: - dataset_dict['extras'].append({'key': 'spatial_text', - 'value': str(spatial_label)}) - - return dataset_dict - - def graph_from_dataset(self, dataset_dict, dataset_ref): - - g = self.g - - spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri') - spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text') - - if spatial_uri: - spatial_ref = URIRef(spatial_uri) - else: - spatial_ref = BNode() - - if spatial_text: - g.add((dataset_ref, DCT.spatial, spatial_ref)) - g.add((spatial_ref, RDF.type, DCT.Location)) - g.add((spatial_ref, RDFS.label, Literal(spatial_text))) -``` - -Note how the dataset dict is passed between profiles so it can be further tweaked. 
- -Extensions define their available profiles using the `ckan.rdf.profiles` in the `setup.py` file, as in this [example](https://github.com/ckan/ckanext-dcat/blob/cc5fcc7be0be62491301db719ce597aec7c684b0/setup.py#L37:L38) from this same extension: - - [ckan.rdf.profiles] - euro_dcat_ap=ckanext.dcat.profiles:EuropeanDCATAPProfile - euro_dcat_ap_2=ckanext.dcat.profiles:EuropeanDCATAP2Profile - schemaorg=ckanext.dcat.profiles:SchemaOrgProfile - -## XML DCAT harvester (deprecated) - -The old DCAT XML harvester (`dcat_xml_harvester`) is now deprecated, in favour of the [RDF harvester](#rdf-dcat-harvester). -Loading it on the ini file will result in an exception on startup. - -The XML serialization described in the [spec.datacatalogs.org](http://spec.datacatalogs.org/#datasets_serialization_format) site is a valid RDF/XML one, so changing the harvester should have no effect. There might be slight differences in the way CKAN fields are created though, check [Compatibility mode](#compatibility-mode) for more details. - -## Translation of fields - -The `dcat` plugin automatically translates the keys of the dcat fields used in the frontend. -This makes it very easy to display the fields in the current language. - -To disable this behavior, you can set the following config value in your ini file (default: True): - - ckanext.dcat.translate_keys = False - - -## Structured data and Google Dataset Search indexing - -There are plugins available to add [structured data](https://developers.google.com/search/docs/guides/intro-structured-data) to dataset pages to provide richer metadata for search engines crawling your site. One of the most well known is [Google Dataset Search](https://toolbox.google.com/datasetsearch). The `structured_data` plugin will add the necessary markup in order to get your datasets indexed by Google Dataset Search. This markup is a JSON-LD snippet that uses the [schema.org](https://schema.org) vocabulary to describe the dataset. 
- -To add [structured data](https://developers.google.com/search/docs/guides/intro-structured-data) to dataset pages, activate the `structured_data` and `dcat` plugins in your ini file: - - ckan.plugins = dcat structured_data - -By default this uses the `schemaorg` profile (see [profiles](#profiles)) to serialize the dataset to JSON-LD, which is then added to the dataset detail page. -To change the schema, you have to override the Jinja template block called `structured_data` in [`templates/package/read_base.html`](https://github.com/ckan/ckanext-dcat/blob/master/ckanext/dcat/templates/package/read_base.html) and call the template helper function with different parameters: - - {% block structured_data %} - - {% endblock %} - -Example output of structured data in JSON-LD: - - < ... > - - - - - -## CLI - -The `ckan dcat` command offers utilites to transform between DCAT RDF Serializations and CKAN datasets (`ckan dcat consume`) and -viceversa (`ckan dcat produce`). In both cases the input can be provided as a path to a file: - - ckan dcat consume -f ttl examples/dcat/dataset.ttl - - ckan dcat produce -f jsonld examples/ckan/ckan_datasets.json - -or be read from stdin: - - ckan dcat consume - - -The latter form allows chaininig commands for more complex metadata processing, e.g.: - - curl https://demo.ckan.org/api/action/package_search | jq .result.results | ckan dcat produce -f jsonld - - -For the full list of options check `ckan dcat consume --help` and `ckan dcat produce --help`. - -## Configuration reference - - - -### General settings - -#### ckanext.dcat.rdf.profiles - -Example: - -``` -ckanext.dcat.rdf.profiles = euro_dcat_ap_2 my_local_ap -``` - -Default value: `euro_dcat_ap_2` - -RDF profiles to use when parsing and serializing. See https://github.com/ckan/ckanext-dcat#profiles -for more details. 
- - -#### ckanext.dcat.translate_keys - -Default value: `True` - -If set to True, the plugin will automatically translate the keys of the DCAT -fields used in the frontend (at least those present in the `ckanext/dcat/i18n` -po files). - - -### Parsers / Serializers settings - -#### ckanext.dcat.output_spatial_format - -Default value: `wkt` - -Format to use for geometries when serializing RDF documents. The default is -recommended as is the format expected by GeoDCAT, alternatively you can -use `geojson` (or both, which will make SHACL validation fail) - - -#### ckanext.dcat.resource.inherit.license - -Default value: `False` - -If there is no license defined for a resource / distribution, inherit it from -the dataset. - - -#### ckanext.dcat.normalize_ckan_format - -Default value: `True` - -When true, the resource label will be tried to match against the standard -list of CKAN formats (https://github.com/ckan/ckan/blob/master/ckan/config/resource_formats.json) -This allows for instance to populate the CKAN resource format field -with a value that view plugins, etc will understand (`csv`, `xml`, etc.) - - -#### ckanext.dcat.clean_tags - -Default value: `False` - -Remove special characters from keywords (use the old munge_tag() CKAN function). -This is generally not needed. - - -### Endpoints settings - -#### ckanext.dcat.enable_rdf_endpoints - -Default value: `True` - -Whether to expose the catalog and dataset endpoints with the RDF DCAT -serializations. - -#### ckanext.dcat.base_uri - -Example: - -``` -https://my-site.org/uris/ -``` - -Base URI to use when generating URIs for all entities. It needs to be a valid URI value. - -#### ckanext.dcat.catalog_endpoint - -Example: - -``` -ckanext.dcat.catalog_endpoint = /dcat/catalog/{_format} -``` - -Default value: `/catalog.{_format}` - -Custom route for the catalog endpoint. It should start with `/` and include the -`{_format}` placeholder. 
- - -#### ckanext.dcat.datasets_per_page - -Default value: `100` - -Default number of datasets returned by the catalog endpoint. - - -#### ckanext.dcat.enable_content_negotiation - -Default value: `False` - -Enable content negotiation in the main catalog and dataset endpoints. Note that -setting this to True overrides the core `home.index` and `dataset.read` endpoints. - - -### Harvester settings - -#### ckanext.dcat.max_file_size - -Default value: `50` - -Maximum file size that will be downloaded for parsing by the harvesters - - -#### ckanext.dcat.expose_subcatalogs - -Default value: `False` - -Store information about the origin catalog when harvesting datasets. -See https://github.com/ckan/ckanext-dcat#transitive-harvesting for more details. - - -### Deprecated options (will be removed in future versions) - -#### ckanext.dcat.compatibility_mode - -Default value: `False` - -Whether to modify some fields to maintain compatibility with previous versions -of the ckanext-dcat parsers. - - -#### ckanext.dcat.json_endpoint +* A base [mapping](https://docs.ckan.org/projects/ckanext-dcat/en/latest/mapping) between DCAT and CKAN datasets and vice versa (compatible with [DCAT-AP v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11), [DCAT-AP v2.1](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210) and [DCAT-AP v3](https://semiceu.github.io/DCAT-AP/releases/3.0.0/)). -Default value: `/dcat.json` +* An [RDF Parser](https://docs.ckan.org/projects/ckanext-dcat/en/latest/profiles#rdf-dcat-parser) that allows reading RDF serializations in different formats and extracting CKAN dataset dicts, using customizable [profiles](https://docs.ckan.org/projects/ckanext-dcat/en/latest/profiles#profiles).
-Custom route to expose the legacy JSON endpoint +* An [RDF Serializer](https://docs.ckan.org/projects/ckanext-dcat/en/latest/profiles#rdf-dcat-serializer) that allows transforming CKAN dataset metadata to different semantic formats, also allowing customizable [profiles](https://docs.ckan.org/projects/ckanext-dcat/en/latest/profiles#profiles). - ## Running the Tests @@ -1259,7 +54,7 @@ To create a new release, follow these steps: ## Acknowledgements -Work on ckanext-dcat has been made possible by: +Work on ckanext-dcat has been made possible in part by: * the Government of Sweden and Vinnova, as part of work on [Öppnadata.se](http://oppnadata.se), the Swedish Open Data Portal. * [FIWARE](https://www.fiware.org), a project funded by the European Commission to integrate different technologies to offer connected cloud services from a single platform. From eb8579071c5a4baa57c721ee441efdf0394025a1 Mon Sep 17 00:00:00 2001 From: amercader Date: Fri, 30 Aug 2024 12:22:19 +0200 Subject: [PATCH 8/8] Include changelog in documentation --- CHANGELOG.md | 199 +++++++++++++++++++++++----------------------- docs/changelog.md | 1 + mkdocs.yml | 5 +- 3 files changed, 103 insertions(+), 102 deletions(-) create mode 100644 docs/changelog.md diff --git a/CHANGELOG.md b/CHANGELOG.md index ea2c482f..3f9b7f9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,164 +12,161 @@ so by adding the `euro_dcat_ap_scheming` profile at the end of their profile chain (e.g. `ckanext.dcat.rdf.profiles = euro_dcat_ap_2 euro_dcat_ap_scheming`), which will modify the existing profile outputs to the expected format by the scheming validators. Sample schemas are provided - in the `ckanext/dcat/schemas` folder. See the [documentation](https://github.com/ckan/ckanext-dcat?tab=readme-ov-file#schemas) - for all details. Some highlights of the new scheming based profiles: + in the `ckanext/dcat/schemas` folder. See the [documentation](getting-started.md#schemas) + for all details.
Some highlights of the new scheming-based profiles ([#281](https://github.com/ckan/ckanext-dcat/pull/281)): * Actual list support in the API output for list properties like `dct:language` * Multiple objects now allowed for properties like `dcat:ContactPoint`, `dct:spatial` or `dct:temporal` * Custom validators for date values that allow `xsd:gYear`, `xsd:gYearMonth`, `xsd:date` and `xsd:dateTime` - - (#281) + * [SHACL validation](https://github.com/SEMICeu/DCAT-AP/tree/master/releases/2.1.1) for DCAT-AP 2.1.1 profile (scheming and legacy). - SHACL validation made surface the following issues in the existing profiles, which are now fixed: - * Cast `dcat:byteSize` and `dcat:spatialResolutionInMeters` as Decimal, not float - * Allow only one value of `dcat:spatialResolutionInMeters` and `dcat:temporalResolution` - * Only output the WKT version of geometries in `locn:geometry`, `dcat:bbox` and `dcat:centroid`. Sites that for some reason - require GeoJSON (or both) can use the `ckanext.dcat.output_spatial_format` config option - to choose which format to use - * When using the `euro_dcat_ap_2` profile, don't output temporal extent namespaced + SHACL validation surfaced the following issues in the existing profiles, which are now fixed ([#288](https://github.com/ckan/ckanext-dcat/pull/288)): + * Cast `dcat:byteSize` and `dcat:spatialResolutionInMeters` as Decimal, not float + * Allow only one value of `dcat:spatialResolutionInMeters` and `dcat:temporalResolution` + * Only output the WKT version of geometries in `locn:geometry`, `dcat:bbox` and `dcat:centroid`.
Sites that for some reason + require GeoJSON (or both) can use the `ckanext.dcat.output_spatial_format` config option + to choose which format to use + * When using the `euro_dcat_ap_2` profile, don't output temporal extent namespaced both with `schema` and `dcat`, just with the latter (`dcat:startDate` and `dcat:endDate`) - (#288) -* New `ckan dcat consume` and `ckan dcat produce` CLI commands (#279) -* Parse dcat:spatialResolutionInMeters as float (#285) -* Split profile classes into their own separate files (#282) -* Catch Not Authorized in View (#280) -* CKAN 2.11 support and requirements updates (#270) +* CKAN 2.11 support and requirements updates ([#270](https://github.com/ckan/ckanext-dcat/pull/270)) +* New `ckan dcat consume` and `ckan dcat produce` CLI commands ([#279](https://github.com/ckan/ckanext-dcat/pull/279)) +* Revamped documentation, now hosted at [https://docs.ckan.org/projects/ckanext-dcat](https://docs.ckan.org/projects/ckanext-dcat) ([#296](https://github.com/ckan/ckanext-dcat/pull/296)) +* Parse dcat:spatialResolutionInMeters as float ([#285](https://github.com/ckan/ckanext-dcat/pull/285)) +* Split profile classes into their own separate files ([#282](https://github.com/ckan/ckanext-dcat/pull/282)) +* Catch Not Authorized in View ([#280](https://github.com/ckan/ckanext-dcat/pull/280)) ## [v1.7.0](https://github.com/ckan/ckanext-dcat/compare/v1.6.0...v1.7.0) - 2024-04-04 -* Adds support for the latest Hydra vocabulary. For backward compatibility, the old properties are still supported but marked as deprecated. (#267) +* Adds support for the latest Hydra vocabulary. For backward compatibility, the old properties are still supported but marked as deprecated. 
([#267](https://github.com/ckan/ckanext-dcat/pull/267)) ## [v1.6.0](https://github.com/ckan/ckanext-dcat/compare/v1.5.1...v1.6.0) - 2024-02-29 -* Add support for `DCATAP.applicableLegislation` and `DCATAP.hvdCategory` to the `euro_dcat_ap_2` profile (#262) -* Improve access service tests (#258) -* Fix missing access service items when parsing dataset (#256) +* Add support for `DCATAP.applicableLegislation` and `DCATAP.hvdCategory` to the `euro_dcat_ap_2` profile ([#262](https://github.com/ckan/ckanext-dcat/pull/262)) +* Improve access service tests ([#258](https://github.com/ckan/ckanext-dcat/pull/258)) +* Fix missing access service items when parsing dataset ([#256](https://github.com/ckan/ckanext-dcat/pull/256)) ## [v1.5.1](https://github.com/ckan/ckanext-dcat/compare/v1.5.0...v1.5.1) - 2023-06-20 -* Fix tests to work with `ckanext-harvest >= 1.5.4`. (#250) -* Add references for dcat:accessService to the `euro_dcat_ap_2` profile (#251) +* Fix tests to work with `ckanext-harvest >= 1.5.4`. 
([#250](https://github.com/ckan/ckanext-dcat/pull/250)) +* Add references for dcat:accessService to the `euro_dcat_ap_2` profile ([#251](https://github.com/ckan/ckanext-dcat/pull/251)) ## [v1.5.0](https://github.com/ckan/ckanext-dcat/compare/v1.4.0...v1.5.0) - 2023-05-02 -* Remove support for old CKAN versions prior 2.9 and Python 2 (#244) -* Update hooks to support CKAN 2.10 (#241) -* Fix description for RDF endpoints in README (#246) -* Fix media type for links to the Turtle representation in HTML templates (#242) -* Ignore already deleted packages when deleting (#238) -* Add support for dcat:accessService in dcat:Distribution (#235) +* Remove support for old CKAN versions prior 2.9 and Python 2 ([#244](https://github.com/ckan/ckanext-dcat/pull/244)) +* Update hooks to support CKAN 2.10 ([#241](https://github.com/ckan/ckanext-dcat/pull/241)) +* Fix description for RDF endpoints in README ([#246](https://github.com/ckan/ckanext-dcat/pull/246)) +* Fix media type for links to the Turtle representation in HTML templates ([#242](https://github.com/ckan/ckanext-dcat/pull/242)) +* Ignore already deleted packages when deleting ([#238](https://github.com/ckan/ckanext-dcat/pull/238)) +* Add support for dcat:accessService in dcat:Distribution ([#235](https://github.com/ckan/ckanext-dcat/pull/235)) ## [v1.4.0](https://github.com/ckan/ckanext-dcat/compare/v1.3.0...v1.4.0) - 2022-12-05 -* RDF serialization: Add fallback values for resource dates (#233) -* Add option for fallback distribution license if missing (#231) +* RDF serialization: Add fallback values for resource dates ([#233](https://github.com/ckan/ckanext-dcat/pull/233)) +* Add option for fallback distribution license if missing ([#231](https://github.com/ckan/ckanext-dcat/pull/231)) ## [v1.3.0](https://github.com/ckan/ckanext-dcat/compare/v1.2.0...v1.3.0) - 2022-08-01 -* Fix assert expressions in tests (#218) -* Fix unicode encoding error on Python 2 (#225) -* Support (partial, not complete) for DCAT-AP 2.1 (#220) 
- -#### Changed default profile -With #220 the default profile has changed from `euro_dcat_ap` to `euro_dcat_ap_2`. The following properties are additionally supported by default: -* dcat:Dataset - * dcat:bbox und dcat:centroid (in dct:Location of dct:spatial) - * dcat:startDate, dcat:endDate, time:hasBeginning, time:hasEnd (in dct:PeriodOfTime of dct:temporal) - * dcat:spatialResolutionInMeters - * dcat:temporalResolution - * dct:isReferencedBy -* dcat:Distribution - * dcatap:availability - * dcat:compressFormat - * dcat:packageFormat - -How the default profile can be changed is described in the Documentation under [profiles](https://github.com/ckan/ckanext-dcat/#profiles). +* Fix assert expressions in tests ([#218](https://github.com/ckan/ckanext-dcat/pull/218)) +* Fix unicode encoding error on Python 2 ([#225](https://github.com/ckan/ckanext-dcat/pull/225)) +* Support (partial, not complete) for DCAT-AP 2.1 ([#220](https://github.com/ckan/ckanext-dcat/pull/220)). The following properties are additionally supported by default: + * dcat:Dataset + * dcat:bbox and dcat:centroid (in dct:Location of dct:spatial) + * dcat:startDate, dcat:endDate, time:hasBeginning, time:hasEnd (in dct:PeriodOfTime of dct:temporal) + * dcat:spatialResolutionInMeters + * dcat:temporalResolution + * dct:isReferencedBy + * dcat:Distribution + * dcatap:availability + * dcat:compressFormat + * dcat:packageFormat + +!!! Note "Changed default profile" + With [#220](https://github.com/ckan/ckanext-dcat/pull/220) the default profile has changed from `euro_dcat_ap` to `euro_dcat_ap_2`.
## [v1.2.0](https://github.com/ckan/ckanext-dcat/compare/v1.1.3...v1.2.0) - 2022-05-25 -* Support for CKAN 2.10 and Python 3.9 (#208) -* Upgrade RDFLib version (#213) -* Support URIs in more fields of the default profile (#214) -* Make HTTP-Response size configurable (#215) -* Increase harvester get content chunk size (#217) +* Support for CKAN 2.10 and Python 3.9 ([#208](https://github.com/ckan/ckanext-dcat/pull/208)) +* Upgrade RDFLib version ([#213](https://github.com/ckan/ckanext-dcat/pull/213)) +* Support URIs in more fields of the default profile ([#214](https://github.com/ckan/ckanext-dcat/pull/214)) +* Make HTTP-Response size configurable ([#215](https://github.com/ckan/ckanext-dcat/pull/215)) +* Increase harvester get content chunk size ([#217](https://github.com/ckan/ckanext-dcat/pull/217)) ## [v1.1.3](https://github.com/ckan/ckanext-dcat/compare/v1.1.3...v1.1.2) - 2021-11-05 -* Fix behavior if `publisher_uri` is not available (#201) -* Also process URIRef in rights statements (#200) +* Fix behavior if `publisher_uri` is not available ([#201](https://github.com/ckan/ckanext-dcat/pull/201)) +* Also process URIRef in rights statements ([#200](https://github.com/ckan/ckanext-dcat/pull/200)) ## [v1.1.2](https://github.com/ckan/ckanext-dcat/compare/v1.1.2...v1.1.1) - 2021-06-22 -* Use safer encoder for Structured Data output (#198) -* Fix: use catalog_uri logic for pagination URIs (#197) -* Introduce new interface method `after_parsing` (#196) +* Use safer encoder for Structured Data output ([#198](https://github.com/ckan/ckanext-dcat/pull/198)) +* Fix: use catalog_uri logic for pagination URIs ([#197](https://github.com/ckan/ckanext-dcat/pull/197)) +* Introduce new interface method `after_parsing` ([#196](https://github.com/ckan/ckanext-dcat/pull/196)) ## [v1.1.1](https://github.com/ckan/ckanext-dcat/compare/v1.1.0...v1.1.1) - 2021-03-17 -* Fix harvest encoding error on py3 (#189) -* Fix py3 syntax error (#184) -* Fixed Internal server error on login (#181) 
-* Remove Beautifulsoup requirement (#195) +* Fix harvest encoding error on py3 ([#189](https://github.com/ckan/ckanext-dcat/pull/189)) +* Fix py3 syntax error ([#184](https://github.com/ckan/ckanext-dcat/pull/184)) +* Fixed Internal server error on login ([#181](https://github.com/ckan/ckanext-dcat/pull/181)) +* Remove Beautifulsoup requirement ([#195](https://github.com/ckan/ckanext-dcat/pull/195)) * Migrate tests to GitHub Actions ## [v1.1.0](https://github.com/ckan/ckanext-dcat/compare/v1.0.0...v1.1.0) - 2020-03-12 -* Python 3 support and new pytest based test suite (#174) -* Fix `after_show - set_titles` in plugins.py (#172) -* Add support for DCT.rightsStatement in DCT.accessRights and DCT.rights (#177) -* Add support for additional vcard representations (#178) -* Fix format normalization configuration (#175) -* Introduce the possibility to modify package update/create schema (#176) +* Python 3 support and new pytest based test suite ([#174](https://github.com/ckan/ckanext-dcat/pull/174)) +* Fix `after_show - set_titles` in plugins.py ([#172](https://github.com/ckan/ckanext-dcat/pull/172)) +* Add support for DCT.rightsStatement in DCT.accessRights and DCT.rights ([#177](https://github.com/ckan/ckanext-dcat/pull/177)) +* Add support for additional vcard representations ([#178](https://github.com/ckan/ckanext-dcat/pull/178)) +* Fix format normalization configuration ([#175](https://github.com/ckan/ckanext-dcat/pull/175)) +* Introduce the possibility to modify package update/create schema ([#176](https://github.com/ckan/ckanext-dcat/pull/176)) ## [v1.0.0](https://github.com/ckan/ckanext-dcat/compare/v0.0.9...v1.0.0) - 2019-11-07 -* Updating the URLs to dataportals.org (#145) -* Handle import stage errors (#149) -* Pass `q` and `fq` parameters in catalog endpoint (#152) -* Include templates in package (#154) -* Ignore auth in internal search call (#156) -* Support URIRef for dct:language (#158) -* Support JSON-LD catalogs with @graph (#159) -* Make read keywords 
re-usable (#160) -* Extract read datasets from db to make it re-usable (#161) +* Updating the URLs to dataportals.org ([#145](https://github.com/ckan/ckanext-dcat/pull/145)) +* Handle import stage errors ([#149](https://github.com/ckan/ckanext-dcat/pull/149)) +* Pass `q` and `fq` parameters in catalog endpoint ([#152](https://github.com/ckan/ckanext-dcat/pull/152)) +* Include templates in package ([#154](https://github.com/ckan/ckanext-dcat/pull/154)) +* Ignore auth in internal search call ([#156](https://github.com/ckan/ckanext-dcat/pull/156)) +* Support URIRef for dct:language ([#158](https://github.com/ckan/ckanext-dcat/pull/158)) +* Support JSON-LD catalogs with @graph ([#159](https://github.com/ckan/ckanext-dcat/pull/159)) +* Make read keywords re-usable ([#160](https://github.com/ckan/ckanext-dcat/pull/160)) +* Extract read datasets from db to make it re-usable ([#161](https://github.com/ckan/ckanext-dcat/pull/161)) ## [v0.0.9](https://github.com/ckan/ckanext-dcat/compare/v0.0.8...v0.0.9) - 2019-01-10 -* Make _object_value_int more robust by accepting decimals as well (#133) -* Prefer default language values for some Literal nodes (#143) -* Improved dct:format and dcat:mediaType handling (#144) -* Assign URIRef or Literal types based on content (#140) +* Make _object_value_int more robust by accepting decimals as well ([#133](https://github.com/ckan/ckanext-dcat/pull/133)) +* Prefer default language values for some Literal nodes ([#143](https://github.com/ckan/ckanext-dcat/pull/143)) +* Improved dct:format and dcat:mediaType handling ([#144](https://github.com/ckan/ckanext-dcat/pull/144)) +* Assign URIRef or Literal types based on content ([#140](https://github.com/ckan/ckanext-dcat/pull/140)) ## [v0.0.8](https://github.com/ckan/ckanext-dcat/compare/v0.0.7...v0.0.8) - 2018-10-05 * Support for CKAN >= 2.8 -* Schema.org mapping improvements (#120, #139) -* Fix handling of downloadURL and accessURL (#130) +* Schema.org mapping improvements ([#120](https://github.com/ckan/ckanext-dcat/pull/120),
[#139](https://github.com/ckan/ckanext-dcat/pull/139)) +* Fix handling of downloadURL and accessURL ([#130](https://github.com/ckan/ckanext-dcat/pull/130)) * Improve support for custom schemas when generating guids -* Improvements and refactoring of data.json harvester (#116) -* Add RDF.type to resource checksum (#132) -* Improve email addresses handling (#134) -* Escape and clean URL references (#138) +* Improvements and refactoring of data.json harvester ([#116](https://github.com/ckan/ckanext-dcat/pull/116)) +* Add RDF.type to resource checksum ([#132](https://github.com/ckan/ckanext-dcat/pull/132)) +* Improve email addresses handling ([#134](https://github.com/ckan/ckanext-dcat/pull/134)) +* Escape and clean URL references ([#138](https://github.com/ckan/ckanext-dcat/pull/138)) ## [v0.0.7](https://github.com/ckan/ckanext-dcat/compare/v0.0.6...v0.0.7) - 2018-02-16 -* Support for embedding Schema.org structured data in dataset pages (#75) -* Improve the error handling in the harvesting gather and import stage (#95) -* Avoid resource re-creation on harvesting (#91) -* Infer dataset licence from the distribution ones (#42) -* Interface for requests Session in harvesters (#98) -* Support for transitive harvesting (#96) -* Support fot cleaning tags in harvester (#103) +* Support for embedding Schema.org structured data in dataset pages ([#75](https://github.com/ckan/ckanext-dcat/pull/75)) +* Improve the error handling in the harvesting gather and import stage ([#95](https://github.com/ckan/ckanext-dcat/pull/95)) +* Avoid resource re-creation on harvesting ([#91](https://github.com/ckan/ckanext-dcat/pull/91)) +* Infer dataset licence from the distribution ones ([#42](https://github.com/ckan/ckanext-dcat/pull/42)) +* Interface for requests Session in harvesters ([#98](https://github.com/ckan/ckanext-dcat/pull/98)) +* Support for transitive harvesting ([#96](https://github.com/ckan/ckanext-dcat/pull/96)) +* Support for cleaning tags in harvester
([#103](https://github.com/ckan/ckanext-dcat/pull/103)) ## [v0.0.6](https://github.com/ckan/ckanext-dcat/compare/v0.0.5...v0.0.6) - 2017-02-24 * Use Resources rather than Literals for dcat:landingPage, dcat:accessURL, - dcat:downloadURL, foaf:homepage, dcat:theme (#66) -* Support for pagination on RDF Harvester (#70) -* Add missing DCAT fields on the serialization, dct:type and dct:provenance (#71) -* Add MANIFEST.in to ensure i18n files are include in package (#76) -* Add before/after create/update hooks to IDCATRDFHarvester (#77) -* Fix serialization of numeric values (#73) + dcat:downloadURL, foaf:homepage, dcat:theme ([#66](https://github.com/ckan/ckanext-dcat/pull/66)) +* Support for pagination on RDF Harvester ([#70](https://github.com/ckan/ckanext-dcat/pull/70)) +* Add missing DCAT fields on the serialization, dct:type and dct:provenance ([#71](https://github.com/ckan/ckanext-dcat/pull/71)) +* Add MANIFEST.in to ensure i18n files are included in package ([#76](https://github.com/ckan/ckanext-dcat/pull/76)) +* Add before/after create/update hooks to IDCATRDFHarvester ([#77](https://github.com/ckan/ckanext-dcat/pull/77)) +* Fix serialization of numeric values ([#73](https://github.com/ckan/ckanext-dcat/pull/73)) diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 00000000..786b75d5 --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1 @@ +--8<-- "CHANGELOG.md" diff --git a/mkdocs.yml b/mkdocs.yml index 918baea8..b23d142e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -55,6 +55,7 @@ markdown_extensions: - pymdownx.superfences - pymdownx.tabbed: alternate_style: true + - pymdownx.snippets extra_css: @@ -72,4 +73,6 @@ nav: - Google Dataset Search: 'google-dataset-search.md' - CLI: 'cli.md' - Configuration reference: 'configuration.md' - - CHANGELOG: 'https://github.com/ckan/ckanext-dcat/blob/master/CHANGELOG.md' +# - CHANGELOG: 'https://github.com/ckan/ckanext-dcat/blob/master/CHANGELOG.md' + + - CHANGELOG: 'changelog.md'