Skip to content

Commit

Permalink
Merge pull request #1833 from depositar/provider-ckan
Browse files Browse the repository at this point in the history
[MRG] Add a repo provider for CKAN datasets
  • Loading branch information
yuvipanda authored Jul 6, 2024
2 parents 0d5dccd + 2945d83 commit 497c2ac
Show file tree
Hide file tree
Showing 8 changed files with 149 additions and 3 deletions.
2 changes: 2 additions & 0 deletions binderhub/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
from .ratelimit import RateLimiter
from .registry import DockerRegistry
from .repoproviders import (
CKANProvider,
DataverseProvider,
FigshareProvider,
GistRepoProvider,
Expand Down Expand Up @@ -586,6 +587,7 @@ def _default_build_namespace(self):
"figshare": FigshareProvider,
"hydroshare": HydroshareProvider,
"dataverse": DataverseProvider,
"ckan": CKANProvider,
},
config=True,
help="""
Expand Down
3 changes: 2 additions & 1 deletion binderhub/event-schemas/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
"Zenodo",
"Figshare",
"Hydroshare",
"Dataverse"
"Dataverse",
"CKAN"
],
"description": "Provider for the repository being launched"
},
Expand Down
1 change: 1 addition & 0 deletions binderhub/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"figshare": "Figshare",
"hydroshare": "Hydroshare",
"dataverse": "Dataverse",
"ckan": "CKAN",
}


Expand Down
88 changes: 87 additions & 1 deletion binderhub/repoproviders.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import time
import urllib.parse
from datetime import datetime, timedelta, timezone
from urllib.parse import urlparse
from urllib.parse import parse_qs, urlencode, urlparse

import escapism
from prometheus_client import Gauge
Expand Down Expand Up @@ -475,6 +475,92 @@ def get_build_slug(self):
return f"hydroshare-{self.record_id}"


class CKANProvider(RepoProvider):
    """Provide contents of a CKAN dataset.

    Users must provide a spec consisting of the CKAN dataset URL. The spec is
    URL-unquoted once at construction time and stored as ``self.repo``.

    The dataset URL may optionally pin a specific activity (revision) of the
    dataset, either with a ``/history/<activity-id>`` path suffix or with an
    ``activity_id`` query parameter.
    """

    name = Unicode("CKAN")

    display_name = "CKAN dataset"

    labels = {
        "text": "CKAN dataset URL (https://demo.ckan.org/dataset/sample-dataset-1)",
        "tag_text": "Git ref (branch, tag, or commit)",
        "ref_prop_disabled": True,
        "label_prop_disabled": True,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.repo = urllib.parse.unquote(self.spec)

    @staticmethod
    def _find_activity_id(dataset_url_parts, query):
        """Return the activity ID referenced by a dataset URL, or None.

        ``dataset_url_parts`` are the path segments following ``/dataset/``
        (the first one being the dataset ID) and ``query`` is the raw query
        string of the URL.

        A non-empty ``/history/<activity-id>`` path segment takes precedence
        over an ``activity_id`` query parameter.
        """
        # Skip index 0: that segment is the dataset ID itself, which may
        # legitimately be called "history".
        trailing_parts = dataset_url_parts[1:]
        if "history" in trailing_parts:
            after_history = trailing_parts[trailing_parts.index("history") + 1 :]
            # Guard against URLs that end in "/history" or "/history/".
            if after_history and after_history[0]:
                return after_history[0]

        activity_ids = parse_qs(query).get("activity_id")
        return activity_ids[0] if activity_ids else None

    @staticmethod
    def _version_from_timestamp(date):
        """Convert a ``metadata_modified`` string into whole epoch seconds.

        The timestamp is interpreted as UTC and returned as a string with the
        sub-second part truncated.
        """
        try:
            parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
        except ValueError:
            # ``datetime.isoformat()`` omits the fractional part when the
            # microseconds are zero; presumably CKAN can emit such values,
            # so accept timestamps without a fraction as well.
            parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
        epoch = parsed_date.replace(tzinfo=timezone.utc).timestamp()
        # truncate the timestamp
        return str(int(epoch))

    async def get_resolved_ref(self):
        """Resolve the dataset URL to a ``<dataset-id>.v<epoch>`` record ID.

        Queries the CKAN action API of the instance hosting the dataset and
        derives a version from the ``metadata_modified`` field of the result.

        Returns None when the URL does not point at a dataset or when the
        API request raises an ``HTTPError``. On success the value is also
        stored as ``self.record_id``.
        """
        parsed_repo = urlparse(self.repo)

        # CKAN may be under a URL prefix, and we should accommodate that.
        # Split on the first "/dataset/" only, so that a later occurrence of
        # the marker in the path cannot break the unpacking.
        url_prefix, marker, dataset_url = parsed_repo.path.partition("/dataset/")
        if not marker:
            # Not actually a dataset
            return None

        dataset_url_parts = dataset_url.split("/")
        self.dataset_id = dataset_url_parts[0]
        if not self.dataset_id:
            # "/dataset/" without an ID cannot identify a dataset
            return None

        # Drop params, query and fragment: anything left over would end up
        # in the middle of the API URL built below.
        api = parsed_repo._replace(
            path=f"{url_prefix}/api/3/action/", params="", query="", fragment=""
        ).geturl()

        activity_id = self._find_activity_id(dataset_url_parts, parsed_repo.query)

        if activity_id:
            fetch_url = f"{api}activity_data_show?" + urlencode(
                {"id": activity_id, "object_type": "package"}
            )
        else:
            fetch_url = f"{api}package_show?" + urlencode({"id": self.dataset_id})

        client = AsyncHTTPClient()
        try:
            r = await client.fetch(fetch_url, user_agent="BinderHub")
        except HTTPError:
            return None

        json_response = json.loads(r.body)
        dataset_version = self._version_from_timestamp(
            json_response["result"]["metadata_modified"]
        )

        self.record_id = f"{self.dataset_id}.v{dataset_version}"

        return self.record_id

    async def get_resolved_spec(self):
        """Return the dataset URL, resolving the ref first if not yet done."""
        if not hasattr(self, "record_id"):
            await self.get_resolved_ref()
        return self.repo

    def get_repo_url(self):
        """Return the (unquoted) dataset URL."""
        return self.repo

    async def get_resolved_ref_url(self):
        """Return the URL of the resolved ref, which is the resolved spec."""
        resolved_spec = await self.get_resolved_spec()
        return resolved_spec

    def get_build_slug(self):
        """Return the build slug, ``ckan-<dataset-id>``.

        ``self.dataset_id`` is only set by ``get_resolved_ref``, so that
        method must have been awaited before calling this one.
        """
        return f"ckan-{self.dataset_id}"


class GitRepoProvider(RepoProvider):
"""Bare bones git repo provider.
Expand Down
3 changes: 2 additions & 1 deletion binderhub/static/js/src/form.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ export function getBuildFormValues() {
providerPrefix === "zenodo" ||
providerPrefix === "figshare" ||
providerPrefix === "dataverse" ||
providerPrefix === "hydroshare"
providerPrefix === "hydroshare" ||
providerPrefix === "ckan"
) {
ref = "";
}
Expand Down
48 changes: 48 additions & 0 deletions binderhub/tests/test_repoproviders.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from tornado.ioloop import IOLoop

from binderhub.repoproviders import (
CKANProvider,
DataverseProvider,
FigshareProvider,
GistRepoProvider,
Expand Down Expand Up @@ -209,6 +210,53 @@ async def test_dataverse(
assert spec == resolved_spec


@pytest.mark.parametrize(
    "spec,resolved_spec,resolved_ref,resolved_ref_url,build_slug",
    [
        [
            "https://demo.ckan.org/dataset/sample-dataset-1",
            "https://demo.ckan.org/dataset/sample-dataset-1",
            "sample-dataset-1.v",
            "https://demo.ckan.org/dataset/sample-dataset-1",
            "ckan-sample-dataset-1",
        ],
        [
            "https://demo.datashades.com/dataset/chart-test?activity_id=061888e9-e3c2-4769-b097-9c195a841e2f",
            "https://demo.datashades.com/dataset/chart-test?activity_id=061888e9-e3c2-4769-b097-9c195a841e2f",
            "chart-test.v1717501747",
            "https://demo.datashades.com/dataset/chart-test?activity_id=061888e9-e3c2-4769-b097-9c195a841e2f",
            "ckan-chart-test",
        ],
        [
            "https://demo.datashades.com/dataset/chart-test/history/061888e9-e3c2-4769-b097-9c195a841e2f",
            "https://demo.datashades.com/dataset/chart-test/history/061888e9-e3c2-4769-b097-9c195a841e2f",
            "chart-test.v1717501747",
            "https://demo.datashades.com/dataset/chart-test/history/061888e9-e3c2-4769-b097-9c195a841e2f",
            "ckan-chart-test",
        ],
        ["https://demo.ckan.org/group/roger", None, None, None, None],
        ["https://demo.ckan.org/dataset/nosuchdataset", None, None, None, None],
    ],
)
async def test_ckan(spec, resolved_spec, resolved_ref, resolved_ref_url, build_slug):
    """Check CKANProvider resolution for dataset, activity and invalid URLs.

    For resolvable specs, ``resolved_ref`` only needs to be a substring of
    the returned ref (the unpinned dataset's timestamp is not fixed). For
    specs that must not resolve, the provider has to return None.
    """
    provider = CKANProvider(spec=spec)

    ref = await provider.get_resolved_ref()
    if not resolved_ref:
        # We are done here if we don't expect to resolve, but make sure the
        # provider really reported that instead of passing vacuously.
        assert ref is None
        return
    # Fail with a clear assertion rather than a TypeError from `in None`.
    assert ref is not None
    assert resolved_ref in ref

    slug = provider.get_build_slug()
    assert slug == build_slug
    repo_url = provider.get_repo_url()
    assert repo_url == spec
    ref_url = await provider.get_resolved_ref_url()
    assert ref_url == resolved_ref_url
    # Use a distinct name so the `spec` parameter is not shadowed.
    actual_resolved_spec = await provider.get_resolved_spec()
    assert actual_resolved_spec == resolved_spec


@pytest.mark.github_api
@pytest.mark.parametrize(
"repo,unresolved_ref,resolved_ref",
Expand Down
2 changes: 2 additions & 0 deletions docs/source/developer/repoproviders.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ Currently supported providers, their prefixes and specs are:
+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
| Dataverse | ``dataverse`` | ``<dataverse-DOI>`` | `Dataverse <https://dataverse.org/>`_ is open source research data repository software installed all over the world. |
+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
| CKAN | ``ckan`` | ``<url-escaped-url>/<dataset-id>`` | `CKAN <https://ckan.org/>`_ is an open source data management system. |
+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
| Git | ``git`` | ``<url-escaped-url>/<commit-sha>`` | A generic repository provider for URLs that point directly to a git repository. |
+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+

Expand Down
5 changes: 5 additions & 0 deletions docs/source/reference/repoproviders.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ Module: :mod:`binderhub.repoproviders`
.. autoconfigurable:: DataverseProvider
:members:

:class:`CKANProvider`
---------------------------

.. autoconfigurable:: CKANProvider
:members:

:class:`GitRepoProvider`
---------------------------
Expand Down

0 comments on commit 497c2ac

Please sign in to comment.