Skip to content

Commit

Permalink
update ncml2stac notebook with simpler definition (fewer overrides) + …
Browse files Browse the repository at this point in the history
…fix linting
  • Loading branch information
fmigneault committed Jan 9, 2024
1 parent 2849e7f commit 3a0c531
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 49 deletions.
94 changes: 46 additions & 48 deletions notebooks/ncml2stac.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 1,
"outputs": [],
"source": [
"# NOTE:\n",
"# If using indented code block here (eg: 'if TYPE_CHECKING:'),\n",
"# If using code that is not preserved at runtime (eg: 'if TYPE_CHECKING:'),\n",
"# it is important that the block also contains statements other than 'ipython2cwl' imports.\n",
"# When ported into the generated python script, imports from 'ipython2cwl' are removed,\n",
"# which can cause syntax/indent errors.\n",
Expand Down Expand Up @@ -71,8 +71,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-09T20:52:53.340694868Z",
"start_time": "2024-01-09T20:52:53.336885229Z"
"end_time": "2024-01-09T21:23:06.458322329Z",
"start_time": "2024-01-09T21:23:06.322097107Z"
}
},
"id": "61f43c81dc3aa6c2"
Expand Down Expand Up @@ -128,10 +128,17 @@
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [],
"execution_count": 2,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2024-01-09 21:23:12.406510 [INFO] :: PYESSV :: Loading vocabularies from /home/francis/.esdoc/pyessv-archive ... please wait\n"
]
}
],
"source": [
"import hashlib\n",
"import json\n",
"import os\n",
"import tempfile\n",
Expand All @@ -142,7 +149,6 @@
"import numpy as np\n",
"import pystac\n",
"import requests\n",
"import siphon.catalog\n",
"from pydantic.networks import Url\n",
"\n",
"from STACpopulator.input import THREDDSLoader\n",
Expand All @@ -151,15 +157,15 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-09T20:53:47.648766094Z",
"start_time": "2024-01-09T20:53:47.274668578Z"
"end_time": "2024-01-09T21:23:13.576072319Z",
"start_time": "2024-01-09T21:23:11.298045355Z"
}
},
"id": "f68ea4339c5e4a9d"
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"outputs": [
{
"name": "stdout",
Expand Down Expand Up @@ -372,15 +378,15 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-09T20:53:00.747428182Z",
"start_time": "2024-01-09T20:53:00.626261513Z"
"end_time": "2024-01-09T21:23:20.489912892Z",
"start_time": "2024-01-09T21:23:19.994789137Z"
}
},
"id": "4fc2f66493dc56c5"
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 8,
"outputs": [
{
"name": "stderr",
Expand All @@ -406,58 +412,52 @@
" # https://svc.com/thredds/ncml/some/nested/netcdf.nc\n",
" # thredds_catalog_url should be:\n",
" # https://svc.com/thredds/catalog/some/nested/catalog.xml\n",
" def __init__(self, stac_host, thredds_catalog_url, target_item, update=False):\n",
" def __init__( # pylint: disable=W0231 # super init not called on purpose to avoid loading missing config\n",
" self,\n",
" stac_host,\n",
" thredds_catalog_url,\n",
" update=False,\n",
" ):\n",
" # FIXME: just reimplement what is needed (no config needed, we don't care about STAC Collections...)\n",
" self.target_item = target_item\n",
" self._stac_host = stac_host\n",
" self._ingest_pipeline = THREDDSLoader(thredds_catalog_url, depth=1)\n",
"\n",
" # FIXME: just reimplement it as needed\n",
" def __iter__(self) -> \"Iterator[Tuple[str, MutableMapping[str, Any]]]\":\n",
" \"\"\"Return a generator walking a THREDDS data catalog for datasets.\"\"\"\n",
" if self.catalog_head.datasets.items():\n",
" for item_name, ds in self.catalog_head.datasets.items():\n",
" # FIXME: filter for our item, ignore irrelevant entries (hopefully there's not too many...)\n",
" if item_name != self.target_item:\n",
" continue\n",
" attrs = self.extract_metadata(ds)\n",
" yield item_name, attrs\n",
" self._ingest_pipeline = THREDDSLoader(thredds_catalog_url, depth=0)\n",
" self.update = update\n",
"\n",
" if self._depth > 0:\n",
" for name, ref in self.catalog_head.catalog_refs.items():\n",
" self.catalog_head = ref.follow()\n",
" self._depth -= 1\n",
" yield from self\n",
" # FIXME: perform what ingest() does, but only for a single item and without the STAC API POST request\n",
" def ncml2stac(self, target_item: str):\n",
" ds = self._ingest_pipeline[target_item]\n",
" ncml_data = self._ingest_pipeline.extract_metadata(ds)\n",
" stac_item = self.create_stac_item(target_item, ncml_data)\n",
" return stac_item\n",
"\n",
"\n",
"input_ncml_href_parsed = urlparse(input_ncml_href)\n",
"input_ncml_href_params = parse_qs(input_ncml_href_parsed.query)\n",
"if \"catalog\" in input_ncml_href_params:\n",
" input_ncml_target_href = input_ncml_href.split(\"?\")[0]\n",
" input_ncml_target_href = input_ncml_href.split(\"?\", 1)[0]\n",
" input_ncml_catalog_href = unquote(input_ncml_href_params[\"catalog\"][0])\n",
" input_ncml_catalog_href = os.path.splitext(input_ncml_catalog_href)[0] + \".xml\" # in case it was HTML\n",
"else:\n",
" input_ncml_target_href = input_ncml_href\n",
" input_ncml_catalog_href = input_ncml_href.split(\"?\", 1)[0] # just in case there's extra query params\n",
" input_ncml_catalog_href = input_ncml_catalog_href.replace(\"/ncml\", \"/catalog/\")\n",
" input_ncml_catalog_href = os.path.join(os.path.dirname(input_ncml_catalog_href), \"catalog.xml\")\n",
"input_ncml_catalog_xml = requests.get(input_ncml_catalog_href, headers={\"Accept\": \"text/xml, application/xml\"}, timeout=5).text\n",
"input_ncml_headers = {\"Accept\": \"text/xml, application/xml\"}\n",
"input_ncml_catalog_xml = requests.get(input_ncml_catalog_href, headers=input_ncml_headers, timeout=5).text\n",
"input_ncml_target_name = os.path.split(input_ncml_target_href)[-1]\n",
"## #catalog_xml = siphon.catalog.ET.fromstring(input_ncml_catalog_xml)\n",
"\n",
"# technically invalid STAC host, but just need something for URL schema validation\n",
"stac_host = f\"{input_ncml_href_parsed.scheme}://{input_ncml_href_parsed.netloc}\"\n",
"cmip6_pop = CMIP6SingleFilePopulator(stac_host, input_ncml_catalog_href, target_item=input_ncml_target_name)\n",
"stac_host_url = f\"{input_ncml_href_parsed.scheme}://{input_ncml_href_parsed.netloc}\"\n",
"cmip6_pop = CMIP6SingleFilePopulator(stac_host_url, input_ncml_catalog_href)\n",
"\n",
"# FIXME: do what ingest() would do, triggering the full processing chains via iter, but without the POST step to STAC API\n",
"ncml_name, ncml_data = next(iter(cmip6_pop._ingest_pipeline))\n",
"stac_item_data = cmip6_pop.create_stac_item(ncml_name, ncml_data)"
"# generate the STAC Item definition corresponding to the NCML content\n",
"stac_item_data = cmip6_pop.ncml2stac(input_ncml_target_name)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-09T21:00:20.939457054Z",
"start_time": "2024-01-09T21:00:20.611135424Z"
"end_time": "2024-01-09T21:43:16.613980591Z",
"start_time": "2024-01-09T21:43:16.278982130Z"
}
},
"id": "299946ccd58e2efc"
Expand All @@ -474,7 +474,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 9,
"outputs": [
{
"name": "stdout",
Expand Down Expand Up @@ -733,8 +733,6 @@
"AnyDateTime = Union[datetime, date]\n",
"AnyJsonEncodable = Union[pystac.Item, np.ndarray, np.number, Url, Enum, AnyDateTime, \"JsonLike\"]\n",
"\n",
"##stac_item_data = stac_item.item.to_dict()\n",
"\n",
"def json_encode(obj: \"AnyJsonEncodable\") -> Union[\"JsonLike\", str]:\n",
" if isinstance(obj, (np.ndarray, np.number)):\n",
" return obj.tolist()\n",
Expand All @@ -750,8 +748,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-09T21:00:30.701468612Z",
"start_time": "2024-01-09T21:00:30.695291176Z"
"end_time": "2024-01-09T21:43:20.412846636Z",
"start_time": "2024-01-09T21:43:20.405259220Z"
}
},
"id": "4eeb52c23edccb31"
Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ pytest-notebook
pytest-rerunfailures
pycodestyle
pydocstyle
pylint>=2.15.4; python_version >= "3.7"
pylint>=2.15.4,<3
pylint-per-file-ignores; python_version >= "3.7"
pylint_quotes
safety
Expand Down

0 comments on commit 3a0c531

Please sign in to comment.