diff --git a/building_a_catalog_simplified.ipynb b/building_a_catalog_simplified.ipynb new file mode 100644 index 0000000..134332a --- /dev/null +++ b/building_a_catalog_simplified.ipynb @@ -0,0 +1,8011 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c8329b54-462c-4a3a-a963-40422bef8902", + "metadata": {}, + "source": [ + "## Getting to grips with the ACCESS-NRI Intake Catalog\n", + "\n", + "There are three potential use cases for the ACCESS-NRI Intake Catalog:\n", + "1. You want to access some data someone else has *generated and catalogued*.\n", + "2. You want to *catalogue* some data you've generated.\n", + "3. You want to *access* some data you've catalogued.\n", + "\n", + "\n", + "Cases 2 & 3 can seem daunting - how do you build or access a custom catalog? This notebook should get you started, and will show you how to get to an ESM Datastore.\n", + "\n", + "___\n", + "#### This notebook contains a basic example of how to catalog an experiment & access the data in it using a simple wrapper function, and should get you up to speed with cataloging your experiments.\n", + "___" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "dc8f78cb-c560-4a03-a699-6cd7dc383196", + "metadata": {}, + "outputs": [], + "source": [ + "### Imports, make sure we are in our home directory (don't change this until you're sure you know why you want to)\n", + "!cd \n", + "from pathlib import Path\n", + "import intake \n", + "from access_nri_intake.source.builders import AccessOm2Builder, AccessOm3Builder, AccessEsm15Builder, AccessCm2Builder\n" + ] + }, + { + "cell_type": "markdown", + "id": "bdd4b7ef-4206-4d13-8c4a-fe7cccfaaf5c", + "metadata": {}, + "source": [ + "## Case 1: Accessing an experiment in the standard catalog" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "28697aa8-4635-4b43-aad6-ac198751c107", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

01deg_jra55v13_ryf9091 catalog with 34 dataset(s) from 11947 asset(s):

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique
path11947
realm2
variable178
frequency5
start_date3361
end_date3360
variable_long_name181
variable_standard_name36
variable_cell_methods3
variable_units50
filename3469
file_id33
derived_variable0
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# We've picked a demo experiment name here. You can find all available experiments in the standard catalog by calling `intake.cat.access_nri`.\n", + "experiment_name='01deg_jra55v13_ryf9091'\n", + "\n", + "cat = intake.cat.access_nri\n", + "esm_ds = cat[experiment_name]\n", + "# Note: We call this esm_ds as this object is an `esm_datastore`: see https://intake-esm.readthedocs.io/en/v2021.8.17/user-guide/overview.html#loading-a-catalog for more details\n", + "esm_ds" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4fbe5438-dd9f-4697-937f-668b2647ae99", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 3GB\n",
+       "Dimensions:       (time: 1, d2: 2, nj: 2700, ni: 3600, nc: 5)\n",
+       "Coordinates:\n",
+       "  * time          (time) object 8B 1900-02-01 00:00:00\n",
+       "    TLON          (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    TLAT          (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    ULON          (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    ULAT          (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    NCAT          (nc) float32 20B dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "Dimensions without coordinates: d2, nj, ni, nc\n",
+       "Data variables: (12/49)\n",
+       "    time_bounds   (time, d2) object 16B dask.array<chunksize=(1, 2), meta=np.ndarray>\n",
+       "    tmask         (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    blkmask       (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    tarea         (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    uarea         (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    dxt           (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    ...            ...\n",
+       "    fmeltt_ai_m   (time, nj, ni) float32 39MB dask.array<chunksize=(1, 675, 900), meta=np.ndarray>\n",
+       "    opening_m     (time, nj, ni) float32 39MB dask.array<chunksize=(1, 675, 900), meta=np.ndarray>\n",
+       "    aicen_m       (time, nc, nj, ni) float32 194MB dask.array<chunksize=(1, 1, 675, 900), meta=np.ndarray>\n",
+       "    vicen_m       (time, nc, nj, ni) float32 194MB dask.array<chunksize=(1, 1, 675, 900), meta=np.ndarray>\n",
+       "    fmelttn_ai_m  (time, nc, nj, ni) float32 194MB dask.array<chunksize=(1, 1, 675, 900), meta=np.ndarray>\n",
+       "    flatn_ai_m    (time, nc, nj, ni) float32 194MB dask.array<chunksize=(1, 1, 675, 900), meta=np.ndarray>\n",
+       "Attributes: (12/24)\n",
+       "    title:                                    sea ice model output for CICE\n",
+       "    contents:                                 Diagnostic and Prognostic Varia...\n",
+       "    source:                                   Los Alamos Sea Ice Model (CICE)...\n",
+       "    comment:                                  This Year Has 365 days\n",
+       "    comment2:                                 File written on model date 1900...\n",
+       "    comment3:                                 seconds elapsed into model date...\n",
+       "    ...                                       ...\n",
+       "    intake_esm_attrs:variable_cell_methods:   ,,,,,,,,,,,,,,,,,,time: mean,ti...\n",
+       "    intake_esm_attrs:variable_units:          days since 1900-01-01 00:00:00,...\n",
+       "    intake_esm_attrs:filename:                iceh.1900-01.nc\n",
+       "    intake_esm_attrs:file_id:                 iceh_XXXX_XX\n",
+       "    intake_esm_attrs:_data_format_:           netcdf\n",
+       "    intake_esm_dataset_key:                   iceh_XXXX_XX.1mon
" + ], + "text/plain": [ + " Size: 3GB\n", + "Dimensions: (time: 1, d2: 2, nj: 2700, ni: 3600, nc: 5)\n", + "Coordinates:\n", + " * time (time) object 8B 1900-02-01 00:00:00\n", + " TLON (nj, ni) float32 39MB dask.array\n", + " TLAT (nj, ni) float32 39MB dask.array\n", + " ULON (nj, ni) float32 39MB dask.array\n", + " ULAT (nj, ni) float32 39MB dask.array\n", + " NCAT (nc) float32 20B dask.array\n", + "Dimensions without coordinates: d2, nj, ni, nc\n", + "Data variables: (12/49)\n", + " time_bounds (time, d2) object 16B dask.array\n", + " tmask (nj, ni) float32 39MB dask.array\n", + " blkmask (nj, ni) float32 39MB dask.array\n", + " tarea (nj, ni) float32 39MB dask.array\n", + " uarea (nj, ni) float32 39MB dask.array\n", + " dxt (nj, ni) float32 39MB dask.array\n", + " ... ...\n", + " fmeltt_ai_m (time, nj, ni) float32 39MB dask.array\n", + " opening_m (time, nj, ni) float32 39MB dask.array\n", + " aicen_m (time, nc, nj, ni) float32 194MB dask.array\n", + " vicen_m (time, nc, nj, ni) float32 194MB dask.array\n", + " fmelttn_ai_m (time, nc, nj, ni) float32 194MB dask.array\n", + " flatn_ai_m (time, nc, nj, ni) float32 194MB dask.array\n", + "Attributes: (12/24)\n", + " title: sea ice model output for CICE\n", + " contents: Diagnostic and Prognostic Varia...\n", + " source: Los Alamos Sea Ice Model (CICE)...\n", + " comment: This Year Has 365 days\n", + " comment2: File written on model date 1900...\n", + " comment3: seconds elapsed into model date...\n", + " ... 
...\n", + " intake_esm_attrs:variable_cell_methods: ,,,,,,,,,,,,,,,,,,time: mean,ti...\n", + " intake_esm_attrs:variable_units: days since 1900-01-01 00:00:00,...\n", + " intake_esm_attrs:filename: iceh.1900-01.nc\n", + " intake_esm_attrs:file_id: iceh_XXXX_XX\n", + " intake_esm_attrs:_data_format_: netcdf\n", + " intake_esm_dataset_key: iceh_XXXX_XX.1mon" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Now, if we wanted to subset and load some data, we could with, for example...\n", + "first_result = esm_ds.df.head(1).path\n", + "esm_ds.search(path=first_result).to_dask()" + ] + }, + { + "cell_type": "markdown", + "id": "3b388be4-2535-4e60-8397-159c80fe5811", + "metadata": {}, + "source": [ + "## Case 2: Generating a custom catalog for your experiment\n", + "\n", + "We imported the builders above: pick the one that matches the data you want to catalog. If you think you need a new builder, open an issue at https://github.com/ACCESS-NRI/access-nri-intake-catalog\n", + "\n", + "Pick a builder from the following to set as `\"builder\"` below:\n", + "```python\n", + "from access_nri_intake.source.builders import AccessOm2Builder, AccessOm3Builder, AccessEsm15Builder, AccessCm2Builder\n", + "```\n", + "eg:\n", + "```python\n", + "catalog_config = {\n", + " ...\n", + " \"builder\" : AccessEsm15Builder, \n", + " ...\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "1a90873e-16e5-49e2-b1d4-edb05f510db8", + "metadata": {}, + "outputs": [], + "source": [ + "# Set Config in this cell\n", + "catalog_config = {\n", + " \"experiment_output_path\" : \"/g/data/ik11/outputs/access-om2/1deg_iamip2_CMCC-ESM2ssp126\", # Point this to your model output\n", + " \"builder\" : AccessOm2Builder, # This must be one of the builders imported above. 
If you need a new builder, get in touch!\n", + " \"catalog_directory\" : \"./catalog_demo\",\n", + " \"catalog_name\" : \"demo_datastore\",\n", + " \"description\" : \"An example datastore for ACCESS-OM2 1deg_iamip2_CMCC-ESM2ssp126\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b958a0de-940d-4289-aa9e-5c4ba317b3ce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No catalog found at specified output location: catalog_demo/demo_datastore.json. Building catalog...\n", + "This could take a few minutes...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. 
Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. Guessing start and end times.\n", + " warnings.warn(\n", + "/home/189/ct1163/access-nri-intake-catalog/src/access_nri_intake/source/utils.py:111: UserWarning: Time coordinate does not include bounds information. 
def create_catalog(catalog_config):
    """
    Build an intake-esm datastore for an experiment and save it to disk.

    Nothing is rebuilt if `<catalog_directory>/<catalog_name>.json` already
    exists; a message is printed instead.

    Parameters
    ----------
    catalog_config : dict
        Options for the build. Keys read here:

        - "experiment_output_path": passed to the builder as `path`.
        - "builder": the builder class to instantiate. Optional; when the key
          is missing or None, `AccessOm2Builder` is used.
        - "catalog_directory": directory the datastore is saved into (created,
          including parents, if it does not exist).
        - "catalog_name": name of the datastore; the file checked for is
          `<catalog_name>.json`.
        - "description": passed to `save` as the datastore description.

    Returns
    -------
    None
    """
    catalog_fname = Path(catalog_config["catalog_directory"]) / f"{catalog_config['catalog_name']}.json"

    # Make sure the output directory exists before the builder tries to write to it.
    catalog_fname.parent.mkdir(parents=True, exist_ok=True)

    if catalog_fname.exists():
        print(f"Catalog already found at specified output location: {catalog_fname}. Did you mean to open it?")
        return

    print(f"No catalog found at specified output location: {catalog_fname}. Building catalog...")
    print("This could take a few minutes...")

    # Honour the builder selected in the config. Falling back to AccessOm2Builder
    # keeps configs without a "builder" entry working as before.
    builder_cls = catalog_config.get("builder")
    if builder_cls is None:
        builder_cls = AccessOm2Builder

    builder = builder_cls(path=catalog_config["experiment_output_path"]).build()

    builder.save(
        name=catalog_config["catalog_name"],
        directory=catalog_config["catalog_directory"],
        description=catalog_config["description"],
    )
    print("Catalog built successfully!")
def open_catalog(catalog_config):
    """
    Open the datastore saved at `<catalog_directory>/<catalog_name>.json`.

    Parameters
    ----------
    catalog_config : dict
        Configuration dictionary; only the "catalog_directory" and
        "catalog_name" keys are read here.

    Returns
    -------
    The object returned by `intake.open_esm_datastore` for the catalog file.

    Raises
    ------
    FileNotFoundError
        If no catalog json file exists at the expected location.
    """
    catalog_fname = Path(catalog_config["catalog_directory"]) / f"{catalog_config['catalog_name']}.json"

    # Fail with an explicit, actionable error: previously this path printed a hint
    # and then hit `return cat` with `cat` never assigned.
    if not catalog_fname.exists():
        raise FileNotFoundError(
            f"No catalog found at specified output location: {catalog_fname}. Did you mean to build a catalog?"
        )

    print(f"Catalog found at specified output location: {catalog_fname}. Opening catalog...")

    return intake.open_esm_datastore(
        catalog_fname,
        # We need to tell the datastore that the variable column contains
        # iterables or it won't work correctly.
        columns_with_iterables=["variable"],
    )
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 2GB\n",
+       "Dimensions:      (time: 365, d2: 2, nj: 300, ni: 360, nc: 5)\n",
+       "Coordinates:\n",
+       "  * time         (time) datetime64[ns] 3kB 2015-01-02 2015-01-03 ... 2016-01-01\n",
+       "    TLON         (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    TLAT         (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    ULON         (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    ULAT         (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "Dimensions without coordinates: d2, nj, ni, nc\n",
+       "Data variables: (12/24)\n",
+       "    time_bounds  (time, d2) datetime64[ns] 6kB dask.array<chunksize=(1, 2), meta=np.ndarray>\n",
+       "    NCAT         (nc) float32 20B dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "    tmask        (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    blkmask      (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    tarea        (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    dxt          (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    ...           ...\n",
+       "    algal_N      (time, nj, ni) float32 158MB dask.array<chunksize=(1, 150, 180), meta=np.ndarray>\n",
+       "    skl_Nit      (time, nj, ni) float32 158MB dask.array<chunksize=(1, 150, 180), meta=np.ndarray>\n",
+       "    ml_Nit       (time, nj, ni) float32 158MB dask.array<chunksize=(1, 150, 180), meta=np.ndarray>\n",
+       "    fNO_ai       (time, nj, ni) float32 158MB dask.array<chunksize=(1, 150, 180), meta=np.ndarray>\n",
+       "    fN_ai        (time, nj, ni) float32 158MB dask.array<chunksize=(1, 150, 180), meta=np.ndarray>\n",
+       "    PP_net       (time, nj, ni) float32 158MB dask.array<chunksize=(1, 150, 180), meta=np.ndarray>\n",
+       "Attributes: (12/27)\n",
+       "    title:                                    sea ice model output for CICE\n",
+       "    contents:                                 Diagnostic and Prognostic Varia...\n",
+       "    source:                                   Los Alamos Sea Ice Model (CICE)...\n",
+       "    time_period_freq:                         day_1\n",
+       "    comment:                                  This year has 365 days\n",
+       "    comment2:                                 File written on model date 2015...\n",
+       "    ...                                       ...\n",
+       "    intake_esm_attrs:variable_standard_name:  ['', '', '', '', '', '', '', ''...\n",
+       "    intake_esm_attrs:variable_cell_methods:   ['', '', '', '', '', '', '', ''...\n",
+       "    intake_esm_attrs:variable_units:          ['days since 2015-01-01 00:00:0...\n",
+       "    intake_esm_attrs:realm:                   seaIce\n",
+       "    intake_esm_attrs:_data_format_:           netcdf\n",
+       "    intake_esm_dataset_key:                   iceh_XXX_daily.1day
" + ], + "text/plain": [ + " Size: 2GB\n", + "Dimensions: (time: 365, d2: 2, nj: 300, ni: 360, nc: 5)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 3kB 2015-01-02 2015-01-03 ... 2016-01-01\n", + " TLON (nj, ni) float32 432kB dask.array\n", + " TLAT (nj, ni) float32 432kB dask.array\n", + " ULON (nj, ni) float32 432kB dask.array\n", + " ULAT (nj, ni) float32 432kB dask.array\n", + "Dimensions without coordinates: d2, nj, ni, nc\n", + "Data variables: (12/24)\n", + " time_bounds (time, d2) datetime64[ns] 6kB dask.array\n", + " NCAT (nc) float32 20B dask.array\n", + " tmask (nj, ni) float32 432kB dask.array\n", + " blkmask (nj, ni) float32 432kB dask.array\n", + " tarea (nj, ni) float32 432kB dask.array\n", + " dxt (nj, ni) float32 432kB dask.array\n", + " ... ...\n", + " algal_N (time, nj, ni) float32 158MB dask.array\n", + " skl_Nit (time, nj, ni) float32 158MB dask.array\n", + " ml_Nit (time, nj, ni) float32 158MB dask.array\n", + " fNO_ai (time, nj, ni) float32 158MB dask.array\n", + " fN_ai (time, nj, ni) float32 158MB dask.array\n", + " PP_net (time, nj, ni) float32 158MB dask.array\n", + "Attributes: (12/27)\n", + " title: sea ice model output for CICE\n", + " contents: Diagnostic and Prognostic Varia...\n", + " source: Los Alamos Sea Ice Model (CICE)...\n", + " time_period_freq: day_1\n", + " comment: This year has 365 days\n", + " comment2: File written on model date 2015...\n", + " ... 
...\n", + " intake_esm_attrs:variable_standard_name: ['', '', '', '', '', '', '', ''...\n", + " intake_esm_attrs:variable_cell_methods: ['', '', '', '', '', '', '', ''...\n", + " intake_esm_attrs:variable_units: ['days since 2015-01-01 00:00:0...\n", + " intake_esm_attrs:realm: seaIce\n", + " intake_esm_attrs:_data_format_: netcdf\n", + " intake_esm_dataset_key: iceh_XXX_daily.1day" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Now, if we wanted to subset and load some data, we could with, for example...\n", + "first_result = custom_esm_ds.df.head(1).path\n", + "custom_esm_ds.search(path=first_result).to_dask()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:analysis3-unstable]", + "language": "python", + "name": "conda-env-analysis3-unstable-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/catalog_user_recipe.ipynb b/catalog_user_recipe.ipynb new file mode 100644 index 0000000..b9283f3 --- /dev/null +++ b/catalog_user_recipe.ipynb @@ -0,0 +1,10614 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c8329b54-462c-4a3a-a963-40422bef8902", + "metadata": {}, + "source": [ + "## Using an Intake Catalog to analyse data\n", + "\n", + "There are two ways you might want to access & use an Intake Catalog:\n", + "1. You want to open a *datastore* for an experiment - and you know where it is.\n", + "2. 
You want to *search for and access a catalog* - from the ACCESS-NRI Intake Catalog.\n", + "\n", + "#### These two workflows will both lead you to the same data format: an xarray dataset.\n", + "\n", + "This tutorial demonstrates how to access the data you're after in either of these scenarios.\n", + "___\n", + "*This notebook was originally run on a large ARE instance.*\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dc8f78cb-c560-4a03-a699-6cd7dc383196", + "metadata": {}, + "outputs": [], + "source": [ + "# For now, we only need to import intake.\n", + "import intake" + ] + }, + { + "cell_type": "markdown", + "id": "bdd4b7ef-4206-4d13-8c4a-fe7cccfaaf5c", + "metadata": {}, + "source": [ + "## Case 1: Opening a datastore when you know where the experiment data is." + ] + }, + { + "cell_type": "markdown", + "id": "7461ec38-9cc8-4523-8339-43c72cb30057", + "metadata": {}, + "source": [ + "In this case, we've run an ACCESS model (say ACCESS-OM3) and we've already generated an intake-esm datastore for the model run. (The user script configured in payu generates an intake-esm datastore automatically at the end of the model run for ACCESS-OM3, typically named `intake_datastore.json` and stored in the archive folder of the model run.)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "28697aa8-4635-4b43-aad6-ac198751c107", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

demo_datastore catalog with 48 dataset(s) from 4128 asset(s):

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique
filename2853
file_id48
path4128
filename_timestamp430
frequency4
start_date174
end_date173
variable116
variable_long_name112
variable_standard_name8
variable_cell_methods6
variable_units104
realm2
derived_variable0
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# We've picked a demo experiment name here. You can find all available experiments in the standard catalog by calling `intake.cat.access_nri`.\n", + "demo_datastore_path = \"/home/189/ct1163/catalog_demo/intake_datastore.json\"\n", + "\n", + "esm_ds = intake.open_esm_datastore(demo_datastore_path,\n", + " columns_with_iterables=[\n", + " 'variable', # We need to tell the datastore that the variable_column contains iterables or it won't work correctly. \n", + " 'variable_long_name', # In fact, all the variable_* columns contain iterables - but we're just going to search for names\n", + " 'variable_standard_name',\n", + " ]\n", + ")\n", + "\n", + "# Note: We call this esm_ds as this object is an `esm_datastore`: see https://intake-esm.readthedocs.io/en/v2021.8.17/user-guide/overview.html#loading-a-catalog for more details\n", + "esm_ds" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4fbe5438-dd9f-4697-937f-668b2647ae99", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 2GB\n",
+       "Dimensions:      (time: 365, d2: 2, nj: 300, ni: 360, nc: 5)\n",
+       "Coordinates:\n",
+       "  * time         (time) datetime64[ns] 3kB 2015-01-02 2015-01-03 ... 2016-01-01\n",
+       "    TLON         (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    TLAT         (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    ULON         (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    ULAT         (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "Dimensions without coordinates: d2, nj, ni, nc\n",
+       "Data variables: (12/24)\n",
+       "    time_bounds  (time, d2) datetime64[ns] 6kB dask.array<chunksize=(1, 2), meta=np.ndarray>\n",
+       "    NCAT         (nc) float32 20B dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "    tmask        (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    blkmask      (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    tarea        (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    dxt          (nj, ni) float32 432kB dask.array<chunksize=(150, 180), meta=np.ndarray>\n",
+       "    ...           ...\n",
+       "    algal_N      (time, nj, ni) float32 158MB dask.array<chunksize=(1, 150, 180), meta=np.ndarray>\n",
+       "    skl_Nit      (time, nj, ni) float32 158MB dask.array<chunksize=(1, 150, 180), meta=np.ndarray>\n",
+       "    ml_Nit       (time, nj, ni) float32 158MB dask.array<chunksize=(1, 150, 180), meta=np.ndarray>\n",
+       "    fNO_ai       (time, nj, ni) float32 158MB dask.array<chunksize=(1, 150, 180), meta=np.ndarray>\n",
+       "    fN_ai        (time, nj, ni) float32 158MB dask.array<chunksize=(1, 150, 180), meta=np.ndarray>\n",
+       "    PP_net       (time, nj, ni) float32 158MB dask.array<chunksize=(1, 150, 180), meta=np.ndarray>\n",
+       "Attributes: (12/27)\n",
+       "    title:                                    sea ice model output for CICE\n",
+       "    contents:                                 Diagnostic and Prognostic Varia...\n",
+       "    source:                                   Los Alamos Sea Ice Model (CICE)...\n",
+       "    time_period_freq:                         day_1\n",
+       "    comment:                                  This year has 365 days\n",
+       "    comment2:                                 File written on model date 2015...\n",
+       "    ...                                       ...\n",
+       "    intake_esm_attrs:variable_standard_name:  ,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n",
+       "    intake_esm_attrs:variable_cell_methods:   ['', '', '', '', '', '', '', ''...\n",
+       "    intake_esm_attrs:variable_units:          ['days since 2015-01-01 00:00:0...\n",
+       "    intake_esm_attrs:realm:                   seaIce\n",
+       "    intake_esm_attrs:_data_format_:           netcdf\n",
+       "    intake_esm_dataset_key:                   iceh_XXX_daily.1day
" + ], + "text/plain": [ + " Size: 2GB\n", + "Dimensions: (time: 365, d2: 2, nj: 300, ni: 360, nc: 5)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 3kB 2015-01-02 2015-01-03 ... 2016-01-01\n", + " TLON (nj, ni) float32 432kB dask.array\n", + " TLAT (nj, ni) float32 432kB dask.array\n", + " ULON (nj, ni) float32 432kB dask.array\n", + " ULAT (nj, ni) float32 432kB dask.array\n", + "Dimensions without coordinates: d2, nj, ni, nc\n", + "Data variables: (12/24)\n", + " time_bounds (time, d2) datetime64[ns] 6kB dask.array\n", + " NCAT (nc) float32 20B dask.array\n", + " tmask (nj, ni) float32 432kB dask.array\n", + " blkmask (nj, ni) float32 432kB dask.array\n", + " tarea (nj, ni) float32 432kB dask.array\n", + " dxt (nj, ni) float32 432kB dask.array\n", + " ... ...\n", + " algal_N (time, nj, ni) float32 158MB dask.array\n", + " skl_Nit (time, nj, ni) float32 158MB dask.array\n", + " ml_Nit (time, nj, ni) float32 158MB dask.array\n", + " fNO_ai (time, nj, ni) float32 158MB dask.array\n", + " fN_ai (time, nj, ni) float32 158MB dask.array\n", + " PP_net (time, nj, ni) float32 158MB dask.array\n", + "Attributes: (12/27)\n", + " title: sea ice model output for CICE\n", + " contents: Diagnostic and Prognostic Varia...\n", + " source: Los Alamos Sea Ice Model (CICE)...\n", + " time_period_freq: day_1\n", + " comment: This year has 365 days\n", + " comment2: File written on model date 2015...\n", + " ... 
...\n", + " intake_esm_attrs:variable_standard_name: ,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n", + " intake_esm_attrs:variable_cell_methods: ['', '', '', '', '', '', '', ''...\n", + " intake_esm_attrs:variable_units: ['days since 2015-01-01 00:00:0...\n", + " intake_esm_attrs:realm: seaIce\n", + " intake_esm_attrs:_data_format_: netcdf\n", + " intake_esm_dataset_key: iceh_XXX_daily.1day" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Now, if we wanted to subset and load some data, we could with, for example...\n", + "first_result = esm_ds.df.head(1).path\n", + "esm_ds.search(path=first_result).to_dask()" + ] + }, + { + "cell_type": "markdown", + "id": "3b388be4-2535-4e60-8397-159c80fe5811", + "metadata": {}, + "source": [ + "## Case 2: Search for and access a datastore - from the ACCESS-NRI Intake Catalog." + ] + }, + { + "cell_type": "markdown", + "id": "9dace72c-318a-4b72-b33e-ce7550ee335f", + "metadata": {}, + "source": [ + "We might want to compare to some existing model results from access-om2. This data can be found in the access-nri intake catalog." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1a90873e-16e5-49e2-b1d4-edb05f510db8", + "metadata": {}, + "outputs": [], + "source": [ + "# We imported intake above. To get the ACCESS-NRI Intake catalog, we use the following line:\n", + "access_nri_catalog = intake.cat.access_nri" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b958a0de-940d-4289-aa9e-5c4ba317b3ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

01deg_jra55v13_ryf9091 catalog with 34 dataset(s) from 11947 asset(s):

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique
path11947
realm2
variable178
frequency5
start_date3361
end_date3360
variable_long_name181
variable_standard_name36
variable_cell_methods3
variable_units50
filename3469
file_id33
derived_variable0
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# We've picked a demo experiment name here. You can find all available experiments in the standard catalog by calling `intake.cat.access_nri`.\n", + "experiment_name='01deg_jra55v13_ryf9091'\n", + "\n", + "cat = intake.cat.access_nri\n", + "esm_ds = cat[experiment_name]\n", + "\n", + "# Note: Again, we call this esm_ds as this object is an `esm_datastore`: see https://intake-esm.readthedocs.io/en/v2021.8.17/user-guide/overview.html#loading-a-catalog for more details\n", + "\n", + "# Once we have our esm datastore, we can interact with it the same way we did above.\n", + "\n", + "esm_ds" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "44713a18-40a5-4f57-a50b-cc6c5dd70fd6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 3GB\n",
+       "Dimensions:       (time: 1, d2: 2, nj: 2700, ni: 3600, nc: 5)\n",
+       "Coordinates:\n",
+       "  * time          (time) object 8B 1900-02-01 00:00:00\n",
+       "    TLON          (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    TLAT          (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    ULON          (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    ULAT          (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    NCAT          (nc) float32 20B dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "Dimensions without coordinates: d2, nj, ni, nc\n",
+       "Data variables: (12/49)\n",
+       "    time_bounds   (time, d2) object 16B dask.array<chunksize=(1, 2), meta=np.ndarray>\n",
+       "    tmask         (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    blkmask       (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    tarea         (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    uarea         (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    dxt           (nj, ni) float32 39MB dask.array<chunksize=(675, 900), meta=np.ndarray>\n",
+       "    ...            ...\n",
+       "    fmeltt_ai_m   (time, nj, ni) float32 39MB dask.array<chunksize=(1, 675, 900), meta=np.ndarray>\n",
+       "    opening_m     (time, nj, ni) float32 39MB dask.array<chunksize=(1, 675, 900), meta=np.ndarray>\n",
+       "    aicen_m       (time, nc, nj, ni) float32 194MB dask.array<chunksize=(1, 1, 675, 900), meta=np.ndarray>\n",
+       "    vicen_m       (time, nc, nj, ni) float32 194MB dask.array<chunksize=(1, 1, 675, 900), meta=np.ndarray>\n",
+       "    fmelttn_ai_m  (time, nc, nj, ni) float32 194MB dask.array<chunksize=(1, 1, 675, 900), meta=np.ndarray>\n",
+       "    flatn_ai_m    (time, nc, nj, ni) float32 194MB dask.array<chunksize=(1, 1, 675, 900), meta=np.ndarray>\n",
+       "Attributes: (12/24)\n",
+       "    title:                                    sea ice model output for CICE\n",
+       "    contents:                                 Diagnostic and Prognostic Varia...\n",
+       "    source:                                   Los Alamos Sea Ice Model (CICE)...\n",
+       "    comment:                                  This Year Has 365 days\n",
+       "    comment2:                                 File written on model date 1900...\n",
+       "    comment3:                                 seconds elapsed into model date...\n",
+       "    ...                                       ...\n",
+       "    intake_esm_attrs:variable_cell_methods:   ,,,,,,,,,,,,,,,,,,time: mean,ti...\n",
+       "    intake_esm_attrs:variable_units:          days since 1900-01-01 00:00:00,...\n",
+       "    intake_esm_attrs:filename:                iceh.1900-01.nc\n",
+       "    intake_esm_attrs:file_id:                 iceh_XXXX_XX\n",
+       "    intake_esm_attrs:_data_format_:           netcdf\n",
+       "    intake_esm_dataset_key:                   iceh_XXXX_XX.1mon
" + ], + "text/plain": [ + " Size: 3GB\n", + "Dimensions: (time: 1, d2: 2, nj: 2700, ni: 3600, nc: 5)\n", + "Coordinates:\n", + " * time (time) object 8B 1900-02-01 00:00:00\n", + " TLON (nj, ni) float32 39MB dask.array\n", + " TLAT (nj, ni) float32 39MB dask.array\n", + " ULON (nj, ni) float32 39MB dask.array\n", + " ULAT (nj, ni) float32 39MB dask.array\n", + " NCAT (nc) float32 20B dask.array\n", + "Dimensions without coordinates: d2, nj, ni, nc\n", + "Data variables: (12/49)\n", + " time_bounds (time, d2) object 16B dask.array\n", + " tmask (nj, ni) float32 39MB dask.array\n", + " blkmask (nj, ni) float32 39MB dask.array\n", + " tarea (nj, ni) float32 39MB dask.array\n", + " uarea (nj, ni) float32 39MB dask.array\n", + " dxt (nj, ni) float32 39MB dask.array\n", + " ... ...\n", + " fmeltt_ai_m (time, nj, ni) float32 39MB dask.array\n", + " opening_m (time, nj, ni) float32 39MB dask.array\n", + " aicen_m (time, nc, nj, ni) float32 194MB dask.array\n", + " vicen_m (time, nc, nj, ni) float32 194MB dask.array\n", + " fmelttn_ai_m (time, nc, nj, ni) float32 194MB dask.array\n", + " flatn_ai_m (time, nc, nj, ni) float32 194MB dask.array\n", + "Attributes: (12/24)\n", + " title: sea ice model output for CICE\n", + " contents: Diagnostic and Prognostic Varia...\n", + " source: Los Alamos Sea Ice Model (CICE)...\n", + " comment: This Year Has 365 days\n", + " comment2: File written on model date 1900...\n", + " comment3: seconds elapsed into model date...\n", + " ... 
...\n", + " intake_esm_attrs:variable_cell_methods: ,,,,,,,,,,,,,,,,,,time: mean,ti...\n", + " intake_esm_attrs:variable_units: days since 1900-01-01 00:00:00,...\n", + " intake_esm_attrs:filename: iceh.1900-01.nc\n", + " intake_esm_attrs:file_id: iceh_XXXX_XX\n", + " intake_esm_attrs:_data_format_: netcdf\n", + " intake_esm_dataset_key: iceh_XXXX_XX.1mon" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Now, if we wanted to subset and load some data, we could with, for example...\n", + "first_result = esm_ds.df.head(1).path\n", + "esm_ds.search(path=first_result).to_dask()" + ] + }, + { + "cell_type": "markdown", + "id": "88b8951f-98ff-48da-ae7a-929b194fdeac", + "metadata": {}, + "source": [ + "___\n", + "# Using a Datastore - applies to datastores from 1. & 2. \n", + "___" + ] + }, + { + "cell_type": "markdown", + "id": "27b46641-0de8-412d-aaa1-bb5933950b3e", + "metadata": {}, + "source": [ + "## Searching a datastore\n", + "\n", + "ESM Datastores contain a lot of useful information that lets us search for the data we're after, before we load it into dask/xarray. Lets look at some examples." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "38e5f86f-f5da-4399-a439-f5e91a4de645", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Client

\n", + "

Client-7afac66b-b12f-11ef-8206-000007cdfe80

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
Connection method: Cluster objectCluster type: distributed.LocalCluster
\n", + " Dashboard: /proxy/8787/status\n", + "
\n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "

Cluster Info

\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

LocalCluster

\n", + "

f22edfa7

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + "
\n", + " Dashboard: /proxy/8787/status\n", + " \n", + " Workers: 7\n", + "
\n", + " Total threads: 7\n", + " \n", + " Total memory: 32.00 GiB\n", + "
Status: runningUsing processes: True
\n", + "\n", + "
\n", + " \n", + "

Scheduler Info

\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

Scheduler

\n", + "

Scheduler-22bd2e94-42ab-467f-b23e-8bdfc90fcd73

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Comm: tcp://127.0.0.1:43579\n", + " \n", + " Workers: 7\n", + "
\n", + " Dashboard: /proxy/8787/status\n", + " \n", + " Total threads: 7\n", + "
\n", + " Started: Just now\n", + " \n", + " Total memory: 32.00 GiB\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "

Workers

\n", + "
\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 0

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:33687\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /proxy/46001/status\n", + " \n", + " Memory: 4.57 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:46467\n", + "
\n", + " Local directory: /jobfs/129999899.gadi-pbs/dask-scratch-space/worker-gf8vo9lz\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 1

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:45257\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /proxy/35963/status\n", + " \n", + " Memory: 4.57 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:35487\n", + "
\n", + " Local directory: /jobfs/129999899.gadi-pbs/dask-scratch-space/worker-r8pdwszn\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 2

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:40103\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /proxy/44275/status\n", + " \n", + " Memory: 4.57 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:37115\n", + "
\n", + " Local directory: /jobfs/129999899.gadi-pbs/dask-scratch-space/worker-g7o3_xof\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 3

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:45143\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /proxy/37669/status\n", + " \n", + " Memory: 4.57 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:39899\n", + "
\n", + " Local directory: /jobfs/129999899.gadi-pbs/dask-scratch-space/worker-_sctlqc1\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 4

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:40513\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /proxy/37813/status\n", + " \n", + " Memory: 4.57 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:46597\n", + "
\n", + " Local directory: /jobfs/129999899.gadi-pbs/dask-scratch-space/worker-lzof_0q0\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 5

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:32865\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /proxy/34691/status\n", + " \n", + " Memory: 4.57 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:43811\n", + "
\n", + " Local directory: /jobfs/129999899.gadi-pbs/dask-scratch-space/worker-leffdmxa\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 6

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:39425\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /proxy/35937/status\n", + " \n", + " Memory: 4.57 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:41435\n", + "
\n", + " Local directory: /jobfs/129999899.gadi-pbs/dask-scratch-space/worker-4ngybr46\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Now, we import the Dask client, so we can get a bit more control over loading variables\n", + "from dask.distributed import Client\n", + "client = Client(threads_per_worker=1)\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "23ff97f1-e9c6-4c8f-9a82-f4b7e3024d90", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

01deg_jra55v13_ryf9091 catalog with 34 dataset(s) from 11947 asset(s):

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique
path11947
realm2
variable178
frequency5
start_date3361
end_date3360
variable_long_name181
variable_standard_name36
variable_cell_methods3
variable_units50
filename3469
file_id33
derived_variable0
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "esm_ds" + ] + }, + { + "cell_type": "markdown", + "id": "9c023f93-5492-4cd3-bf86-067e1edf36df", + "metadata": {}, + "source": [ + "We can see that we have a number of different fields in our esm datastore:\n", + "- filename\n", + "- file_id\n", + "- ...\n", + "- realm\n", + "- derived_variable\n", + "\n", + "Other than `derived_variable`, we can use any of these to search for data. Let's say we are interested in temperature and salinity: let's search for anything where `variable` contains the string `temp`. " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "23e08f63-1b5c-49f0-b7b1-8227f8744cc2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

01deg_jra55v13_ryf9091 catalog with 5 dataset(s) from 1593 asset(s):

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique
path1593
realm1
variable29
frequency4
start_date1403
end_date1404
variable_long_name29
variable_standard_name11
variable_cell_methods2
variable_units16
filename15
file_id4
derived_variable0
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "temp_ds = esm_ds.search(variable='temp')\n", + "temp_ds" + ] + }, + { + "cell_type": "markdown", + "id": "8d38dada-6ad7-4c4e-8f59-a3eebc1ab7e4", + "metadata": {}, + "source": [ + "This gives us 5 datasets, with 4 frequencies. We can load them all at once with `.to_dataset_dict()`.\n", + "\n", + "We won't run the cell below - it uses an awful lot of memory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a874830-1d5a-4e9f-bc43-ea8f4d5e5f95", + "metadata": {}, + "outputs": [], + "source": [ + "temp_ds.to_dataset_dict()" + ] + }, + { + "cell_type": "markdown", + "id": "cc82d6a3-3c8d-46d1-bb86-e04ee4115df9", + "metadata": {}, + "source": [ + "-> The keys in the returned dictionary of datasets are constructed as follows:\n", + "\t'file_id.frequency'\n", + "\n", + " 60.00% [3/5 00:37<00:25]" + ] + }, + { + "cell_type": "markdown", + "id": "5497a309-58d8-4a88-8d30-10ededda0a02", + "metadata": {}, + "source": [ + "We can see from the ways that the keys have been constructed (`file_id.frequency`) that we should be able to get a single dataset if we pick out the right combination of file_id and frequency. \n", + "\n", + "Lets do that." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f5ce5320-bdd5-4431-b7e4-9544a7cd92a9", + "metadata": {}, + "outputs": [], + "source": [ + "# We can search in a couple of ways - by chaining searches, or one at a time.\n", + "temp_ds_chained = esm_ds.search(variable='temp').search(frequency='3mon')\n", + "temp_ds_one_search = esm_ds.search(variable='temp',frequency='3mon')\n", + "\n", + "# We can also search for multiple queries at once: eg.\n", + "\n", + "temp_salt_ds = esm_ds.search(variable=['temp','salt'],frequency='3mon')\n", + "# or\n", + "temp_ds_multifreq = esm_ds.search(variable='temp',frequency=['1mon','3mon'])\n", + "# or even (the following two are equivalent)\n", + "temp_salt_ds_multifreq = esm_ds.search(variable=['temp','salt'],frequency=['1mon','3mon'])\n", + "temp_salt_ds_multifreq = esm_ds.search(variable=['temp','salt']).search(frequency=['1mon','3mon'])\n", + "\n", + "# We can search for any combination of the fields in our esm datastore\n", + "# and if we want to search for multiple possible variables/frequencies/ dates, etc etc, we can use lists.\n", + "\n", + "# We can also use regular expressions (regex's) to search for patterns: eg\n", + "temp_ds_1990s = esm_ds.search(variable='temp',start_date='199.*')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1da84f71-1605-4add-9129-ba9088fa6cb5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pathrealmvariablefrequencystart_dateend_datevariable_long_namevariable_standard_namevariable_cell_methodsvariable_unitsfilenamefile_id
0/g/data/ik11/outputs/access-om2-01/01deg_jra55v13_ryf9091/output356/ocean/ocean.ncocean[temp, pot_temp, salt, age_global, u, v, wt, dzt, pot_rho_2, pot_rho_0, tx_trans, ty_trans, ty_trans_submeso, tx_trans_rho, ty_trans_rho, ty_trans_nrho_submeso, temp_xflux_adv, temp_yflux_adv, buo...1mon1990-01-01, 00:00:001990-04-01, 00:00:00[Conservative temperature, Potential temperature, Practical Salinity, Age (global), i-current, j-current, dia-surface velocity T-points, t-cell thickness, potential density referenced to 2000 dbar...[sea_water_conservative_temperature, sea_water_potential_temperature, sea_water_salinity, sea_water_age_since_surface_contact, sea_water_x_velocity, sea_water_y_velocity, , cell_thickness, , sea_w...[time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, tim...[K, K, psu, yr, m/sec, m/sec, m/sec, m, kg/m^3, kg/m^3, kg/s, kg/s, kg/s, kg/s, kg/s, kg/s, Watts, Watts, 1/s^2, 1/sec^3, (kg/m^3)*(m^2/s^2), (kg/m^3)*(m^2/s^2), 1/sec^4, days since 1900-01-01 00:...ocean.ncocean
1/g/data/ik11/outputs/access-om2-01/01deg_jra55v13_ryf9091/output357/ocean/ocean.ncocean[temp, pot_temp, salt, age_global, u, v, wt, dzt, pot_rho_2, pot_rho_0, tx_trans, ty_trans, ty_trans_submeso, tx_trans_rho, ty_trans_rho, ty_trans_nrho_submeso, temp_xflux_adv, temp_yflux_adv, buo...1mon1990-04-01, 00:00:001990-07-01, 00:00:00[Conservative temperature, Potential temperature, Practical Salinity, Age (global), i-current, j-current, dia-surface velocity T-points, t-cell thickness, potential density referenced to 2000 dbar...[sea_water_conservative_temperature, sea_water_potential_temperature, sea_water_salinity, sea_water_age_since_surface_contact, sea_water_x_velocity, sea_water_y_velocity, , cell_thickness, , sea_w...[time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, tim...[K, K, psu, yr, m/sec, m/sec, m/sec, m, kg/m^3, kg/m^3, kg/s, kg/s, kg/s, kg/s, kg/s, kg/s, Watts, Watts, 1/s^2, 1/sec^3, (kg/m^3)*(m^2/s^2), (kg/m^3)*(m^2/s^2), 1/sec^4, days since 1900-01-01 00:...ocean.ncocean
2/g/data/ik11/outputs/access-om2-01/01deg_jra55v13_ryf9091/output358/ocean/ocean.ncocean[temp, pot_temp, salt, age_global, u, v, wt, dzt, pot_rho_2, pot_rho_0, tx_trans, ty_trans, ty_trans_submeso, tx_trans_rho, ty_trans_rho, ty_trans_nrho_submeso, temp_xflux_adv, temp_yflux_adv, buo...1mon1990-07-01, 00:00:001990-10-01, 00:00:00[Conservative temperature, Potential temperature, Practical Salinity, Age (global), i-current, j-current, dia-surface velocity T-points, t-cell thickness, potential density referenced to 2000 dbar...[sea_water_conservative_temperature, sea_water_potential_temperature, sea_water_salinity, sea_water_age_since_surface_contact, sea_water_x_velocity, sea_water_y_velocity, , cell_thickness, , sea_w...[time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, tim...[K, K, psu, yr, m/sec, m/sec, m/sec, m, kg/m^3, kg/m^3, kg/s, kg/s, kg/s, kg/s, kg/s, kg/s, Watts, Watts, 1/s^2, 1/sec^3, (kg/m^3)*(m^2/s^2), (kg/m^3)*(m^2/s^2), 1/sec^4, days since 1900-01-01 00:...ocean.ncocean
\n", + "
" + ], + "text/plain": [ + " path \\\n", + "0 /g/data/ik11/outputs/access-om2-01/01deg_jra55v13_ryf9091/output356/ocean/ocean.nc \n", + "1 /g/data/ik11/outputs/access-om2-01/01deg_jra55v13_ryf9091/output357/ocean/ocean.nc \n", + "2 /g/data/ik11/outputs/access-om2-01/01deg_jra55v13_ryf9091/output358/ocean/ocean.nc \n", + "\n", + " realm \\\n", + "0 ocean \n", + "1 ocean \n", + "2 ocean \n", + "\n", + " variable \\\n", + "0 [temp, pot_temp, salt, age_global, u, v, wt, dzt, pot_rho_2, pot_rho_0, tx_trans, ty_trans, ty_trans_submeso, tx_trans_rho, ty_trans_rho, ty_trans_nrho_submeso, temp_xflux_adv, temp_yflux_adv, buo... \n", + "1 [temp, pot_temp, salt, age_global, u, v, wt, dzt, pot_rho_2, pot_rho_0, tx_trans, ty_trans, ty_trans_submeso, tx_trans_rho, ty_trans_rho, ty_trans_nrho_submeso, temp_xflux_adv, temp_yflux_adv, buo... \n", + "2 [temp, pot_temp, salt, age_global, u, v, wt, dzt, pot_rho_2, pot_rho_0, tx_trans, ty_trans, ty_trans_submeso, tx_trans_rho, ty_trans_rho, ty_trans_nrho_submeso, temp_xflux_adv, temp_yflux_adv, buo... \n", + "\n", + " frequency start_date end_date \\\n", + "0 1mon 1990-01-01, 00:00:00 1990-04-01, 00:00:00 \n", + "1 1mon 1990-04-01, 00:00:00 1990-07-01, 00:00:00 \n", + "2 1mon 1990-07-01, 00:00:00 1990-10-01, 00:00:00 \n", + "\n", + " variable_long_name \\\n", + "0 [Conservative temperature, Potential temperature, Practical Salinity, Age (global), i-current, j-current, dia-surface velocity T-points, t-cell thickness, potential density referenced to 2000 dbar... \n", + "1 [Conservative temperature, Potential temperature, Practical Salinity, Age (global), i-current, j-current, dia-surface velocity T-points, t-cell thickness, potential density referenced to 2000 dbar... \n", + "2 [Conservative temperature, Potential temperature, Practical Salinity, Age (global), i-current, j-current, dia-surface velocity T-points, t-cell thickness, potential density referenced to 2000 dbar... 
\n", + "\n", + " variable_standard_name \\\n", + "0 [sea_water_conservative_temperature, sea_water_potential_temperature, sea_water_salinity, sea_water_age_since_surface_contact, sea_water_x_velocity, sea_water_y_velocity, , cell_thickness, , sea_w... \n", + "1 [sea_water_conservative_temperature, sea_water_potential_temperature, sea_water_salinity, sea_water_age_since_surface_contact, sea_water_x_velocity, sea_water_y_velocity, , cell_thickness, , sea_w... \n", + "2 [sea_water_conservative_temperature, sea_water_potential_temperature, sea_water_salinity, sea_water_age_since_surface_contact, sea_water_x_velocity, sea_water_y_velocity, , cell_thickness, , sea_w... \n", + "\n", + " variable_cell_methods \\\n", + "0 [time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, tim... \n", + "1 [time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, tim... \n", + "2 [time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, time: mean, tim... \n", + "\n", + " variable_units \\\n", + "0 [K, K, psu, yr, m/sec, m/sec, m/sec, m, kg/m^3, kg/m^3, kg/s, kg/s, kg/s, kg/s, kg/s, kg/s, Watts, Watts, 1/s^2, 1/sec^3, (kg/m^3)*(m^2/s^2), (kg/m^3)*(m^2/s^2), 1/sec^4, days since 1900-01-01 00:... \n", + "1 [K, K, psu, yr, m/sec, m/sec, m/sec, m, kg/m^3, kg/m^3, kg/s, kg/s, kg/s, kg/s, kg/s, kg/s, Watts, Watts, 1/s^2, 1/sec^3, (kg/m^3)*(m^2/s^2), (kg/m^3)*(m^2/s^2), 1/sec^4, days since 1900-01-01 00:... 
\n", + "2 [K, K, psu, yr, m/sec, m/sec, m/sec, m, kg/m^3, kg/m^3, kg/s, kg/s, kg/s, kg/s, kg/s, kg/s, Watts, Watts, 1/s^2, 1/sec^3, (kg/m^3)*(m^2/s^2), (kg/m^3)*(m^2/s^2), 1/sec^4, days since 1900-01-01 00:... \n", + "\n", + " filename file_id \n", + "0 ocean.nc ocean \n", + "1 ocean.nc ocean \n", + "2 ocean.nc ocean " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# We can turn our datastore into a pandas dataframe to look at it in more detail with the .df method:\n", + "temp_ds_1990s.df.head(3)\n", + "# This confirms that we have the dates we wanted. " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "81335536-143c-4379-8293-e7c418a4f9eb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

01deg_jra55v13_ryf9091 catalog with 1 dataset(s) from 40 asset(s):

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique
path40
realm1
variable27
frequency1
start_date40
end_date40
variable_long_name27
variable_standard_name11
variable_cell_methods2
variable_units14
filename1
file_id1
derived_variable0
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# It turns out this only gives us one dataset, with 40 files - let's load it and take a look.\n", + "temp_ds_1990s" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "4f546fc4-d0f5-4a33-96dd-8d9c38801f2c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 350GB\n",
+       "Dimensions:   (time: 120, st_ocean: 75, yt_ocean: 2700, xt_ocean: 3600)\n",
+       "Coordinates:\n",
+       "  * xt_ocean  (xt_ocean) float64 29kB -279.9 -279.8 -279.7 ... 79.75 79.85 79.95\n",
+       "  * yt_ocean  (yt_ocean) float64 22kB -81.11 -81.07 -81.02 ... 89.89 89.94 89.98\n",
+       "  * st_ocean  (st_ocean) float64 600B 0.5413 1.681 2.94 ... 5.511e+03 5.709e+03\n",
+       "  * time      (time) object 960B 1990-01-16 12:00:00 ... 1999-12-16 12:00:00\n",
+       "Data variables:\n",
+       "    temp      (time, st_ocean, yt_ocean, xt_ocean) float32 350GB dask.array<chunksize=(1, 7, 300, 400), meta=np.ndarray>\n",
+       "Attributes: (12/16)\n",
+       "    filename:                                 ocean.nc\n",
+       "    title:                                    ACCESS-OM2-01\n",
+       "    grid_type:                                mosaic\n",
+       "    grid_tile:                                1\n",
+       "    intake_esm_vars:                          ['temp']\n",
+       "    intake_esm_attrs:realm:                   ocean\n",
+       "    ...                                       ...\n",
+       "    intake_esm_attrs:variable_cell_methods:   time: mean,time: mean,time: mea...\n",
+       "    intake_esm_attrs:variable_units:          K,K,psu,yr,m/sec,m/sec,m/sec,m,...\n",
+       "    intake_esm_attrs:filename:                ocean.nc\n",
+       "    intake_esm_attrs:file_id:                 ocean\n",
+       "    intake_esm_attrs:_data_format_:           netcdf\n",
+       "    intake_esm_dataset_key:                   ocean.1mon
" + ], + "text/plain": [ + " Size: 350GB\n", + "Dimensions: (time: 120, st_ocean: 75, yt_ocean: 2700, xt_ocean: 3600)\n", + "Coordinates:\n", + " * xt_ocean (xt_ocean) float64 29kB -279.9 -279.8 -279.7 ... 79.75 79.85 79.95\n", + " * yt_ocean (yt_ocean) float64 22kB -81.11 -81.07 -81.02 ... 89.89 89.94 89.98\n", + " * st_ocean (st_ocean) float64 600B 0.5413 1.681 2.94 ... 5.511e+03 5.709e+03\n", + " * time (time) object 960B 1990-01-16 12:00:00 ... 1999-12-16 12:00:00\n", + "Data variables:\n", + " temp (time, st_ocean, yt_ocean, xt_ocean) float32 350GB dask.array\n", + "Attributes: (12/16)\n", + " filename: ocean.nc\n", + " title: ACCESS-OM2-01\n", + " grid_type: mosaic\n", + " grid_tile: 1\n", + " intake_esm_vars: ['temp']\n", + " intake_esm_attrs:realm: ocean\n", + " ... ...\n", + " intake_esm_attrs:variable_cell_methods: time: mean,time: mean,time: mea...\n", + " intake_esm_attrs:variable_units: K,K,psu,yr,m/sec,m/sec,m/sec,m,...\n", + " intake_esm_attrs:filename: ocean.nc\n", + " intake_esm_attrs:file_id: ocean\n", + " intake_esm_attrs:_data_format_: netcdf\n", + " intake_esm_dataset_key: ocean.1mon" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# If we have one dataset in our search results, we can use `.to_dask()` on our dataset to load it.\n", + "temp_ds_1990s.to_dask()" + ] + }, + { + "cell_type": "markdown", + "id": "4efc0cba-8ee0-4aef-88f1-e5bd9a39b22d", + "metadata": {}, + "source": [ + "If we look at this output, we can see that our entire array is 350 GB, but it is split into 106920 'chunks' of shape (1, 7, 300, 400), each of which is only about 3 MiB. This means that when we load our dataset, dask will have to combine 106920 __very small__ chunks. Things would be much faster if it didn't have to do that: so let's tell dask the chunks we want to use.\n", + "\n", + "This is also the cause of the warnings above (NB. 
lots of them have been cleared for readability).\n", + "\n", + "___\n", + "### What is a Chunk?\n", + "\n", + "Dask is designed to handle *distributed* datasets. Essentially, this means it can load lots of files (more than would fit in memory), split the data into pieces called *chunks*, operate on each chunk separately, and then combine the results later to get an answer that fits in memory. \n", + "\n", + "In this instance, our chunks are way smaller than they need to be - and this is inefficient, because dask has to schedule and keep track of every chunk individually.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "3e2d0b09-d3ef-4bbe-95ac-293177a3ba73", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 350GB\n",
+       "Dimensions:   (time: 120, st_ocean: 75, yt_ocean: 2700, xt_ocean: 3600)\n",
+       "Coordinates:\n",
+       "  * xt_ocean  (xt_ocean) float64 29kB -279.9 -279.8 -279.7 ... 79.75 79.85 79.95\n",
+       "  * yt_ocean  (yt_ocean) float64 22kB -81.11 -81.07 -81.02 ... 89.89 89.94 89.98\n",
+       "  * st_ocean  (st_ocean) float64 600B 0.5413 1.681 2.94 ... 5.511e+03 5.709e+03\n",
+       "  * time      (time) object 960B 1990-01-16 12:00:00 ... 1999-12-16 12:00:00\n",
+       "Data variables:\n",
+       "    temp      (time, st_ocean, yt_ocean, xt_ocean) float32 350GB dask.array<chunksize=(1, 75, 900, 1200), meta=np.ndarray>\n",
+       "Attributes: (12/16)\n",
+       "    filename:                                 ocean.nc\n",
+       "    title:                                    ACCESS-OM2-01\n",
+       "    grid_type:                                mosaic\n",
+       "    grid_tile:                                1\n",
+       "    intake_esm_vars:                          ['temp']\n",
+       "    intake_esm_attrs:realm:                   ocean\n",
+       "    ...                                       ...\n",
+       "    intake_esm_attrs:variable_cell_methods:   time: mean,time: mean,time: mea...\n",
+       "    intake_esm_attrs:variable_units:          K,K,psu,yr,m/sec,m/sec,m/sec,m,...\n",
+       "    intake_esm_attrs:filename:                ocean.nc\n",
+       "    intake_esm_attrs:file_id:                 ocean\n",
+       "    intake_esm_attrs:_data_format_:           netcdf\n",
+       "    intake_esm_dataset_key:                   ocean.1mon
" + ], + "text/plain": [ + " Size: 350GB\n", + "Dimensions: (time: 120, st_ocean: 75, yt_ocean: 2700, xt_ocean: 3600)\n", + "Coordinates:\n", + " * xt_ocean (xt_ocean) float64 29kB -279.9 -279.8 -279.7 ... 79.75 79.85 79.95\n", + " * yt_ocean (yt_ocean) float64 22kB -81.11 -81.07 -81.02 ... 89.89 89.94 89.98\n", + " * st_ocean (st_ocean) float64 600B 0.5413 1.681 2.94 ... 5.511e+03 5.709e+03\n", + " * time (time) object 960B 1990-01-16 12:00:00 ... 1999-12-16 12:00:00\n", + "Data variables:\n", + " temp (time, st_ocean, yt_ocean, xt_ocean) float32 350GB dask.array\n", + "Attributes: (12/16)\n", + " filename: ocean.nc\n", + " title: ACCESS-OM2-01\n", + " grid_type: mosaic\n", + " grid_tile: 1\n", + " intake_esm_vars: ['temp']\n", + " intake_esm_attrs:realm: ocean\n", + " ... ...\n", + " intake_esm_attrs:variable_cell_methods: time: mean,time: mean,time: mea...\n", + " intake_esm_attrs:variable_units: K,K,psu,yr,m/sec,m/sec,m/sec,m,...\n", + " intake_esm_attrs:filename: ocean.nc\n", + " intake_esm_attrs:file_id: ocean\n", + " intake_esm_attrs:_data_format_: netcdf\n", + " intake_esm_dataset_key: ocean.1mon" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# We have 120 files - one for each time step. 
Because we have lots of data in the x & y dimensions, we will\n", + "# still need to split them up along these dimensions.\n", + "# Luckily, we can pass `.to_dask()` a chunk specification - and make sure we get sensibly sized chunks.\n", + "\n", + "# As a rule of thumb, 300MiB chunks are a good target.\n", + "\n", + "chunk_spec = {\n", + " # 'time' : 120, We don't need to pass a time chunk - it defaults to one per file anyway - but this is how we would tell it to.\n", + " 'st_ocean' : -1, # We can also use -1 to mean '1 chunk for the whole dimension' - we could have done this for time too & it would be the exact same.\n", + " 'yt_ocean' : 900, \n", + " 'xt_ocean': 1200\n", + " \n", + "}\n", + "\n", + "temp_ds_1990s.to_dask(\n", + " xarray_open_kwargs = {'chunks': chunk_spec} \n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "a39e7ff1-9d6e-4d39-8126-cc73adb4a6ad", + "metadata": {}, + "source": [ + "## Now we have more sensible chunks, we can easily operate on our data. \n", + "### Let's average over time first to get a climatological state, & then plot the zonal mean temperature structure around the Southern Ocean.\n", + "\n", + "This all uses standard xarray operations now, which you are hopefully familiar with.\n", + "\n", + "For a brief intro to xarray, see the excellent notes at https://cmip6moap.github.io/resources/loading-data-xarray/" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "0f2326d4-dae1-4b30-923b-831b3061d03b", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "xr_ds = temp_ds_1990s.to_dask(\n", + " xarray_open_kwargs = {'chunks' : chunk_spec}\n", + ")\n", + "\n", + "SOU_OCN_DS = xr_ds.sel(yt_ocean=slice(-90,-40)) # Just get south of 40 South\n", + "\n", + "SOU_ZM = SOU_OCN_DS.mean(dim=['time','xt_ocean']) # Average over time & longitude\n", + "\n", + "SOU_ZM['temp_c'] = SOU_ZM['temp'] - 273\n", + "\n", + "# Create the plot\n", + "fig, ax = plt.subplots()\n", + "\n", + "SOU_ZM['temp_c'].plot(yincrease=False, ax = ax)\n", + "\n", + "# Set the background color\n", + "ax.set_facecolor('lightgrey')\n", + "\n", + "plt.show()\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:analysis3-24.04] *", + "language": "python", + "name": "conda-env-analysis3-24.04-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}