From 1ff11e740d72d1749eee0b78b6fc03d028ef2ed5 Mon Sep 17 00:00:00 2001
From: psmsong
Date: Wed, 12 Jun 2024 18:31:33 -0700
Subject: [PATCH 1/3] start argovis section

---
 notebooks/argo-access.ipynb | 734 ++++++++----------------------------
 1 file changed, 156 insertions(+), 578 deletions(-)

diff --git a/notebooks/argo-access.ipynb b/notebooks/argo-access.ipynb
index 4ba7e5d..98ba9be 100644
--- a/notebooks/argo-access.ipynb
+++ b/notebooks/argo-access.ipynb
@@ -33,15 +33,18 @@
    "source": [
     "## Overview\n",
     "\n",
-    "Building upon previous notebook, [Introduction to Argo](notebooks/argo-introduction.ipynb), we next explore how to access Argo data in a few different ways. \n",
+    "Building upon the previous notebook, [Introduction to Argo](notebooks/argo-introduction.ipynb), we next explore how to access Argo data using various methods.\n",
     "\n",
-    "1. Data formats for Argo profiles\n",
-    "2. Downloading [monthly snapshots](http://www.argodatamgt.org/Access-to-data/Argo-DOI-Digital-Object-Identifier) using Argo DOI's\n",
+    "These methods are described fully on their respective websites, linked below. Our goal here is to provide a brief overview of some of the different tools available. \n",
+    "\n",
+    "1. Introducing data formats for Argo profiles\n",
+    "2. Using [Argopy](https://argopy.readthedocs.io/en/latest/user-guide/fetching-argo-data/index.html), a dedicated Python package\n",
     "3. Using [Argovis](https://argovis.colorado.edu/argo) for API-based queries \n",
-    "4. Using the [GO-BGC Toolbox](https://github.com/go-bgc/workshop-python)\n",
-    "5. Using [Argopy](https://argopy.readthedocs.io/en/latest/user-guide/fetching-argo-data/index.html), a dedicated Python package\n",
     "\n",
-    "After going through this notebook, you will be able to retrieve Argo data of interest within a certain time frame, geographical location, or by platform identifier. There are many ways of working with Argo data which are not described here. \n",
+    "\n",
+    "\n",
+    "\n",
+    "After going through this notebook, you will be able to retrieve Argo data of interest within a certain time frame, geographical location, or by platform identifier. There are many other ways of working with Argo data, so we encourage users to explore which tools work best for their needs. \n",
     "Further information on Argo access can be found on the [Argo website](https://argo.ucsd.edu/data/)."
    ]
  },
@@ -95,6 +98,7 @@
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.colors as mcolors\n",
    "import seaborn as sns\n",
+    "from cmocean import cm as cmo\n",
    "\n",
    "from argovisHelpers import helpers as avh"
   ]
  },
@@ -103,12 +107,38 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Snapshots"
+    "## Background: Common xarray formats"
   ]
  },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "## 1. Using GO-BGC Toolbox GDAC function\n",
+   "\n",
+   "We recommend this tool for users who only need a few profiles in a specific area of interest. 
\n", + "Considerations: \n", + "- Easy to use and understand\n", + "- Downloads synthetic profiles" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Argo profiles are " ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -119,579 +149,60 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "DSdict = {}\n", - "for filename in os.listdir(profile_dir):\n", - " if filename.endswith(\".nc\"):\n", - " fp = profile_dir + filename\n", - " single_dataset = xr.open_dataset(fp, decode_times=False)\n", - " DSdict[filename[0:7]] = single_dataset\n", - "# DSdict['5906030']" + "# DSdict = {}\n", + "# for filename in os.listdir(profile_dir):\n", + "# if filename.endswith(\".nc\"):\n", + "# fp = profile_dir + filename\n", + "# single_dataset = xr.open_dataset(fp, decode_times=False)\n", + "# DSdict[filename[0:7]] = single_dataset\n", + "# # DSdict['5906030']" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.Dataset> Size: 2MB\n",
-       "Dimensions:              (N_PROF: 117, N_LEVELS: 69, NPARAMETER: 42)\n",
-       "Dimensions without coordinates: N_PROF, N_LEVELS, NPARAMETER\n",
-       "Data variables: (12/60)\n",
-       "    Cruise               |S11 11B ...\n",
-       "    Station              (N_PROF) int32 468B ...\n",
-       "    Lon                  (N_PROF) float64 936B ...\n",
-       "    Lat                  (N_PROF) float64 936B ...\n",
-       "    Lat_QF               (N_PROF) |S1 117B ...\n",
-       "    Lat_QFA              (N_PROF) float64 936B ...\n",
-       "    ...                   ...\n",
-       "    Type                 |S1 1B ...\n",
-       "    mon_day_yr           (N_PROF) |S10 1kB ...\n",
-       "    hh_mm                (N_PROF) |S5 585B ...\n",
-       "    Parameters           (NPARAMETER) |S19 798B ...\n",
-       "    JULD                 (N_PROF) float64 936B ...\n",
-       "    REFERENCE_DATE_TIME  object 8B ...\n",
-       "Attributes:\n",
-       "    Comments:  \\n//0\\n//<Encoding>UTF-8</Encoding>\\n//File updated on 08/26/2...
" - ], - "text/plain": [ - " Size: 2MB\n", - "Dimensions: (N_PROF: 117, N_LEVELS: 69, NPARAMETER: 42)\n", - "Dimensions without coordinates: N_PROF, N_LEVELS, NPARAMETER\n", - "Data variables: (12/60)\n", - " Cruise |S11 11B ...\n", - " Station (N_PROF) int32 468B ...\n", - " Lon (N_PROF) float64 936B ...\n", - " Lat (N_PROF) float64 936B ...\n", - " Lat_QF (N_PROF) |S1 117B ...\n", - " Lat_QFA (N_PROF) float64 936B ...\n", - " ... ...\n", - " Type |S1 1B ...\n", - " mon_day_yr (N_PROF) |S10 1kB ...\n", - " hh_mm (N_PROF) |S5 585B ...\n", - " Parameters (NPARAMETER) |S19 798B ...\n", - " JULD (N_PROF) float64 936B ...\n", - " REFERENCE_DATE_TIME object 8B ...\n", - "Attributes:\n", - " Comments: \\n//0\\n//UTF-8\\n//File updated on 08/26/2..." - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "DSdict['5906007']\n" - ] + "outputs": [], + "source": [] }, { - "cell_type": "code", - "execution_count": 11, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'/Users/sangminsong/Library/CloudStorage/OneDrive-UW/Code/2024_Pythia/SOCCOM_GO-BGC_LoResQC_LIAR_28Aug2023_netcdf/'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "profile_dir" + "## Using the `Argopy` Python Package" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "unrecognized chunk manager dask - must be one of: []", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[14], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mxr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen_mfdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/Users/sangminsong/Library/CloudStorage/OneDrive-UW/Code/2024_Pythia/SOCCOM_GO-BGC_LoResQC_LIAR_28Aug2023_netcdf/*.nc\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/sklearn-argo-dev/lib/python3.9/site-packages/xarray/backends/api.py:1054\u001b[0m, in \u001b[0;36mopen_mfdataset\u001b[0;34m(paths, chunks, concat_dim, compat, preprocess, engine, data_vars, coords, combine, parallel, join, attrs_file, combine_attrs, **kwargs)\u001b[0m\n\u001b[1;32m 1051\u001b[0m open_ \u001b[38;5;241m=\u001b[39m open_dataset\n\u001b[1;32m 1052\u001b[0m getattr_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m\n\u001b[0;32m-> 1054\u001b[0m datasets \u001b[38;5;241m=\u001b[39m [open_(p, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mopen_kwargs) \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m paths]\n\u001b[1;32m 1055\u001b[0m closers \u001b[38;5;241m=\u001b[39m [getattr_(ds, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_close\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m ds \u001b[38;5;129;01min\u001b[39;00m datasets]\n\u001b[1;32m 1056\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m preprocess \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/sklearn-argo-dev/lib/python3.9/site-packages/xarray/backends/api.py:1054\u001b[0m, in 
\u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1051\u001b[0m open_ \u001b[38;5;241m=\u001b[39m open_dataset\n\u001b[1;32m 1052\u001b[0m getattr_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m\n\u001b[0;32m-> 1054\u001b[0m datasets \u001b[38;5;241m=\u001b[39m [\u001b[43mopen_\u001b[49m\u001b[43m(\u001b[49m\u001b[43mp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mopen_kwargs\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m paths]\n\u001b[1;32m 1055\u001b[0m closers \u001b[38;5;241m=\u001b[39m [getattr_(ds, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_close\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m ds \u001b[38;5;129;01min\u001b[39;00m datasets]\n\u001b[1;32m 1056\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m preprocess \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/sklearn-argo-dev/lib/python3.9/site-packages/xarray/backends/api.py:577\u001b[0m, in \u001b[0;36mopen_dataset\u001b[0;34m(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, chunked_array_type, from_array_kwargs, backend_kwargs, **kwargs)\u001b[0m\n\u001b[1;32m 570\u001b[0m overwrite_encoded_chunks \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moverwrite_encoded_chunks\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 571\u001b[0m backend_ds \u001b[38;5;241m=\u001b[39m backend\u001b[38;5;241m.\u001b[39mopen_dataset(\n\u001b[1;32m 572\u001b[0m filename_or_obj,\n\u001b[1;32m 573\u001b[0m drop_variables\u001b[38;5;241m=\u001b[39mdrop_variables,\n\u001b[1;32m 574\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mdecoders,\n\u001b[1;32m 575\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 576\u001b[0m )\n\u001b[0;32m--> 577\u001b[0m ds \u001b[38;5;241m=\u001b[39m \u001b[43m_dataset_from_backend_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 578\u001b[0m \u001b[43m \u001b[49m\u001b[43mbackend_ds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 579\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename_or_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 580\u001b[0m \u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 581\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 582\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 583\u001b[0m \u001b[43m \u001b[49m\u001b[43moverwrite_encoded_chunks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 584\u001b[0m \u001b[43m \u001b[49m\u001b[43minline_array\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 585\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked_array_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 586\u001b[0m \u001b[43m \u001b[49m\u001b[43mfrom_array_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 587\u001b[0m \u001b[43m \u001b[49m\u001b[43mdrop_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdrop_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 588\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mdecoders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 589\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 590\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ds\n", - "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/sklearn-argo-dev/lib/python3.9/site-packages/xarray/backends/api.py:370\u001b[0m, in \u001b[0;36m_dataset_from_backend_dataset\u001b[0;34m(backend_ds, filename_or_obj, engine, chunks, cache, overwrite_encoded_chunks, inline_array, chunked_array_type, from_array_kwargs, **extra_tokens)\u001b[0m\n\u001b[1;32m 368\u001b[0m ds \u001b[38;5;241m=\u001b[39m backend_ds\n\u001b[1;32m 369\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 370\u001b[0m ds \u001b[38;5;241m=\u001b[39m \u001b[43m_chunk_ds\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 371\u001b[0m \u001b[43m \u001b[49m\u001b[43mbackend_ds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 372\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename_or_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 373\u001b[0m \u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 374\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 375\u001b[0m \u001b[43m \u001b[49m\u001b[43moverwrite_encoded_chunks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 376\u001b[0m \u001b[43m \u001b[49m\u001b[43minline_array\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 377\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked_array_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 378\u001b[0m \u001b[43m \u001b[49m\u001b[43mfrom_array_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 379\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mextra_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 380\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 382\u001b[0m ds\u001b[38;5;241m.\u001b[39mset_close(backend_ds\u001b[38;5;241m.\u001b[39m_close)\n\u001b[1;32m 384\u001b[0m \u001b[38;5;66;03m# Ensure source filename always stored in dataset object\u001b[39;00m\n", - "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/sklearn-argo-dev/lib/python3.9/site-packages/xarray/backends/api.py:318\u001b[0m, in \u001b[0;36m_chunk_ds\u001b[0;34m(backend_ds, filename_or_obj, engine, chunks, overwrite_encoded_chunks, inline_array, chunked_array_type, from_array_kwargs, **extra_tokens)\u001b[0m\n\u001b[1;32m 307\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_chunk_ds\u001b[39m(\n\u001b[1;32m 308\u001b[0m backend_ds,\n\u001b[1;32m 309\u001b[0m filename_or_obj,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 316\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mextra_tokens,\n\u001b[1;32m 317\u001b[0m ):\n\u001b[0;32m--> 318\u001b[0m chunkmanager \u001b[38;5;241m=\u001b[39m \u001b[43mguess_chunkmanager\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunked_array_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 320\u001b[0m \u001b[38;5;66;03m# TODO refactor to move this dask-specific logic inside the DaskManager class\u001b[39;00m\n\u001b[1;32m 321\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(chunkmanager, DaskManager):\n", - "File 
\u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/sklearn-argo-dev/lib/python3.9/site-packages/xarray/namedarray/parallelcompat.py:117\u001b[0m, in \u001b[0;36mguess_chunkmanager\u001b[0;34m(manager)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(manager, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m manager \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m chunkmanagers:\n\u001b[0;32m--> 117\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 118\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munrecognized chunk manager \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmanager\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m - must be one of: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(chunkmanagers)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 119\u001b[0m )\n\u001b[1;32m 121\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m chunkmanagers[manager]\n\u001b[1;32m 122\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(manager, ChunkManagerEntrypoint):\n\u001b[1;32m 123\u001b[0m \u001b[38;5;66;03m# already a valid ChunkManager so just pass through\u001b[39;00m\n", - "\u001b[0;31mValueError\u001b[0m: unrecognized chunk manager dask - must be one of: []" - ] - } - ], - "source": [ - "# xr.open_mfdataset(\"/Users/sangminsong/Library/CloudStorage/OneDrive-UW/Code/2024_Pythia/SOCCOM_GO-BGC_LoResQC_LIAR_28Aug2023_netcdf/*.nc\")" - ] + "outputs": [], + "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Argovis" + "## Querying Data with `Argovis`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Argovis provides an API that allows us to interact with Argo data while only downloading the exact subsets of data needed for analysis. " + "Argovis provides an API that allows us to interact with Argo data while only downloading the exact subsets of data needed for analysis. \n", + "Our examples here are modified from the [tutorial notebooks](https://github.com/argovis/demo_notebooks) released by Argovis. We showcase only a few of the functionalities, but more information can be found in the previous link.\n", + "\n", + "The introduction published by Argovis:\n", + ">\"Argovis is a REST API and web application for searching, downloading, co-locating and visualizing oceanographic data, including Argo array data, ship-based profile data, data from the Global Drifter Program, tropical cyclone data, and several gridded products. Our API is meant to be integrated into living documents like Jupyter notebooks and analyses intended to update their consumption of Argo data in near-real-time, and our web frontend is intended to make it easy for students and educators to explore data about Earth's oceans at will.\"\n", + "\n", + "Argovis should be cited as:\n", + "\n", + "Tucker, T., D. Giglio, M. Scanderbeg, and S.S.P. Shen: Argovis: A Web Application for Fast Delivery, Visualization, and Analysis of Argo Data. J. Atmos. 
Oceanic Technol., 37, 401–416, https://doi.org/10.1175/JTECH-D-19-0041.1\n" ] }, { @@ -705,8 +216,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will be using the Argovis capabilities\n", - "\n", "From the Argovis tutorial: \n", "> In order to allocate Argovis's limited computing resources fairly, users are encouraged to register and request a free API key. This works like a password that identifies your requests to Argovis. To do so:\n", ">\n", @@ -733,7 +242,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We set up our query parameters using a Dictionary before requesting data: " + "### Getting Argo data documents" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before actually getting Argo measurements, we can query information about the profile (including pointers to the metadata)." ] }, { @@ -790,20 +306,9 @@ "argoProfiles[0]" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that the first object in argoProfiles is a single vertical Argo \"profile\". \n", - "The first 7 digits of `argoProfiles[0]['_id']` refer to a float's WMO unique identification number. \n", - "The last three digits are the profile number. \n", - "\n", - "In the above example, we are looking at data from the 256th profile from float WMO #1901820." - ] - }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -812,7 +317,7 @@ "'1901820_256'" ] }, - "execution_count": 13, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -825,12 +330,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can also request metadata using the argo/meta " + "Note that the first object in argoProfiles is a single vertical Argo \"profile\". \n", + "The first 7 digits of `argoProfiles[0]['_id']` refer to a float's WMO unique identification number. \n", + "The last three digits are the profile number. \n", + "\n", + "In the above example, we are looking at data from the 256th profile from float WMO #1901820." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can get more information about this particular float by querying `argo/meta`." ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -849,7 +365,7 @@ " 'wmo_inst_type': '854'}]" ] }, - "execution_count": 15, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -858,14 +374,20 @@ "metaOptions = {\n", " 'id': argoProfiles[0]['metadata'][0]\n", "}\n", - "\n", "argoMeta = avh.query('argo/meta', options=metaOptions, apikey=API_KEY, apiroot=API_ROOT)\n", "argoMeta" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also specify all of the profiles taken from the same float with WMO ID 1901820." + ] + }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -896,18 +418,44 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can query float profiles within these bounds: " + "Now, we want to retrieve actual measurements. We can use any number of identifiers. \n", + "\n", + "Below, we are specifying float WMO 4901283 and profile #003. The `data` variable can be:\n", + "\n", + "- A comma separated list of variable names, e.g. `'temperature, doxy'`\n", + "- `'all'`, meaning get all available variables. 
" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "dataQuery = {\n", - " 'startDate': '2017-08-01T00:00:00Z',\n", - " 'endDate': '2017-09-01T00:00:00Z',\n", + " 'id': '4901283_003',\n", + " 'data': 'all'\n", + "}\n", + "profile = avh.query('argo', options=dataQuery, apikey=API_KEY, apiroot=API_ROOT)\n", + "# avh.data_inflate(profile[0])[0:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can query float profiles within larger bounds: " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "dataQuery = {\n", + " 'startDate': '2020-01-01T00:00:00Z',\n", + " 'endDate': '2024-01-01T00:00:00Z',\n", " 'polygon': [[-150,-30],[-155,-30],[-155,-35],[-150,-35],[-150,-30]],\n", " 'data': 'doxy'\n", "}\n", @@ -945,6 +493,36 @@ "inflated_data[0:10]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Querying within geospatial bounds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "qs = {\n", + " 'startDate': '2017-08-01T00:00:00Z',\n", + " 'endDate': '2017-09-01T00:00:00Z',\n", + " 'box': [[-20,70],[20,72]]\n", + "}\n", + "\n", + "profiles = avh.query('argo', options=qs, apikey=API_KEY, apiroot=API_ROOT)\n", + "latitudes = [x['geolocation']['coordinates'][1] for x in profiles]\n", + "print(min(latitudes))\n", + "print(max(latitudes))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, { "cell_type": "markdown", "metadata": {}, From 6ea201340682f101d53964378b0a3ba89ba44343 Mon Sep 17 00:00:00 2001 From: psmsong Date: Thu, 13 Jun 2024 08:20:35 -0700 Subject: [PATCH 2/3] edit argovis section --- notebooks/argo-access.ipynb | 260 ++++++++++++++++++++++++++++++++---- 1 file changed, 233 insertions(+), 27 deletions(-) diff --git a/notebooks/argo-access.ipynb b/notebooks/argo-access.ipynb index 98ba9be..64029d0 100644 --- a/notebooks/argo-access.ipynb +++ b/notebooks/argo-access.ipynb @@ -35,11 +35,11 @@ "\n", "Building upon previous notebook, [Introduction to Argo](notebooks/argo-introduction.ipynb), we next explore how to access Argo data using various methods.\n", "\n", - "These methods are described fully on their respective websites, linked below. Our goal here is to provide a brief overview of some of the different tools available. \n", + "These methods are described in more detail on their respective websites, linked below. Our goal here is to provide a brief overview of some of the different tools available. \n", "\n", - "1. Introducing data formats for Argo profiles\n", - "2. Using [Argopy](https://argopy.readthedocs.io/en/latest/user-guide/fetching-argo-data/index.html), a dedicated Python package\n", - "3. Using [Argovis](https://argovis.colorado.edu/argo) for API-based queries \n", + "1. [GO-BGC Toolbox](https://github.com/go-bgc/workshop-python) \n", + "2. [Argopy](https://argopy.readthedocs.io/en/latest/user-guide/fetching-argo-data/index.html), a dedicated Python package\n", + "3. 
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can query float profiles within larger bounds: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataQuery = {\n",
    "    'startDate': '2020-01-01T00:00:00Z',\n",
    "    'endDate': '2024-01-01T00:00:00Z',\n",
    "    'polygon': [[-150,-30],[-155,-30],[-155,-35],[-150,-35],[-150,-30]],\n",
    "    'data': 'doxy'\n",
    "}\n",
    "argoProfiles = avh.query('argo', options=dataQuery, apikey=API_KEY, apiroot=API_ROOT)\n",
    "inflated_data = avh.data_inflate(argoProfiles[0])\n",
    "inflated_data[0:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Querying within geospatial bounds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "qs = {\n",
    "    'startDate': '2017-08-01T00:00:00Z',\n",
    "    'endDate': '2017-09-01T00:00:00Z',\n",
    "    'box': [[-20,70],[20,72]]\n",
    "}\n",
    "\n",
    "profiles = avh.query('argo', options=qs, apikey=API_KEY, apiroot=API_ROOT)\n",
    "latitudes = [x['geolocation']['coordinates'][1] for x in profiles]\n",
    "print(min(latitudes))\n",
    "print(max(latitudes))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},

From 6ea201340682f101d53964378b0a3ba89ba44343 Mon Sep 17 00:00:00 2001
From: psmsong
Date: Thu, 13 Jun 2024 08:20:35 -0700
Subject: [PATCH 2/3] edit argovis section

---
 notebooks/argo-access.ipynb | 260 ++++++++++++++++++++++++++++++++----
 1 file changed, 233 insertions(+), 27 deletions(-)

diff --git a/notebooks/argo-access.ipynb b/notebooks/argo-access.ipynb
index 98ba9be..64029d0 100644
--- a/notebooks/argo-access.ipynb
+++ b/notebooks/argo-access.ipynb
@@ -35,11 +35,11 @@
    "\n",
    "Building upon the previous notebook, [Introduction to Argo](notebooks/argo-introduction.ipynb), we next explore how to access Argo data using various methods.\n",
    "\n",
-    "These methods are described fully on their respective websites, linked below. Our goal here is to provide a brief overview of some of the different tools available. \n",
+    "These methods are described in more detail on their respective websites, linked below. Our goal here is to provide a brief overview of some of the different tools available. \n",
    "\n",
-    "1. Introducing data formats for Argo profiles\n",
-    "2. Using [Argopy](https://argopy.readthedocs.io/en/latest/user-guide/fetching-argo-data/index.html), a dedicated Python package\n",
-    "3. Using [Argovis](https://argovis.colorado.edu/argo) for API-based queries \n",
+    "1. [GO-BGC Toolbox](https://github.com/go-bgc/workshop-python) \n",
+    "2. [Argopy](https://argopy.readthedocs.io/en/latest/user-guide/fetching-argo-data/index.html), a dedicated Python package\n",
+    "3. [Argovis](https://argovis.colorado.edu/argo) for API-based queries \n",
    "\n",
    "\n",
    "\n",
@@ -82,7 +82,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -95,6 +95,11 @@
    "import xarray as xr\n",
    "from datetime import datetime, timedelta\n",
    "\n",
+    "import requests\n",
+    "import time\n",
+    "import urllib3\n",
+    "import shutil\n",
+    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.colors as mcolors\n",
    "import seaborn as sns\n",
    "from cmocean import cm as cmo\n",
    "\n",
    "from argovisHelpers import helpers as avh"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Background: Common xarray formats"
+    "## 1. Downloading with the GO-BGC Toolbox\n",
+    "\n",
+    "In the previous notebook, [Introduction to Argo](notebooks/argo-introduction.ipynb), we saw how Argo synthetic profile ('[sprof](https://archimer.ifremer.fr/doc/00445/55637/)') data is stored in netCDF4 format.\n",
+    "\n",
+    "Using the GDAC function allows you to subset and download Sprofs for multiple floats. \n",
+    "We recommend this tool for users who only need a few profiles in a specific area of interest. \n",
+    "Considerations: \n",
+    "- Easy to use and understand\n",
+    "- Downloads float data as individual .nc files to your local machine (takes up storage space)\n",
+    "- Must download all variables available (cannot subset only variables of interest)\n",
+    "\n",
+    "The two major functions below are courtesy of the [GO-BGC Toolbox](https://github.com/go-bgc/workshop-python) (Ethan Campbell). A full tutorial is available in the Toolbox.\n"
   ]
  },
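  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before downloading anything, make sure the target directory exists (a small sketch; the path matches the `profile_dir` set in the next cell):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: create the download directory if it does not already exist\n",
    "os.makedirs('../data/bgc-argo/', exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Base filepath. Need for Argo GDAC function.\n",
    "# root = '/Users/sangminsong/Library/CloudStorage/OneDrive-UW/Code/2024_Pythia/'\n",
    "# profile_dir = root + 'SOCCOM_GO-BGC_LoResQC_LIAR_28Aug2023_netcdf/'\n",
    "\n",
    "# # Base filepath. 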
Need for Argo GDAC function.\n", + "root = '../data/'\n", + "profile_dir = root + 'bgc-argo/'" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Argo profiles are " + "### 1.0 GO-BGC Toolbox Functions" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Function to download a single file (From GO-BGC Toolbox)\n", + "def download_file(url_path,filename,save_to=None,overwrite=False,verbose=True):\n", + " \"\"\" Downloads and saves a file from a given URL using HTTP protocol.\n", + "\n", + " Note: If '404 file not found' error returned, function will return without downloading anything.\n", + " \n", + " Arguments:\n", + " url_path: root URL to download from including trailing slash ('/')\n", + " filename: filename to download including suffix\n", + " save_to: None (to download to root Google Drive GO-BGC directory)\n", + " or directory path\n", + " overwrite: False to leave existing files in place\n", + " or True to overwrite existing files\n", + " verbose: True to announce progress\n", + " or False to stay silent\n", + " \n", + " \"\"\"\n", + " urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)\n", + "\n", + " if save_to is None:\n", + " save_to = root #profile_dir # EDITED HERE\n", + "\n", + " try:\n", + " if filename in os.listdir(save_to):\n", + " if not overwrite:\n", + " if verbose: print('>>> File ' + filename + ' already exists. Leaving current version.')\n", + " return\n", + " else:\n", + " if verbose: print('>>> File ' + filename + ' already exists. Overwriting with new version.')\n", + "\n", + " def get_func(url,stream=True):\n", + " try:\n", + " return requests.get(url,stream=stream,auth=None,verify=False)\n", + " except requests.exceptions.ConnectionError as error_tag:\n", + " print('Error connecting:',error_tag)\n", + " time.sleep(1)\n", + " return get_func(url,stream=stream)\n", + "\n", + " response = get_func(url_path + filename,stream=True)\n", + "\n", + " if response.status_code == 404:\n", + " if verbose: print('>>> File ' + filename + ' returned 404 error during download.')\n", + " return\n", + " with open(save_to + filename,'wb') as out_file:\n", + " shutil.copyfileobj(response.raw,out_file)\n", + " del response\n", + " if verbose: print('>>> Successfully downloaded ' + filename + '.')\n", + "\n", + " except:\n", + " if verbose: print('>>> An error occurred while trying to download ' + filename + '.')" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "# Function to download and parse GDAC synthetic profile index file (GO-BGC Toolbox)\n", + "def argo_gdac(lat_range=None,lon_range=None,start_date=None,end_date=None,sensors=None,floats=None,\n", + " overwrite_index=False,overwrite_profiles=False,skip_download=False,\n", + " download_individual_profs=False,save_to=None,verbose=True):\n", + " \"\"\" Downloads GDAC Sprof index file, then selects float profiles based on criteria.\n", + " Either returns information on profiles and floats (if skip_download=True) or downloads them (if False).\n", + "\n", + " Arguments:\n", + " lat_range: None, to select all latitudes\n", + " or [lower, upper] within -90 to 90 (selection is inclusive)\n", + " lon_range: None, to select all longitudes\n", + " or [lower, upper] within either -180 to 180 or 0 to 360 (selection is inclusive)\n", + " NOTE: 
longitude range is allowed to cross -180/180 or 0/360\n",
    "        start_date: None or datetime object\n",
    "        end_date: None or datetime object\n",
    "        sensors: None, to select profiles with any combination of sensors\n",
    "                 or string or list of strings to specify required sensors\n",
    "                 > note that common options include PRES, TEMP, PSAL, DOXY, CHLA, BBP700,\n",
    "                   PH_IN_SITU_TOTAL, and NITRATE\n",
    "        floats: None, to select any floats matching other criteria\n",
    "                or int or list of ints specifying floats' WMOID numbers\n",
    "        overwrite_index: False to keep existing downloaded GDAC index file, or True to download new index\n",
    "        overwrite_profiles: False to keep existing downloaded profile files, or True to download new files\n",
    "        skip_download: True to skip download and return (list of float WMO IDs, DataFrame subset of index file,\n",
    "                       list of downloaded filenames)\n",
    "                       or False to download those profiles\n",
    "        download_individual_profs: False to download single Sprof file containing all profiles for each float\n",
    "                                   or True to download individual profile files for each float\n",
    "        save_to: None to download to Google Drive \"/GO-BGC Workshop/Profiles\" directory\n",
    "                 or string to specify directory path for profile downloads\n",
    "        verbose: True to announce progress, or False to stay silent\n",
    "\n",
    "    \"\"\"\n",
    "    # Paths\n",
    "    url_root = 'https://www.usgodae.org/ftp/outgoing/argo/'\n",
    "    dac_url_root = url_root + 'dac/'\n",
    "    index_filename = 'argo_synthetic-profile_index.txt'\n",
    "    if save_to is None: save_to = root\n",
    "\n",
    "    # Download GDAC synthetic profile index file\n",
    "    download_file(url_root,index_filename,overwrite=overwrite_index)\n",
    "\n",
    "    # Load index file into Pandas DataFrame\n",
    "    gdac_index = pd.read_csv(root + index_filename,delimiter=',',header=8,parse_dates=['date','date_update'],\n",
    "                             date_parser=lambda x: pd.to_datetime(x,format='%Y%m%d%H%M%S'))\n",
    "\n",
    "    # Establish time and space criteria\n",
    "    if lat_range is None: lat_range = [-90.0,90.0]\n",
    "    if lon_range is None: lon_range = [-180.0,180.0]\n",
    "    elif lon_range[0] > 180 or lon_range[1] > 180:\n",
    "        if lon_range[0] > 180: lon_range[0] -= 360\n",
    "        if lon_range[1] > 180: lon_range[1] -= 360\n",
    "    if start_date is None: start_date = datetime(1900,1,1)\n",
    "    if end_date is None: end_date = datetime(2200,1,1)\n",
    "\n",
    "    float_wmoid_regexp = r'[a-z]*/[0-9]*/profiles/[A-Z]*([0-9]*)_[0-9]*[A-Z]*.nc'\n",
    "    gdac_index['wmoid'] = gdac_index['file'].str.extract(float_wmoid_regexp).astype(int)\n",
    "    filepath_main_regexp = '([a-z]*/[0-9]*/)profiles/[A-Z]*[0-9]*_[0-9]*[A-Z]*.nc'\n",
    "    gdac_index['filepath_main'] = gdac_index['file'].str.extract(filepath_main_regexp)\n",
    "    filepath_regexp = '([a-z]*/[0-9]*/profiles/)[A-Z]*[0-9]*_[0-9]*[A-Z]*.nc'\n",
    "    gdac_index['filepath'] = gdac_index['file'].str.extract(filepath_regexp)\n",
    "    filename_regexp = '[a-z]*/[0-9]*/profiles/([A-Z]*[0-9]*_[0-9]*[A-Z]*.nc)'\n",
    "    gdac_index['filename'] = gdac_index['file'].str.extract(filename_regexp)\n",
    "\n",
    "    # Subset profiles based on time and space criteria\n",
    "    gdac_index_subset = gdac_index.loc[np.logical_and.reduce([gdac_index['latitude'] >= lat_range[0],\n",
    "                                                              gdac_index['latitude'] <= lat_range[1],\n",
    "                                                              gdac_index['date'] >= start_date,\n",
    "                                                              gdac_index['date'] <= end_date]),:]\n",
    "    if lon_range[1] >= lon_range[0]: # range does not cross -180/180 or 0/360\n",
    "        gdac_index_subset = gdac_index_subset.loc[np.logical_and(gdac_index_subset['longitude'] >= lon_range[0],\n",
    "                                                                 
gdac_index_subset['longitude'] <= lon_range[1])]\n",
    "    elif lon_range[1] < lon_range[0]: # range crosses -180/180 or 0/360\n",
    "        gdac_index_subset = gdac_index_subset.loc[np.logical_or(gdac_index_subset['longitude'] >= lon_range[0],\n",
    "                                                                gdac_index_subset['longitude'] <= lon_range[1])]\n",
    "\n",
    "    # If requested, subset profiles using float WMOID criteria\n",
    "    if floats is not None:\n",
    "        if type(floats) is not list: floats = [floats]\n",
    "        gdac_index_subset = gdac_index_subset.loc[gdac_index_subset['wmoid'].isin(floats),:]\n",
    "\n",
    "    # If requested, subset profiles using sensor criteria\n",
    "    if sensors is not None:\n",
    "        if type(sensors) is not list: sensors = [sensors]\n",
    "        for sensor in sensors:\n",
    "            gdac_index_subset = gdac_index_subset.loc[gdac_index_subset['parameters'].str.contains(sensor),:]\n",
    "\n",
    "    # Examine subsetted profiles\n",
    "    wmoids = gdac_index_subset['wmoid'].unique()\n",
    "    wmoid_filepaths = gdac_index_subset['filepath_main'].unique()\n",
    "\n",
    "    # Just return list of floats and DataFrame with subset of index file, or download each profile\n",
    "    if not skip_download:\n",
    "        downloaded_filenames = []\n",
    "        if download_individual_profs:\n",
    "            for p_idx in gdac_index_subset.index:\n",
    "                download_file(dac_url_root + gdac_index_subset.loc[p_idx]['filepath'],\n",
    "                              gdac_index_subset.loc[p_idx]['filename'],\n",
    "                              save_to=save_to,overwrite=overwrite_profiles,verbose=verbose)\n",
    "                downloaded_filenames.append(gdac_index_subset.loc[p_idx]['filename'])\n",
    "        else:\n",
    "            for f_idx, wmoid_filepath in enumerate(wmoid_filepaths):\n",
    "                download_file(dac_url_root + wmoid_filepath,str(wmoids[f_idx]) + '_Sprof.nc',\n",
    "                              save_to=save_to,overwrite=overwrite_profiles,verbose=verbose)\n",
    "                downloaded_filenames.append(str(wmoids[f_idx]) + '_Sprof.nc')\n",
    "        return wmoids, gdac_index_subset, downloaded_filenames\n",
    "    else:\n",
    "        return wmoids, gdac_index_subset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.1 Using the GDAC function to access Argo subsets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# don't download, just get WMO IDs\n",
    "# wmoids, gdac_index = argo_gdac(lat_range=lat_bounds,lon_range=lon_bounds,\n",
    "#                               start_date=start_yd,end_date=end_yd,\n",
    "#                               sensors=None,floats=None,\n",
    "#                               overwrite_index=True,overwrite_profiles=False,\n",
    "#                               skip_download=True,download_individual_profs=False,\n",
    "#                               save_to=profile_dir,verbose=True)\n",
    "\n",
    "# download specific float #5906030 \n",
    "wmoids, gdac_index, downloaded_filenames \\\n",
    "    = argo_gdac(lat_range=None,lon_range=None,\n",
    "                start_date=None,end_date=None,\n",
    "                sensors=None,floats=5906030,\n",
    "                overwrite_index=True,overwrite_profiles=False,\n",
    "                skip_download=False,download_individual_profs=False,\n",
    "                save_to=profile_dir,verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DSdict = {}\n",
    "# for filename in os.listdir(profile_dir):\n",
    "#     if filename.endswith(\".nc\"):\n",
    "#         fp = profile_dir + filename\n",
    "#         single_dataset = xr.open_dataset(fp, decode_times=False)\n",
    "#         DSdict[filename[0:7]] = single_dataset\n",
    "# # DSdict['5906030']"
   ]
  },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "metadata": {},
-  "outputs": [],
-  "source": []
- },
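  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick check, a downloaded Sprof can be opened with xarray. This is a minimal sketch: it assumes the float 5906030 download above succeeded and left `5906030_Sprof.nc` in `profile_dir`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: open the single Sprof file downloaded above (assumes the file exists locally)\n",
    "# ds = xr.open_dataset(profile_dir + '5906030_Sprof.nc', decode_times=False)\n",
    "# ds"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Using the `Argopy` Python Package"
+    "## 2. 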
Using the Argopy Python Package" ] }, { @@ -187,7 +393,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Querying Data with `Argovis`" + "## 3. Querying Data with Argovis" ] }, { From dfbc67b01b7584a6cb8018a29aa1ae11217bb216 Mon Sep 17 00:00:00 2001 From: psmsong Date: Thu, 13 Jun 2024 08:53:24 -0700 Subject: [PATCH 3/3] last changes --- notebooks/argo-access.ipynb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/notebooks/argo-access.ipynb b/notebooks/argo-access.ipynb index 64029d0..9e625e1 100644 --- a/notebooks/argo-access.ipynb +++ b/notebooks/argo-access.ipynb @@ -375,6 +375,13 @@ "# # DSdict['5906030']" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {},