diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 9bb0b782..353e993f 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -46,6 +46,9 @@ jobs: run: poetry install - name: Build docs run: poetry run bash scripts/build-docs.sh + env: + EARTHDATA_USERNAME: ${{ secrets.EDL_USERNAME }} + EARTHDATA_PASSWORD: ${{ secrets.EDL_PASSWORD }} - name: Deploy if: | diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml new file mode 100644 index 00000000..8b6352c6 --- /dev/null +++ b/.github/workflows/integration-test.yml @@ -0,0 +1,55 @@ +name: Integration Tests + +on: + push: + branches: + - main + paths: + - earthaccess/** + - tests/** + - docs/** + - binder/** + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.8, 3.9, '3.10', '3.11'] + fail-fast: false + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Get full python version + id: full-python-version + run: echo ::set-output name=version::$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") + - name: Install poetry + run: | + curl -sSL https://install.python-poetry.org | python3 - + echo "$HOME/.poetry/bin" >> $GITHUB_PATH + - name: Configure poetry + run: poetry config virtualenvs.in-project true + - name: Set up cache + uses: actions/cache@v1 + id: cache + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.full-python-version.outputs.version }}-${{ hashFiles('**/poetry.lock') }} + - name: Ensure cache is healthy + if: steps.cache.outputs.cache-hit == 'true' + run: poetry run pip --version >/dev/null 2>&1 || rm -rf .venv + - name: Install Dependencies + run: poetry install + - name: Test + env: + EARTHDATA_USERNAME: ${{ secrets.EDL_USERNAME }} + EARTHDATA_PASSWORD: ${{ secrets.EDL_PASSWORD }} + EARTHACCESS_TEST_USERNAME: ${{ 
secrets.EDL_USERNAME }} + EARTHACCESS_TEST_PASSWORD: ${{ secrets.EDL_PASSWORD }} + run: poetry run bash scripts/integration-test.sh + - name: Upload coverage + uses: codecov/codecov-action@v1 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e1fafed5..67059457 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,4 +1,4 @@ -name: Test +name: Unit Tests on: push: @@ -46,11 +46,6 @@ jobs: - name: Install Dependencies run: poetry install - name: Test - env: - EARTHDATA_USERNAME: ${{ secrets.EDL_USERNAME }} - EARTHDATA_PASSWORD: ${{ secrets.EDL_PASSWORD }} - EARTHACCESS_TEST_USERNAME: ${{ secrets.EDL_USERNAME }} - EARTHACCESS_TEST_PASSWORD: ${{ secrets.EDL_PASSWORD }} run: poetry run bash scripts/test.sh - name: Upload coverage uses: codecov/codecov-action@v1 diff --git a/CHANGELOG.md b/CHANGELOG.md index ee42c5d4..ac2b2aea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## Unreleased +* bug fixes: + * granule's size() returned zero + * Added exception handling for fsspec sessions, thanks to @jrbourbeau +* CI changes: + * integration tests are now only run when we push to main (after a merge) + * unit tests run for any branch and opened PR + ## [v0.5.2] 2023-04-21 * bug fixes: * Fixing #230 by removing Benedict as the dict handler, thanks to @psarka! diff --git a/README.md b/README.md index 0384ee48..ad46417f 100644 --- a/README.md +++ b/README.md @@ -12,12 +12,16 @@ Package version + + Conda Versions + + Python Versions - Documentation Status + Documentation Status

@@ -94,12 +98,11 @@ If we are not sure or we don't know how to search for a particular dataset, we c ```python results = earthaccess.search_data( - short_name='ATL06', - version="005", + short_name='SEA_SURFACE_HEIGHT_ALT_GRIDS_L4_2SATS_5DAY_6THDEG_V_JPL2205', cloud_hosted=True, bounding_box=(-10, 20, 10, 50), - temporal=("2020-02", "2020-03"), - count=100 + temporal=("1999-02", "2019-03"), + count=10 ) @@ -140,7 +143,9 @@ This method works best if you are in the same Amazon Web Services (AWS) region a ```python import xarray as xr -ds = xr.open_mfdataset(earthaccess.open(results)) +files = earthaccess.open(results) + +ds = xr.open_mfdataset(files) ``` diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md new file mode 120000 index 00000000..44fcc634 --- /dev/null +++ b/docs/CONTRIBUTING.md @@ -0,0 +1 @@ +../CONTRIBUTING.md \ No newline at end of file diff --git a/docs/tutorials/authenticate.md b/docs/tutorials/authenticate.md index b280f457..e685941d 100644 --- a/docs/tutorials/authenticate.md +++ b/docs/tutorials/authenticate.md @@ -1,7 +1,13 @@ ## Authenticate with Earthdata Login +earthaccess can use environment variables, `.netrc` file or interactive input from a user to login with NASA EDL. + +If a strategy is not specified, env vars will be used first, then netrc and finally user's input. 
+ ```py import earthaccess + +auth = earthaccess.login() ``` If you have a .netrc file with your Earthdata Login credentials diff --git a/docs/tutorials/edl.ipynb b/docs/tutorials/file-access.ipynb similarity index 99% rename from docs/tutorials/edl.ipynb rename to docs/tutorials/file-access.ipynb index a1946000..6a4fe498 100644 --- a/docs/tutorials/edl.ipynb +++ b/docs/tutorials/file-access.ipynb @@ -106,7 +106,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.14" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/docs/tutorials/demo.ipynb b/docs/tutorials/queries.ipynb similarity index 81% rename from docs/tutorials/demo.ipynb rename to docs/tutorials/queries.ipynb index 71a7e6ac..766a270e 100644 --- a/docs/tutorials/demo.ipynb +++ b/docs/tutorials/queries.ipynb @@ -8,11 +8,7 @@ }, "source": [ "\n", - "## Overview\n", - "\n", - "\n", - "# Introducing NASA earthaccess 🌍\n", - "\n", + "# Querying CMR using earthaccess\n", "\n", "\n", "#### TL;DR: [**earthaccess**](https://github.com/nsidc/earthaccess) is a Python package to search, preview and access NASA datasets (on-prem or in the cloud) with a few lines of code.\n", @@ -38,20 +34,91 @@ "\n", "Earthdata Login provides free and immediate access to thousands of EOSDIS data products covering all Earth science disciplines and topic areas for researchers, applied science users, application developers, and the general public.\n", "\n", - "Once we have our NASA EDL login credentials we can start accessing NASA data in a programmatic way.\n" + "Once we have our NASA EDL login credentials we can start accessing NASA data in a programmatic way.\n", + "\n", + "\n", + "## Querying CMR using earthaccess\n", + "\n", + "This short tutorial uses the `collection_query()` and `granule_query()` methods, these methods return a lower level Query Builder instance that can be used to query NASA's CMR.\n", + "For convenience the top level API also offers the 
`search_datasets(**kwargs)` and `search_data(**kwargs)` methods that map what these query builders do. \n", + "\n", + "For instance \n", + "\n", + "```python\n", + "query = earthaccess.granule_query().doi(\"some_doi\").temporal(\"1990-01-01\", \"2020-12-31\").cloud_hosted(True)\n", + "granules = query.get(10)\n", + "\n", + "```\n", + "\n", + "is equivalent to\n", + "\n", + "```python\n", + "granules = earthaccess.search_data(\n", + "    doi=\"some_doi\",\n", + "    temporal = (\"1990-01-01\",\"2020-12-31\"),\n", + "    cloud_hosted=True,\n", + "    count=10\n", + ")\n", + "```" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "10f6c9ed-fe58-4e03-b29b-c6c447061f84", - "metadata": {}, - "outputs": [], + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'0.5.3'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import earthaccess\n", "earthaccess.__version__" ] }, + { + "cell_type": "code", + "execution_count": 2, + "id": "496c1e3e-5b1a-44f8-ae13-84c42ea814af", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EARTHDATA_USERNAME and EARTHDATA_PASSWORD are not set in the current environment, try setting them or use a different strategy (netrc, interactive)\n", + "You're now authenticated with NASA Earthdata Login\n", + "Using token with expiration date: 09/24/2023\n", + "Using .netrc file for EDL\n" + ] + } + ], + "source": [ + "auth = earthaccess.login()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39ba3dfb-a2b3-459a-ba51-dd6446c20872", + "metadata": {}, + "outputs": [], + "source": [ + "token yarn" + ] + }, { "cell_type": "markdown", "id": "95121ff7-5222-4778-a4de-25625e23884b", @@ -85,12 +152,14 @@ "cell_type": "code", "execution_count": null, "id": "caab3b4b-80cc-4790-9417-1dd12503aa55", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], 
"source": [ "# are we authenticated?\n", "\n", - "auth = earthaccess.login(strategy=\"netrc\")\n" + "auth = earthaccess.login()\n" ] }, { @@ -107,7 +176,9 @@ "cell_type": "code", "execution_count": null, "id": "8d5bf4c9-571b-4c93-af94-e66bd51cb584", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# The first step is to create a DataCollections query \n", @@ -139,7 +210,9 @@ "cell_type": "code", "execution_count": null, "id": "8cb5154c-f131-44ad-a68f-cf0fa21ce18f", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "collections[0][\"umm\"][\"ShortName\"]" @@ -171,11 +244,13 @@ "cell_type": "code", "execution_count": null, "id": "48cdcd74-dfe3-4b83-93f4-7378a0d981df", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# We can now search for collections using a pythonic API client for CMR.\n", - "Query = earthaccess.collection_query().daac(\"PODAAC\")\n", + "Query = earthaccess.collection_query().daac(\"ASF\")\n", "\n", "print(f'Collections found: {Query.hits()}')\n", "collections = Query.fields(['ShortName']).get(10)\n", @@ -187,11 +262,13 @@ "cell_type": "code", "execution_count": null, "id": "63792353-ab3e-4f0b-963d-7750e4b89113", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# What if we want cloud collections\n", - "Query = earthaccess.collection_query().daac(\"PODAAC\").cloud_hosted(True)\n", + "Query = earthaccess.collection_query().daac(\"ASF\").cloud_hosted(True)\n", "\n", "print(f'Collections found: {Query.hits()}')\n", "collections = Query.fields(['ShortName']).get(10)\n", @@ -203,7 +280,9 @@ "cell_type": "code", "execution_count": null, "id": "c4c5a34a-e808-4cc9-b34d-353d091a8242", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Printing the concept-id for the first 10 collections\n", @@ -230,17 +309,18 @@ "cell_type": "code", "execution_count": null, "id": "9364d737-5a79-4089-853f-76d2ad1c85a7", - 
"metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from pprint import pprint\n", "\n", "# We build our query\n", "\n", - "Query = earthaccess.granule_query().short_name('ATL06').version(\"005\").bounding_box(-134.7,58.9,-133.9,59.2)\n", + "Query = earthaccess.granule_query().short_name('HLSL30').bounding_box(-134.7,58.9,-133.9,59.2)\n", "# We get 5 metadata records\n", - "granules = Query.get(5)\n", - "granules" + "granules = Query.get(5)" ] }, { @@ -257,12 +337,13 @@ { "cell_type": "code", "execution_count": null, - "id": "66cd5f5c-a854-4a72-a831-33b8bd7ce9d2", - "metadata": {}, + "id": "0b56b119-ec9b-4922-911a-f37501597451", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "# printing 2 granules using display\n", - "[display(granule) for granule in granules]" + "[display(g) for g in granules]" ] }, { @@ -280,7 +361,9 @@ "cell_type": "code", "execution_count": null, "id": "00aa39ec-e2fb-49d1-bc54-8d8a2f0655aa", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "Query = earthaccess.granule_query().short_name(\"ATL06\").temporal(\"2020-03-01\", \"2020-03-30\").bounding_box(-134.7,58.9,-133.9,59.2).version(\"005\")\n", @@ -292,7 +375,9 @@ "cell_type": "code", "execution_count": null, "id": "8c493585-0d48-41bb-8815-6c83ad20ae80", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Now we can print some info about these granules using the built-in methods\n", @@ -313,6 +398,27 @@ "## On-prem access: DAAC hosted data 📡\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7b80520-5cae-45c5-9397-f990a1ba0f26", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "granules = []\n", + "\n", + "# we just grab 1 granule from May for each year of the dataset\n", + "for year in range(1999, 2019):\n", + " results = earthaccess.search_data(\n", + " doi = \"10.5067/SLREF-CDRV3\",\n", + " temporal=(f\"{year}-05\", f\"{year}-06\")\n", + " )\n", + " if 
len(results)>0:\n", + " granules.append(results[0])" + ] + }, { "cell_type": "markdown", "id": "4239e041-db87-40d1-b81a-12c26e9e0a47", @@ -325,15 +431,18 @@ "cell_type": "code", "execution_count": null, "id": "910e4b90-f0e0-42e5-a4e2-d5444089161f", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import earthaccess\n", "\n", "earthaccess.login()\n", "\n", - "Query = earthaccess.granule_query().short_name(\"SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5\").bounding_box(-134.7,54.9,-100.9,69.2)\n", + "Query = earthaccess.granule_query().doi(\"10.5067/SLREF-CDRV3\").bounding_box(-134.7,54.9,-100.9,69.2)\n", "print(f\"Granule hits: {Query.hits()}\")\n", + "\n", "# getting more than 6,000 metadata records for demo purposes is going to slow us down a bit so let's get only a few\n", "granules = Query.get(10)\n", "# Does this granule belong to a cloud-based collection?\n", @@ -354,11 +463,13 @@ "cell_type": "code", "execution_count": null, "id": "434466a3-602b-4dff-a260-f7db6901514a", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "%%time\n", - "files = earthaccess.download(granules[0:4], \"./data/C1972955240-PODAAC/\")" + "files = earthaccess.download(granules[0:2], \"./data/C1972955240-PODAAC/\")" ] }, { @@ -381,7 +492,9 @@ "cell_type": "code", "execution_count": null, "id": "44403d51-0aa3-423c-8fff-e40d4969aa9d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "\n", @@ -396,7 +509,9 @@ "cell_type": "code", "execution_count": null, "id": "5e59ca3e-b5d5-490f-b967-01d1c7b3fdf0", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Let's pretty print this\n", @@ -407,7 +522,9 @@ "cell_type": "code", "execution_count": null, "id": "b2a294f1-b1f9-4cd4-8751-dfc32feacec1", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "%%time\n", @@ -436,7 +553,9 @@ "cell_type": "code", "execution_count": null, "id": 
"aecdb529-5961-4fa6-b7e0-70bbd0d85041", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import warnings\n", @@ -451,8 +570,7 @@ "\n", "for granule in results:\n", " https_links.extend(granule.data_links(access=\"on_prem\"))\n", - " s3_links.extend(granule.data_links(access=\"direct\"))\n", - "s3_links" + " s3_links.extend(granule.data_links(access=\"direct\"))" ] }, { @@ -466,19 +584,17 @@ { "cell_type": "code", "execution_count": null, - "id": "e693af6a-a80e-4ca2-a034-8da194c18aaf", - "metadata": {}, + "id": "50e6f01e-86f0-4e29-869b-d6d437c8b130", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "%%time\n", + "fileset = earthaccess.open(results[0:3])\n", "\n", - "ds_L3 = xr.open_mfdataset(\n", - " earthaccess.open(results[0:3]),\n", - " combine='nested',\n", - " concat_dim='time',\n", - " coords='minimal',\n", - " )\n", - "ds_L3" + "# test that we can read data from the files\n", + "with fileset[0] as f:\n", + " print(f.read(100))" ] }, { @@ -548,7 +664,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.14" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/docs/tutorials/restricted-datasets.ipynb b/docs/tutorials/restricted-datasets.ipynb index 942e849b..437193be 100644 --- a/docs/tutorials/restricted-datasets.ipynb +++ b/docs/tutorials/restricted-datasets.ipynb @@ -309,7 +309,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/earthaccess/api.py b/earthaccess/api.py index e17b8229..ccb62200 100644 --- a/earthaccess/api.py +++ b/earthaccess/api.py @@ -1,10 +1,11 @@ from typing import Any, Dict, List, Optional, Type, Union -import earthaccess import requests import s3fs from fsspec import AbstractFileSystem +import earthaccess + from .auth import Auth from .search import CollectionQuery, DataCollections, DataGranules, GranuleQuery from .store import Store 
@@ -168,8 +169,7 @@ def download( except AttributeError as err: print(err) print("You must call earthaccess.login() before you can download data") - return None - + return [] return results diff --git a/earthaccess/formatters.py b/earthaccess/formatters.py index fbbaba86..d37d4e22 100644 --- a/earthaccess/formatters.py +++ b/earthaccess/formatters.py @@ -23,7 +23,7 @@ def _repr_granule_html(granule: Any) -> str: css_inline = f"""""" - style = "max-height: 140px;" + style = "max-height: 120px;" dataviz_img = "".join( [ f'Data Preview' @@ -47,7 +47,7 @@ def _repr_granule_html(granule: Any) -> str:

Data: {data_links}

Size: {granule_size} MB

-

Spatial: {granule["umm"]["SpatialExtent"]}

+

Cloud Hosted: {granule.cloud_hosted}

{dataviz_img} diff --git a/earthaccess/results.py b/earthaccess/results.py index 4115fc45..92d0b1a3 100644 --- a/earthaccess/results.py +++ b/earthaccess/results.py @@ -247,7 +247,7 @@ def size(self) -> float: Returns the total size for the granule in MB """ try: - data_granule = self["mmm"]["DataGranule"] + data_granule = self["umm"]["DataGranule"] total_size = sum( [ float(s["Size"]) @@ -257,7 +257,7 @@ ) except Exception: try: - data_granule = self["mmm"]["DataGranule"] + data_granule = self["umm"]["DataGranule"] total_size = sum( [ float(s["SizeInBytes"]) diff --git a/earthaccess/search.py b/earthaccess/search.py index b3b86fca..0fd8e60b 100644 --- a/earthaccess/search.py +++ b/earthaccess/search.py @@ -412,6 +412,24 @@ def cloud_hosted(self, cloud_hosted: bool = True) -> Type[CollectionQuery]: self.params["provider"] = provider return self + def granule_name(self, granule_name: str) -> Type[CollectionQuery]: + """Find granules matching either granule ur or producer granule id, + queries using the readable_granule_name metadata field. + + ???+ Tip + We can use wildcards on a granule name to further refine our search + i.e. MODGRNLD.*.daily.* + + Parameters: + granule_name (String): granule name (accepts wildcards) + """ + if not isinstance(granule_name, str): + raise TypeError("granule_name must be of type string") + + self.params["readable_granule_name"] = granule_name + self.params["options[readable_granule_name][pattern]"] = True + return self + def online_only(self, online_only: bool = True) -> Type[GranuleQuery]: """Only match granules that are listed online and not available for download. The opposite of this method is downloadable(). 
@@ -663,3 +681,23 @@ def downloadable(self, downloadable: bool = True) -> Type[GranuleQuery]: """ super().downloadable(downloadable) return self + + def doi(self, doi: str) -> Type[GranuleQuery]: + """Search data granules by DOI + + ???+ Tip + Not all datasets have an associated DOI, internally if a DOI is found + earthaccess will grab the concept_id for the query to CMR. + + Parameters: + doi (String): DOI of a dataset, e.g. 10.5067/AQR50-3Q7CS + """ + collection = DataCollections().doi(doi).get() + if len(collection) > 0: + concept_id = collection[0].concept_id() + self.params["concept_id"] = concept_id + else: + print( + f"earthaccess couldn't find any associated collections with the DOI: {doi}" + ) + return self diff --git a/earthaccess/store.py b/earthaccess/store.py index 591aee21..b08d37a6 100644 --- a/earthaccess/store.py +++ b/earthaccess/store.py @@ -3,38 +3,70 @@ import shutil import traceback from copy import deepcopy +from functools import lru_cache from itertools import chain from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union from uuid import uuid4 -from functools import lru_cache -import earthaccess import fsspec import requests import s3fs from multimethod import multimethod as singledispatchmethod from pqdm.threads import pqdm +import earthaccess + from .daac import DAAC_TEST_URLS, find_provider from .results import DataGranule from .search import DataCollections -def _open_files(files, granules, fs): - def multi_thread_open(data) -> Any: - url, granule = data - return EarthAccessFile(fs.open(url), granule) +class EarthAccessFile(fsspec.spec.AbstractBufferedFile): + def __init__(self, f: fsspec.AbstractFileSystem, granule: DataGranule) -> None: + self.f = f + self.granule = granule + + def __getattr__(self, method: str) -> Any: + return getattr(self.f, method) + + def __reduce__(self) -> Any: + return make_instance, ( + type(self.f), + self.granule, + 
self.f.__reduce__(), + ) + + def __repr__(self) -> str: + return str(self.f) - fileset = pqdm(zip(files, granules), multi_thread_open, n_jobs=8) + +def _open_files( + data_links: List[str], + granules: Union[List[str], List[DataGranule]], + fs: fsspec.AbstractFileSystem, + threads: Optional[int] = 8, +) -> List[fsspec.AbstractFileSystem]: + def multi_thread_open(data: tuple) -> EarthAccessFile: + urls, granule = data + if type(granule) is not str: + if len(granule.data_links()) > 1: + print( + "Warning: This collection contains more than one file per granule. " + "earthaccess will only open the first data link, " + "try filtering the links before opening them." + ) + return EarthAccessFile(fs.open(urls), granule) + + fileset = pqdm(zip(data_links, granules), multi_thread_open, n_jobs=threads) return fileset -def make_instance(cls, granule, _reduce): +def make_instance(cls: Any, granule: DataGranule, _reduce: Any) -> EarthAccessFile: if earthaccess.__store__.running_in_aws and cls is not s3fs.S3File: # On AWS but not using a S3File. Reopen the file in this case for direct S3 access. # NOTE: This uses the first data_link listed in the granule. That's not - # guaranteed to be the right one. + # guaranteed to be the right one. return EarthAccessFile(earthaccess.open([granule])[0], granule) else: func = _reduce[0] @@ -42,22 +74,6 @@ def make_instance(cls, granule, _reduce): return func(*args) -class EarthAccessFile(fsspec.spec.AbstractBufferedFile): - def __init__(self, f, granule): - self.f = f - self.granule = granule - - def __getattr__(self, method): - return getattr(self.f, method) - - def __reduce__(self): - return make_instance, ( - type(self.f), - self.granule, - self.f.__reduce__(), - ) - - class Store(object): """ Store class to access granules on-prem or in the cloud. 
@@ -209,7 +225,9 @@ def get_fsspec_session(self) -> fsspec.AbstractFileSystem: token = self.auth.token["access_token"] client_kwargs = { "headers": {"Authorization": f"Bearer {token}"}, - "trust_env": True, + # This is important! if we trust the env end send a bearer token + # auth will fail! + "trust_env": False, } session = fsspec.filesystem("https", client_kwargs=client_kwargs) return session @@ -265,6 +283,7 @@ def _open_granules( self, granules: List[DataGranule], provider: Optional[str] = None, + threads: Optional[int] = 8, ) -> Union[List[Any], None]: fileset: List = [] data_links: List = [] @@ -294,7 +313,12 @@ def _open_granules( if s3_fs is not None: try: - fileset = _open_files(data_links, granules, s3_fs) + fileset = _open_files( + data_links=data_links, + granules=granules, + fs=s3_fs, + threads=threads, + ) except Exception: print( "An exception occurred while trying to access remote files on S3: " @@ -303,7 +327,7 @@ def _open_granules( ) return None else: - fileset = self._open_urls_https(data_links, granules, n_jobs=8) + fileset = self._open_urls_https(data_links, granules, threads=threads) return fileset else: access_method = "on_prem" @@ -312,7 +336,7 @@ def _open_granules( granule.data_links(access=access_method) for granule in granules ) ) - fileset = self._open_urls_https(data_links, granules, n_jobs=8) + fileset = self._open_urls_https(data_links, granules, threads=threads) return fileset @_open.register @@ -320,6 +344,7 @@ def _open_urls( self, granules: List[str], provider: Optional[str] = None, + threads: Optional[int] = 8, ) -> Union[List[Any], None]: fileset: List = [] data_links: List = [] @@ -346,7 +371,12 @@ def _open_urls( s3_fs = self.get_s3fs_session(provider=provider) if s3_fs is not None: try: - fileset = _open_files(data_links, granules, s3_fs) + fileset = _open_files( + data_links=data_links, + granules=granules, + fs=s3_fs, + threads=threads, + ) except Exception: print( "An exception occurred while trying to access remote 
files on S3: " @@ -570,12 +600,15 @@ def _download_onprem_granules( return results def _open_urls_https( - self, urls: List[str] = [], granules=[], n_jobs: int = 8 + self, + urls: List[str], + granules: Union[List[str], List[DataGranule]], + threads: Optional[int] = 8, ) -> List[fsspec.AbstractFileSystem]: https_fs = self.get_fsspec_session() if https_fs is not None: try: - fileset = _open_files(urls, granules, https_fs) + fileset = _open_files(urls, granules, https_fs, threads) except Exception: print( "An exception occurred while trying to access remote files via HTTPS: " diff --git a/mkdocs.yml b/mkdocs.yml index a2fa5262..5fdbc4f2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -45,7 +45,8 @@ plugins: nav: - OVERVIEW: 'index.md' - TUTORIALS: - - 'Introducing NASA earthaccess': 'tutorials/demo.ipynb' + - 'Querying CMR using earthaccess': 'tutorials/queries.ipynb' + - 'Accesing files with fsspec': 'tutorials/file-access.ipynb' - 'Search and access restricted datasets': 'tutorials/restricted-datasets.ipynb' - HOW-TO: - 'Authenticate with Earthdata Login': 'tutorials/authenticate.md' diff --git a/pyproject.toml b/pyproject.toml index 82cc80fa..8949d1dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "earthaccess" -version = "0.5.2" +version = "0.5.3" homepage = "https://github.com/nsidc/earthaccess" description = "Client library for NASA Earthdata APIs" authors = ["earthaccess contributors"] diff --git a/scripts/docs-live.sh b/scripts/docs-live.sh index 9d36ebf6..286f2a7c 100755 --- a/scripts/docs-live.sh +++ b/scripts/docs-live.sh @@ -4,4 +4,4 @@ set -e set -x -mkdocs serve --dev-addr 0.0.0.0:8008 +mkdocs serve --dev-addr 0.0.0.0:8008 --dirtyreload diff --git a/scripts/integration-test.sh b/scripts/integration-test.sh new file mode 100755 index 00000000..2d55064f --- /dev/null +++ b/scripts/integration-test.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +set -e +set -x + +pytest --cov=earthaccess --cov=tests/integration 
--cov-report=term-missing ${@} -s --tb=native --log-cli-level=INFO +bash ./scripts/lint.sh diff --git a/scripts/test.sh b/scripts/test.sh index 42968ec5..1e7129d9 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -3,5 +3,5 @@ set -e set -x -pytest --cov=earthaccess --cov=tests --cov-report=term-missing ${@} -s --tb=native --log-cli-level=INFO +pytest tests/unit --cov=earthaccess --cov=tests --cov-report=term-missing ${@} -s --tb=native --log-cli-level=INFO bash ./scripts/lint.sh diff --git a/tests/integration/test_onprem_download.py b/tests/integration/test_onprem_download.py index 919df254..d54404c8 100644 --- a/tests/integration/test_onprem_download.py +++ b/tests/integration/test_onprem_download.py @@ -30,14 +30,6 @@ "granules_sample_size": 2, "granules_max_size_mb": 130, }, - { - "short_name": "PODAAC", - "collections_count": 100, - "collections_sample_size": 2, - "granules_count": 100, - "granules_sample_size": 2, - "granules_max_size_mb": 100, - }, { "short_name": "LPDAAC", "collections_count": 100, diff --git a/tests/integration/test_onprem_open.py b/tests/integration/test_onprem_open.py index 8ebca883..49093811 100644 --- a/tests/integration/test_onprem_open.py +++ b/tests/integration/test_onprem_open.py @@ -37,14 +37,6 @@ "granules_sample_size": 2, "granules_max_size_mb": 100, }, - { - "short_name": "LPDAAC", - "collections_count": 100, - "collections_sample_size": 2, - "granules_count": 100, - "granules_sample_size": 2, - "granules_max_size_mb": 100, - }, { "short_name": "ORNLDAAC", "collections_count": 100,