From b9fa26cca8dd5ba0e459acd2b747abc8a57413f3 Mon Sep 17 00:00:00 2001 From: betolink Date: Fri, 25 Aug 2023 12:10:32 -0500 Subject: [PATCH 1/8] testing getting the credentials from the metadata, works for collections with different ad-hoc providers --- earthaccess/auth.py | 13 +++++++++++-- earthaccess/store.py | 39 ++++++++++++++++++++++++++++++++------- tests/unit/test_store.py | 18 ++++++++++++++++++ 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/earthaccess/auth.py b/earthaccess/auth.py index cf0cd971..80fbcec1 100644 --- a/earthaccess/auth.py +++ b/earthaccess/auth.py @@ -138,7 +138,10 @@ def refresh_tokens(self) -> bool: return False def get_s3_credentials( - self, daac: Optional[str] = "", provider: Optional[str] = "" + self, + daac: Optional[str] = None, + provider: Optional[str] = None, + endpoint: Optional[str] = None, ) -> Dict[str, str]: """Gets AWS S3 credentials for a given NASA cloud provider, the easier way is to use the DAAC short name. provider is optional if we know it. @@ -146,6 +149,7 @@ def get_s3_credentials( Parameters: provider: A valid cloud provider, each DAAC has a provider code for their cloud distributions daac: the name of a NASA DAAC, i.e. NSIDC or PODAAC + endpoint: getting the credentials directly from the S3Credentials URL Rreturns: A Python dictionary with the temporary AWS S3 credentials @@ -153,7 +157,12 @@ def get_s3_credentials( """ if self.authenticated: session = SessionWithHeaderRedirection(self.username, self.password) - auth_url = self._get_cloud_auth_url(daac_shortname=daac, provider=provider) + if endpoint is None: + auth_url = self._get_cloud_auth_url( + daac_shortname=daac, provider=provider + ) + else: + auth_url = endpoint if auth_url.startswith("https://"): cumulus_resp = session.get(auth_url, timeout=15, allow_redirects=True) auth_resp = session.get( diff --git a/earthaccess/store.py b/earthaccess/store.py index b08d37a6..dbc16bc4 100644 --- a/earthaccess/store.py +++ b/earthaccess/store.py @@ -6,7 +6,7 @@ from functools import lru_cache from itertools import chain from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union from uuid import uuid4 import fsspec @@ -119,6 +119,12 @@ def _is_cloud_collection(self, concept_id: List[str]) -> bool: return True return False + def _own_s3_credentials(self, links: List[Dict[str, Any]]) -> Union[str, None]: + for link in links: + if link["URL"].contains("/s3credentials"): + return link["URL"] + return None + def _am_i_in_aws(self) -> bool: session = self.auth.get_session() try: @@ -172,22 +178,27 @@ def get_s3fs_session( daac: Optional[str] = None, concept_id: Optional[str] = None, provider: Optional[str] = None, + endpoint: Optional[str] = None, ) -> s3fs.S3FileSystem: """ Returns a s3fs instance for a given cloud provider / DAAC Parameters: daac: any of the DAACs e.g. NSIDC, PODAAC + provider: a data provider if we know them, e.g PODAAC -> POCLOUD + endpoint: pass the URL for the credentials directly Returns: a s3fs file instance """ if self.auth is not None: - if not any([concept_id, daac, provider]): + if not any([concept_id, daac, provider, endpoint]): raise ValueError( - "At least one of the concept_id, daac, or provider " + "At least one of the concept_id, daac, provider or endpoint" "parameters must be specified. " ) - if concept_id is not None: + if endpoint is not None: + s3_credentials = self.auth.get_s3_credentials(endpoint=endpoint) + elif concept_id is not None: provider = self._derive_concept_provider(concept_id) s3_credentials = self.auth.get_s3_credentials(provider=provider) elif daac is not None: @@ -300,7 +311,14 @@ def _open_granules( if granules[0].cloud_hosted: access_method = "direct" provider = granules[0]["meta"]["provider-id"] - s3_fs = self.get_s3fs_session(provider=provider) + # if the data has its own S3 credentials endpoint we'll use it + endpoint = self._own_s3_credentials(granules[0]["umm"]["RelatedUrls"]) + if endpoint is not None: + print(f"using endpoint: {endpoint}") + s3_fs = self.get_s3fs_session(endpoint=endpoint) + else: + print(f"using provider: {provider}") + s3_fs = self.get_s3fs_session(provider=provider) else: access_method = "on_prem" s3_fs = None @@ -503,6 +521,7 @@ def _get_granules( data_links: List = [] downloaded_files: List = [] provider = granules[0]["meta"]["provider-id"] + endpoint = self._own_s3_credentials(granules[0]["umm"]["RelatedUrls"]) cloud_hosted = granules[0].cloud_hosted access = "direc" if (cloud_hosted and self.running_in_aws) else "external" data_links = list( @@ -517,8 +536,14 @@ def _get_granules( f" Getting {len(granules)} granules, approx download size: {total_size} GB" ) if access == "direct": - print(f"Accessing cloud dataset using provider: {provider}") - s3_fs = self.get_s3fs_session(provider) + if endpoint is not None: + print( + f"Accessing cloud dataset using dataset endpoint credentials: {endpoint}" + ) + s3_fs = self.get_s3fs_session(endpoint=endpoint) + else: + print(f"Accessing cloud dataset using provider: {provider}") + s3_fs = self.get_s3fs_session(provider=provider) # TODO: make this async for file in data_links: s3_fs.get(file, local_path) diff --git a/tests/unit/test_store.py b/tests/unit/test_store.py index 1823f9d4..388cb5af 100644 --- a/tests/unit/test_store.py +++ b/tests/unit/test_store.py @@ -55,6 +55,20 @@ def test_store_can_create_https_fsspec_session(self): def test_store_can_create_s3_fsspec_session(self): from earthaccess.daac import DAACS + custom_endpoints = [ + "https://archive.swot.podaac.earthdata.nasa.gov/s3credentials", + "https://api.giovanni.earthdata.nasa.gov/s3credentials", + "https://data.laadsdaac.earthdatacloud.nasa.gov/s3credentials", + ] + + for endpoint in custom_endpoints: + responses.add( + responses.GET, + endpoint, + json={}, + status=200, + ) + for daac in DAACS: if "s3-credentials" in daac: responses.add( @@ -80,6 +94,10 @@ def test_store_can_create_s3_fsspec_session(self): s3_fs = store.get_s3fs_session(daac=daac) self.assertEqual(type(s3_fs), type(fsspec.filesystem("s3"))) + for endpoint in custom_endpoints: + s3_fs = store.get_s3fs_session(endpoint=endpoint) + self.assertEqual(type(s3_fs), type(fsspec.filesystem("s3"))) + for provider in [ "NSIDC_CPRD", "POCLOUD", From 80ea1d8e76806467acd69145ee8fcb722e9e5a28 Mon Sep 17 00:00:00 2001 From: betolink Date: Fri, 25 Aug 2023 12:45:51 -0500 Subject: [PATCH 2/8] fix typo --- earthaccess/store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earthaccess/store.py b/earthaccess/store.py index dbc16bc4..2fc8406e 100644 --- a/earthaccess/store.py +++ b/earthaccess/store.py @@ -121,7 +121,7 @@ def _is_cloud_collection(self, concept_id: List[str]) -> bool: def _own_s3_credentials(self, links: List[Dict[str, Any]]) -> Union[str, None]: for link in links: - if link["URL"].contains("/s3credentials"): + if "/s3credentials" in link["URL"]: return link["URL"] return None From 19170c2f5d8633fcfe3b5e4aac3570cca9a0234a Mon Sep 17 00:00:00 2001 From: betolink Date: Mon, 28 Aug 2023 16:58:52 -0500 Subject: [PATCH 3/8] refactor API call to get s3fs sessions using search results --- earthaccess/api.py | 22 +++++++++++++++++++--- earthaccess/results.py | 6 ++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/earthaccess/api.py b/earthaccess/api.py index ccb62200..c19a1bcc 100644 --- a/earthaccess/api.py +++ b/earthaccess/api.py @@ -7,7 +7,8 @@ import earthaccess from .auth import Auth -from .search import CollectionQuery, DataCollections, DataGranules, GranuleQuery +from .search import (CollectionQuery, DataCollections, DataGranules, + GranuleQuery) from .store import Store from .utils import _validation as validate @@ -54,7 +55,10 @@ def search_datasets( "Warning: a valid set of parameters is needed to search for datasets on CMR" ) return [] - query = DataCollections().parameters(**kwargs) + if earthaccess.__auth__.authenticated: + query = DataCollections(auth=earthaccess.__auth__).parameters(**kwargs) + else: + query = DataCollections().parameters(**kwargs) datasets_found = query.hits() print(f"Datasets found: {datasets_found}") if count > 0: @@ -284,13 +288,25 @@ class requests.Session: an authenticated requests Session instance. def get_s3fs_session( - daac: Optional[str] = None, provider: Optional[str] = None + daac: Optional[str] = None, + provider: Optional[str] = None, + results: Optional[earthaccess.results.DataGranule] = None, ) -> s3fs.S3FileSystem: """Returns a fsspec s3fs file session for direct access when we are in us-west-2 + Parameters: + daac (String): Any DAAC short name e.g. NSIDC, GES_DISC + provider (String): Each DAAC can have a cloud provider, if the DAAC is specified, there is no need to use provider + results (list[class earthaccess.results.DataGranule]): A list of results from search_data(), earthaccess will use the metadata form CMR to obtain the S3 Endpoint + Returns: class s3fs.S3FileSystem: an authenticated s3fs session valid for 1 hour """ + if results is not None: + endpoint = results[0].get_s3_credentials_endpoint() + if endpoint is not None: + session = earthaccess.__store__.get_s3fs_session(endpoint=endpoint) + return session session = earthaccess.__store__.get_s3fs_session(daac=daac, provider=provider) return session diff --git a/earthaccess/results.py b/earthaccess/results.py index 92d0b1a3..b959eca7 100644 --- a/earthaccess/results.py +++ b/earthaccess/results.py @@ -241,6 +241,12 @@ def _repr_html_(self) -> str: granule_html_repr = _repr_granule_html(self) return granule_html_repr + def get_s3_credentials_endpoint(self) -> Union[str, None]: + for link in self["umm"]["RelatedUrls"]: + if "/s3credentials" in link["URL"]: + return link["URL"] + return None + def size(self) -> float: """ Returns: From 0c69b099cf80aecb879576f3ffcc380875bba5f2 Mon Sep 17 00:00:00 2001 From: betolink Date: Mon, 28 Aug 2023 17:01:49 -0500 Subject: [PATCH 4/8] search_data uses authenticated searches too --- earthaccess/api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/earthaccess/api.py b/earthaccess/api.py index c19a1bcc..d9f23363 100644 --- a/earthaccess/api.py +++ b/earthaccess/api.py @@ -104,7 +104,10 @@ def search_data( ) ``` """ - query = DataGranules().parameters(**kwargs) + if earthaccess.__auth__.authenticated: + query = DataGranules(earthaccess.__auth__).parameters(**kwargs) + else: + query = DataGranules().parameters(**kwargs) granules_found = query.hits() print(f"Granules found: {granules_found}") if count > 0: From fa1698e8d0aa399be952938ee3ef9b5dc81ba0d6 Mon Sep 17 00:00:00 2001 From: betolink Date: Mon, 28 Aug 2023 17:08:17 -0500 Subject: [PATCH 5/8] testing bumping version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5b171daa..9202c0bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "earthaccess" -version = "0.5.3" +version = "0.5.4" homepage = "https://github.com/nsidc/earthaccess" description = "Client library for NASA Earthdata APIs" authors = ["earthaccess contributors"] From 0054226f70aa28758a4f69a1b7286d5481e7638f Mon Sep 17 00:00:00 2001 From: betolink Date: Tue, 29 Aug 2023 08:51:58 -0500 Subject: [PATCH 6/8] fix linting --- earthaccess/api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/earthaccess/api.py b/earthaccess/api.py index d9f23363..5358427d 100644 --- a/earthaccess/api.py +++ b/earthaccess/api.py @@ -7,8 +7,7 @@ import earthaccess from .auth import Auth -from .search import (CollectionQuery, DataCollections, DataGranules, - GranuleQuery) +from .search import CollectionQuery, DataCollections, DataGranules, GranuleQuery from .store import Store from .utils import _validation as validate From ca6590f6b6eed3da3222bea8465c884f624537aa Mon Sep 17 00:00:00 2001 From: betolink Date: Wed, 13 Sep 2023 10:13:31 -0500 Subject: [PATCH 7/8] earthaccess.get_s3_credentials can use the results to get the S3 credentials --- earthaccess/api.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/earthaccess/api.py b/earthaccess/api.py index 5358427d..799b9daa 100644 --- a/earthaccess/api.py +++ b/earthaccess/api.py @@ -197,13 +197,19 @@ def open( def get_s3_credentials( - daac: Optional[str] = None, provider: Optional[str] = None + daac: Optional[str] = None, + provider: Optional[str] = None, + results: Optional[list[earthaccess.results.DataGranule]] = None, ) -> Dict[str, Any]: - """Returns temporary (1 hour) credentials for direct access to NASA S3 buckets + """Returns temporary (1 hour) credentials for direct access to NASA S3 buckets, we can + use the daac name, the provider or a list of results from earthaccess.search_data() + if we use results earthaccess will use the metadata on the response to get the credentials, + this is useful for missions that do not use the same endpoint as their DAACs e.g. SWOT Parameters: - daac: a DAAC short_name like NSIDC or PODAAC etc - provider: if we know the provider for the DAAC e.g. POCLOUD, LPCLOUD etc. + daac (String): a DAAC short_name like NSIDC or PODAAC etc + provider (String: if we know the provider for the DAAC e.g. POCLOUD, LPCLOUD etc. + results (list[earthaccess.results.DataGranule]): List of results from search_data() Returns: a dictionary with S3 credentials for the DAAC or provider """ @@ -211,6 +217,9 @@ def get_s3_credentials( daac = daac.upper() if provider is not None: provider = provider.upper() + if results is not None: + endpoint = results[0].get_s3_credentials_endpoint() + return earthaccess.__auth__.get_s3_credentials(endpoint=endpoint) return earthaccess.__auth__.get_s3_credentials(daac=daac, provider=provider) From c4321c0f89265abed27bf73843ee4eebb997ca55 Mon Sep 17 00:00:00 2001 From: betolink Date: Wed, 13 Sep 2023 10:19:57 -0500 Subject: [PATCH 8/8] fix type --- earthaccess/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earthaccess/api.py b/earthaccess/api.py index 799b9daa..31326753 100644 --- a/earthaccess/api.py +++ b/earthaccess/api.py @@ -199,7 +199,7 @@ def open( def get_s3_credentials( daac: Optional[str] = None, provider: Optional[str] = None, - results: Optional[list[earthaccess.results.DataGranule]] = None, + results: Optional[List[earthaccess.results.DataGranule]] = None, ) -> Dict[str, Any]: """Returns temporary (1 hour) credentials for direct access to NASA S3 buckets, we can use the daac name, the provider or a list of results from earthaccess.search_data()