From df69219d14a2fcc653e1d1270e88671ef0dfd32e Mon Sep 17 00:00:00 2001 From: Gabriel Date: Fri, 1 Sep 2023 14:21:15 -0400 Subject: [PATCH] [BUGFIX] Raise warning if data_connector cannot be built on config load (#8658) --- .../datasource/fluent/interfaces.py | 36 ++++++++++++++++--- tests/datasource/fluent/conftest.py | 16 ++++++--- tests/datasource/fluent/test_contexts.py | 31 ++++++++++++++++ 3 files changed, 75 insertions(+), 8 deletions(-) diff --git a/great_expectations/datasource/fluent/interfaces.py b/great_expectations/datasource/fluent/interfaces.py index a129d3ad4378..8bb225ff2c81 100644 --- a/great_expectations/datasource/fluent/interfaces.py +++ b/great_expectations/datasource/fluent/interfaces.py @@ -568,12 +568,40 @@ def _save_context_project_config(self) -> Union[Datasource, None]: return None def _rebuild_asset_data_connectors(self) -> None: - """If Datasource required a data_connector we need to build the data_connector for each asset""" + """ + If Datasource required a data_connector we need to build the data_connector for each asset. + + A warning is raised if a data_connector cannot be built for an asset. + Not all users will have access to the needed dependencies (packages or credentials) for every asset. + Missing dependencies will stop them from using the asset but should not stop them from loading it from config. + """ + asset_build_failure_direct_cause: dict[str, Exception | BaseException] = {} + if self.data_connector_type: for data_asset in self.assets: - # check if data_connector exist before rebuilding? - connect_options = getattr(data_asset, "connect_options", {}) - self._build_data_connector(data_asset, **connect_options) + try: + # check if data_connector exist before rebuilding? + connect_options = getattr(data_asset, "connect_options", {}) + self._build_data_connector(data_asset, **connect_options) + except Exception as dc_build_err: + logger.info( + f"Unable to build data_connector for {self.type} {data_asset.type} {data_asset.name}", + exc_info=True, + ) + # reveal direct cause instead of generic, unhelpful MyDatasourceError + asset_build_failure_direct_cause[data_asset.name] = ( + dc_build_err.__cause__ or dc_build_err + ) + if asset_build_failure_direct_cause: + # TODO: allow users to opt out of these warnings + names_and_error: List[str] = [ + f"{name}:{type(exc).__name__}" + for (name, exc) in asset_build_failure_direct_cause.items() + ] + warnings.warn( + f"data_connector build failure for {self.name} assets - {', '.join(names_and_error)}", + category=RuntimeWarning, + ) @staticmethod def parse_order_by_sorters( diff --git a/tests/datasource/fluent/conftest.py b/tests/datasource/fluent/conftest.py index 6e580c43293d..49d0a1a2b42b 100644 --- a/tests/datasource/fluent/conftest.py +++ b/tests/datasource/fluent/conftest.py @@ -3,6 +3,7 @@ import functools import logging import pathlib +import warnings from pprint import pformat as pf from typing import ( TYPE_CHECKING, @@ -330,6 +331,13 @@ def cloud_storage_get_client_doubles( ) +@pytest.fixture +def filter_data_connector_build_warning(): + with warnings.catch_warnings() as w: + warnings.simplefilter("ignore", RuntimeWarning) + yield w + + @pytest.fixture def fluent_only_config( fluent_gx_config_yml_str: str, seed_ds_env_vars: tuple @@ -365,7 +373,7 @@ def fluent_yaml_config_file( @pytest.fixture @functools.lru_cache(maxsize=1) def seeded_file_context( - cloud_storage_get_client_doubles, + filter_data_connector_build_warning, fluent_yaml_config_file: pathlib.Path, seed_ds_env_vars: tuple, ) -> FileDataContext: @@ -378,7 +386,7 @@ def seeded_file_context( @pytest.fixture def seed_cloud( - cloud_storage_get_client_doubles, + filter_data_connector_build_warning, cloud_api_fake: responses.RequestsMock, fluent_only_config: GxConfig, ): @@ -410,8 +418,8 @@ def seeded_cloud_context( @pytest.fixture( params=[ - pytest.param("seeded_file_context", marks=pytest.mark.filesystem), - pytest.param("seeded_cloud_context", marks=pytest.mark.cloud), + pytest.param("seeded_file_context", marks=[pytest.mark.filesystem]), + pytest.param("seeded_cloud_context", marks=[pytest.mark.cloud]), ] ) def seeded_contexts( diff --git a/tests/datasource/fluent/test_contexts.py b/tests/datasource/fluent/test_contexts.py index 38ae9e22e38b..32e689f66add 100644 --- a/tests/datasource/fluent/test_contexts.py +++ b/tests/datasource/fluent/test_contexts.py @@ -4,6 +4,7 @@ import pathlib import re import urllib.parse +from collections import defaultdict from pprint import pformat as pf from typing import TYPE_CHECKING @@ -367,5 +368,35 @@ def test_payload_sent_to_cloud( ) +@pytest.mark.filesystem +def test_data_connectors_are_built_on_config_load( + cloud_storage_get_client_doubles, + seeded_file_context: FileDataContext, +): + """ + Ensure that all Datasources that require data_connectors have their data_connectors + created when loaded from config. + """ + context = seeded_file_context + dc_datasources: dict[str, list[str]] = defaultdict(list) + + assert context.fluent_datasources + for datasource in context.fluent_datasources.values(): + if datasource.data_connector_type: + print(f"class: {datasource.__class__.__name__}") + print(f"type: {datasource.type}") + print(f"data_connector: {datasource.data_connector_type.__name__}") + print(f"name: {datasource.name}", end="\n\n") + + dc_datasources[datasource.type].append(datasource.name) + + for asset in datasource.assets: + assert isinstance(asset._data_connector, datasource.data_connector_type) + print() + + print(f"Datasources with DataConnectors\n{pf(dict(dc_datasources))}") + assert dc_datasources + + if __name__ == "__main__": pytest.main([__file__, "-vv"])