diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 7faf3e035..6723cb70f 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -10,3 +10,4 @@
 - [ ] Updated the documentation to reflect the code changes
 - [ ] Added a description of this change in the relevant `RELEASE.md` file
 - [ ] Added tests to cover my changes
+- [ ] Received approvals from at least half of the TSC (required for adding a new, non-experimental dataset)
diff --git a/.github/reviewers.json b/.github/reviewers.json
new file mode 100644
index 000000000..5a0b265b8
--- /dev/null
+++ b/.github/reviewers.json
@@ -0,0 +1,36 @@
+{
+  "teams": {
+    "everyone": {
+      "description": "A team that contains all TSC members.",
+      "users": [
+        "ankatiyar",
+        "astrojuanlu",
+        "datajoely",
+        "deepyaman",
+        "DimedS",
+        "Galileo-Galilei",
+        "Huongg",
+        "idanov",
+        "jitu5",
+        "lrcouto",
+        "marrrcin",
+        "merelcht",
+        "noklam",
+        "rashidakanchwala",
+        "ravi-kumar-pilla",
+        "SajidAlamQB",
+        "sbrugman",
+        "stephkaiser",
+        "tynandebold",
+        "yetudada"
+      ]
+    }
+  },
+  "reviewers": {
+    "kedro-datasets/kedro_datasets/": {
+      "description": "Require at least 1/2 TSC approval on new core dataset contributions.",
+      "teams": ["everyone"],
+      "requiredApproverCount": 10
+    }
+  }
+}
diff --git a/.github/workflows/check-tsc-vote.yml b/.github/workflows/check-tsc-vote.yml
new file mode 100644
index 000000000..2cc742023
--- /dev/null
+++ b/.github/workflows/check-tsc-vote.yml
@@ -0,0 +1,14 @@
+name: Required Reviews
+on:
+  pull_request:
+    paths:
+      - 'kedro-datasets/kedro_datasets/*'
+jobs:
+  required-reviews:
+    name: Required Reviews
+    runs-on: ubuntu-latest
+    steps:
+      - name: required-reviewers
+        uses: theoremlp/required-reviews@v2
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/merge-gatekeeper.yml b/.github/workflows/merge-gatekeeper.yml
index 77ad752de..ac5473240 100644
--- a/.github/workflows/merge-gatekeeper.yml
+++ b/.github/workflows/merge-gatekeeper.yml
@@ -24,3 +24,4 @@ jobs:
           token: ${{ secrets.GITHUB_TOKEN }}
           timeout: 2400
           interval: 30
+          ignored: 'Required Reviews'
diff --git a/kedro-datasets/CONTRIBUTING.md b/kedro-datasets/CONTRIBUTING.md
index 41454f598..8ef5ae52c 100644
--- a/kedro-datasets/CONTRIBUTING.md
+++ b/kedro-datasets/CONTRIBUTING.md
@@ -89,7 +89,7 @@ def count_truthy(elements: List[Any]) -> int:
     return sum(1 for elem in elements if elem)
 ```
 
-> *Note:* We only accept contributions under the [Apache 2.0](https://opensource.org/licenses/Apache-2.0) license, and you should have permission to share the submitted code.
+> *Note:* We only accept contributions under the [Apache 2.0](https://github.com/kedro-org/kedro-plugins/blob/main/LICENSE.md) license, and you should have permission to share the submitted code.
 
 ### Branching conventions
 
diff --git a/kedro-datasets/README.md b/kedro-datasets/README.md
index aa4531e5b..d84b2bf7f 100644
--- a/kedro-datasets/README.md
+++ b/kedro-datasets/README.md
@@ -2,7 +2,7 @@
 
 
 
-[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/kedro-org/kedro-plugins/blob/main/LICENSE.md)
 [![Python Version](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue.svg)](https://pypi.org/project/kedro-datasets/)
 [![PyPI Version](https://badge.fury.io/py/kedro-datasets.svg)](https://pypi.org/project/kedro-datasets/)
 [![Code Style: Black](https://img.shields.io/badge/code%20style-black-black.svg)](https://github.com/ambv/black)
diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md
index 20915e597..5f4e4b78f 100755
--- a/kedro-datasets/RELEASE.md
+++ b/kedro-datasets/RELEASE.md
@@ -1,5 +1,7 @@
 # Upcoming Release
 
+# Release 5.1.0
+
 ## Major features and improvements
 
 - Added the following new **experimental** datasets:
@@ -10,9 +12,13 @@
 
 - Added the following new core datasets:
 
-| Type                | Description                                                    | Location              |
-| ------------------- | -------------------------------------------------------------- | --------------------- |
-| `ibis.TableDataset` | A dataset for loading and saving files using Ibis's backends.  | `kedro_datasets.ibis` |
+| Type               | Description                                                    | Location              |
+| ------------------ | -------------------------------------------------------------- | --------------------- |
+| `ibis.FileDataset` | A dataset for loading and saving files using Ibis's backends.  | `kedro_datasets.ibis` |
+
+## Bug fixes and other changes
+
+- Changed Ibis datasets to connect to an in-memory DuckDB database if connection configuration is not provided.
 
 ## Community contributions
 
diff --git a/kedro-datasets/kedro_datasets/__init__.py b/kedro-datasets/kedro_datasets/__init__.py
index 61425b9d2..44692e803 100644
--- a/kedro-datasets/kedro_datasets/__init__.py
+++ b/kedro-datasets/kedro_datasets/__init__.py
@@ -1,7 +1,7 @@
 """``kedro_datasets`` is where you can find all of Kedro's data connectors."""
 
 __all__ = ["KedroDeprecationWarning"]
-__version__ = "5.0.0"
+__version__ = "5.1.0"
 
 import sys
 import warnings
diff --git a/kedro-datasets/kedro_datasets/ibis/file_dataset.py b/kedro-datasets/kedro_datasets/ibis/file_dataset.py
index 11b58bc32..f204e297b 100644
--- a/kedro-datasets/kedro_datasets/ibis/file_dataset.py
+++ b/kedro-datasets/kedro_datasets/ibis/file_dataset.py
@@ -66,6 +66,10 @@ class FileDataset(AbstractVersionedDataset[ir.Table, ir.Table]):
 
     """
 
+    DEFAULT_CONNECTION_CONFIG: ClassVar[dict[str, Any]] = {
+        "backend": "duckdb",
+        "database": ":memory:",
+    }
     DEFAULT_LOAD_ARGS: ClassVar[dict[str, Any]] = {}
     DEFAULT_SAVE_ARGS: ClassVar[dict[str, Any]] = {}
 
@@ -107,6 +111,7 @@ def __init__(  # noqa: PLR0913
                 Defaults to writing execution results to a Parquet file.
             table_name: The name to use for the created table (on load).
             connection: Configuration for connecting to an Ibis backend.
+                If not provided, connect to DuckDB in in-memory mode.
             load_args: Additional arguments passed to the Ibis backend's
                 `read_{file_format}` method.
             save_args: Additional arguments passed to the Ibis backend's
@@ -120,7 +125,7 @@ def __init__(  # noqa: PLR0913
         """
         self._file_format = file_format
         self._table_name = table_name
-        self._connection_config = connection
+        self._connection_config = connection or self.DEFAULT_CONNECTION_CONFIG
         self.metadata = metadata
 
         super().__init__(
@@ -156,8 +161,7 @@ def hashable(value):
             import ibis
 
             config = deepcopy(self._connection_config)
-            backend_attr = config.pop("backend") if config else None
-            backend = getattr(ibis, backend_attr)
+            backend = getattr(ibis, config.pop("backend"))
             cls._connections[key] = backend.connect(**config)
 
         return cls._connections[key]
@@ -178,9 +182,7 @@ def _describe(self) -> dict[str, Any]:
             "filepath": self._filepath,
             "file_format": self._file_format,
             "table_name": self._table_name,
-            "backend": self._connection_config.get("backend")
-            if self._connection_config
-            else None,
+            "backend": self._connection_config["backend"],
             "load_args": self._load_args,
             "save_args": self._save_args,
             "version": self._version,
diff --git a/kedro-datasets/kedro_datasets/ibis/table_dataset.py b/kedro-datasets/kedro_datasets/ibis/table_dataset.py
index 7550e6266..30709d08e 100644
--- a/kedro-datasets/kedro_datasets/ibis/table_dataset.py
+++ b/kedro-datasets/kedro_datasets/ibis/table_dataset.py
@@ -60,6 +60,10 @@ class TableDataset(AbstractDataset[ir.Table, ir.Table]):
     """
 
+    DEFAULT_CONNECTION_CONFIG: ClassVar[dict[str, Any]] = {
+        "backend": "duckdb",
+        "database": ":memory:",
+    }
     DEFAULT_LOAD_ARGS: ClassVar[dict[str, Any]] = {}
     DEFAULT_SAVE_ARGS: ClassVar[dict[str, Any]] = {
         "materialized": "view",
     }
@@ -99,6 +103,7 @@ def __init__(  # noqa: PLR0913
         Args:
             table_name: The name of the table or view to read or create.
             connection: Configuration for connecting to an Ibis backend.
+                If not provided, connect to DuckDB in in-memory mode.
             load_args: Additional arguments passed to the Ibis backend's
                 `read_{file_format}` method.
             save_args: Additional arguments passed to the Ibis backend's
@@ -126,7 +131,7 @@ def __init__(  # noqa: PLR0913
         self._filepath = filepath
         self._file_format = file_format
         self._table_name = table_name
-        self._connection_config = connection
+        self._connection_config = connection or self.DEFAULT_CONNECTION_CONFIG
         self.metadata = metadata
 
         # Set load and save arguments, overwriting defaults if provided.
@@ -158,8 +163,7 @@ def hashable(value):
             import ibis
 
             config = deepcopy(self._connection_config)
-            backend_attr = config.pop("backend") if config else None
-            backend = getattr(ibis, backend_attr)
+            backend = getattr(ibis, config.pop("backend"))
             cls._connections[key] = backend.connect(**config)
 
         return cls._connections[key]
@@ -186,9 +190,7 @@ def _describe(self) -> dict[str, Any]:
             "filepath": self._filepath,
             "file_format": self._file_format,
             "table_name": self._table_name,
-            "backend": self._connection_config.get("backend")
-            if self._connection_config
-            else None,
+            "backend": self._connection_config["backend"],
             "load_args": self._load_args,
             "save_args": self._save_args,
             "materialized": self._materialized,
diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py
index b1effc278..294ea35cb 100644
--- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py
+++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py
@@ -136,7 +136,7 @@ def save(self, data: DataFrame) -> None:
         (
             output_constructor.option("checkpointLocation", checkpoint)
             .option("path", save_path)
-            .outputMode(output_mode)
+            .outputMode(str(output_mode))
             .options(**self._save_args or {})
             .start()
         )
diff --git a/kedro-datasets/tests/ibis/test_file_dataset.py b/kedro-datasets/tests/ibis/test_file_dataset.py
index e598bffff..c21a06466 100644
--- a/kedro-datasets/tests/ibis/test_file_dataset.py
+++ b/kedro-datasets/tests/ibis/test_file_dataset.py
@@ -10,6 +10,8 @@
 
 from kedro_datasets.ibis import FileDataset
 
+_SENTINEL = object()
+
 
 @pytest.fixture
 def filepath_csv(tmp_path):
@@ -21,9 +23,13 @@ def database(tmp_path):
     return (tmp_path / "file.db").as_posix()
 
 
-@pytest.fixture(params=[None])
+@pytest.fixture(params=[_SENTINEL])
 def connection_config(request, database):
-    return request.param or {"backend": "duckdb", "database": database}
+    return (
+        {"backend": "duckdb", "database": database}
+        if request.param is _SENTINEL  # `None` is a valid value to test
+        else request.param
+    )
 
 
 @pytest.fixture
@@ -103,12 +109,23 @@ def test_save_extra_params(
                     ("query", (("driver", "ODBC Driver 17 for SQL Server"),)),
                 ),
             ),
+            # https://github.com/kedro-org/kedro-plugins/pull/893#discussion_r1804632435
+            (
+                None,
+                (
+                    ("backend", "duckdb"),
+                    ("database", ":memory:"),
+                ),
+            ),
         ],
         indirect=["connection_config"],
     )
     def test_connection_config(self, mocker, file_dataset, connection_config, key):
         """Test hashing of more complicated connection configuration."""
-        mocker.patch(f"ibis.{connection_config['backend']}")
+        backend = (
+            connection_config["backend"] if connection_config is not None else "duckdb"
+        )
+        mocker.patch(f"ibis.{backend}")
         file_dataset.load()
         assert key in file_dataset._connections
 
diff --git a/kedro-datasets/tests/ibis/test_table_dataset.py b/kedro-datasets/tests/ibis/test_table_dataset.py
index 644bbc127..a778b08e0 100644
--- a/kedro-datasets/tests/ibis/test_table_dataset.py
+++ b/kedro-datasets/tests/ibis/test_table_dataset.py
@@ -6,6 +6,8 @@
 
 from kedro_datasets.ibis import TableDataset
 
+_SENTINEL = object()
+
 
 @pytest.fixture(scope="session")
 def filepath_csv(tmp_path_factory):
@@ -19,9 +21,13 @@ def database(tmp_path):
     return (tmp_path / "file.db").as_posix()
 
 
-@pytest.fixture(params=[None])
+@pytest.fixture(params=[_SENTINEL])
 def connection_config(request, database):
-    return request.param or {"backend": "duckdb", "database": database}
+    return (
+        {"backend": "duckdb", "database": database}
+        if request.param is _SENTINEL  # `None` is a valid value to test
+        else request.param
+    )
 
 
 @pytest.fixture
@@ -122,11 +128,22 @@ def test_save_no_table_name(self, table_dataset_from_csv, dummy_table):
                     ("query", (("driver", "ODBC Driver 17 for SQL Server"),)),
                 ),
             ),
+            # https://github.com/kedro-org/kedro-plugins/pull/893#discussion_r1804632435
+            (
+                None,
+                (
+                    ("backend", "duckdb"),
+                    ("database", ":memory:"),
+                ),
+            ),
         ],
         indirect=["connection_config"],
     )
     def test_connection_config(self, mocker, table_dataset, connection_config, key):
         """Test hashing of more complicated connection configuration."""
-        mocker.patch(f"ibis.{connection_config['backend']}")
+        backend = (
+            connection_config["backend"] if connection_config is not None else "duckdb"
+        )
+        mocker.patch(f"ibis.{backend}")
         table_dataset.load()
         assert key in table_dataset._connections
diff --git a/kedro-docker/features/docker.feature b/kedro-docker/features/docker.feature
index f3682fd30..2d54de7f3 100644
--- a/kedro-docker/features/docker.feature
+++ b/kedro-docker/features/docker.feature
@@ -100,7 +100,7 @@ Feature: Docker commands in new projects
 
   Scenario: Execute docker run target without building image
     When I execute the kedro command "docker run"
-    Then I should get a successful exit code
+    Then I should get an error exit code
     And Standard output should contain a message including "Error: Unable to find image `project-dummy` locally."
 
   Scenario: Execute docker dive target
@@ -118,5 +118,5 @@ Feature: Docker commands in new projects
 
   Scenario: Execute docker dive without building image
     When I execute the kedro command "docker dive"
-    Then I should get a successful exit code
+    Then I should get an error exit code
     And Standard output should contain a message including "Error: Unable to find image `project-dummy` locally."