Merge branch 'feature/databricks_external_dataset' of https://github.com/MinuraPunchihewa/kedro-plugins into feature/databricks_external_dataset
MinuraPunchihewa committed Oct 4, 2024
2 parents e4d8ea1 + 6ec48b0 commit 4941ab5
Showing 43 changed files with 968 additions and 211 deletions.
28 changes: 10 additions & 18 deletions .gitpod.yml
@@ -1,6 +1,4 @@
# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart
image: gitpod/workspace-python-3.10:2023-04-20-16-32-37

image: gitpod/workspace-python-3.11

tasks:
# We want packages installed during the pre-build init steps to go to /workspace
@@ -12,22 +10,16 @@ tasks:
echo PIP_USER=no >> ~/.bashrc && export PIP_USER=no
init: |
make sign-off
pip install uv
uv venv
echo source .venv/bin/activate >> ~/.bashrc
source ~/.bashrc
make install-test-requirements plugin=kedro-datasets
command: |
pre-commit install --install-hooks
clear
github:
prebuilds:
# enable for the master/default branch (defaults to true)
master: true
# enable for all branches in this repo (defaults to false)
branches: true
# enable for pull requests coming from this repo (defaults to true)
pullRequests: true
# enable for pull requests coming from forks (defaults to false)
pullRequestsFromForks: true
# add a "Review in Gitpod" button as a comment to pull requests (defaults to true)
addComment: false
# add a "Review in Gitpod" button to pull requests (defaults to false)
addBadge: true
- name: system
init: |
sudo apt-get update && sudo apt-get install -y --no-install-recommends libgl1 make
sudo apt-get install -y --no-install-recommends libatk-bridge2.0-0 libcups2 ca-certificates fonts-liberation libasound2 libatk-bridge2.0-0 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgbm1 libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libnss3 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 lsb-release wget xdg-utils
68 changes: 26 additions & 42 deletions Makefile
@@ -5,13 +5,6 @@ package:
rm -Rf dist;\
python -m build

pypi:
python -m pip install twine -U
python -m twine upload $(plugin)/dist/*

install: package
cd $(plugin) && pip install -U dist/*.whl

install-pip-setuptools:
python -m pip install -U pip setuptools wheel

@@ -25,46 +18,14 @@ mypy:
test:
cd $(plugin) && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile

# Run test_tensorflow_model_dataset separately, because these tests are flaky when run as part of the full test-suite
dataset-tests: dataset-doctests
cd kedro-datasets && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile --ignore tests/tensorflow
cd kedro-datasets && pytest tests/tensorflow/test_tensorflow_model_dataset.py --no-cov

extra_pytest_args-no-spark=--ignore kedro_datasets/databricks --ignore kedro_datasets/spark
extra_pytest_args=
dataset-doctest%:
if [ "${*}" != 's-no-spark' ] && [ "${*}" != 's' ]; then \
echo "make: *** No rule to make target \`${@}\`. Stop."; \
exit 2; \
fi; \
\
# The ignored datasets below require complicated setup with cloud/database clients which is overkill for the doctest examples.
cd kedro-datasets && pytest kedro_datasets --doctest-modules --doctest-continue-on-failure --no-cov \
--ignore kedro_datasets/pandas/gbq_dataset.py \
--ignore kedro_datasets/partitions/partitioned_dataset.py \
--ignore kedro_datasets/redis/redis_dataset.py \
--ignore kedro_datasets/snowflake/snowpark_dataset.py \
--ignore kedro_datasets/spark/spark_hive_dataset.py \
--ignore kedro_datasets/spark/spark_jdbc_dataset.py \
$(extra_pytest_arg${*})

test-sequential:
cd $(plugin) && pytest tests --cov-config pyproject.toml

e2e-tests:
cd $(plugin) && behave

secret-scan:
trufflehog --max_depth 1 --exclude_paths trufflehog-ignore.txt .

clean:
cd $(plugin);\
rm -rf build dist pip-wheel-metadata .pytest_cache;\
find . -regex ".*/__pycache__" -exec rm -rf {} +;\
find . -regex ".*\.egg-info" -exec rm -rf {} +;\

install-test-requirements:
cd $(plugin) && pip install ".[test]"
cd $(plugin) && uv pip install ".[test]"

install-pre-commit:
pre-commit install --install-hooks
@@ -79,12 +40,12 @@ sign-off:
echo '--in-place "$$1"' >> .git/hooks/commit-msg
chmod +x .git/hooks/commit-msg

## kedro-datasets specific

# kedro-datasets related only
test-no-spark: dataset-doctests-no-spark
cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks --numprocesses 4 --dist loadfile

test-no-spark-sequential: dataset-doctests-no-spark
cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks

# kedro-datasets/snowflake tests skipped from default scope
test-snowflake-only:
@@ -93,3 +54,26 @@ test-snowflake-only:

check-datasets-docs:
cd kedro-datasets && python -m sphinx -WETan -j auto -D language=en -b linkcheck -d _build/doctrees docs/source _build/linkcheck

# Run test_tensorflow_model_dataset separately, because these tests are flaky when run as part of the full test-suite
dataset-tests: dataset-doctests
cd kedro-datasets && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile --ignore tests/tensorflow
cd kedro-datasets && pytest tests/tensorflow/test_tensorflow_model_dataset.py --no-cov

extra_pytest_args-no-spark=--ignore kedro_datasets/databricks --ignore kedro_datasets/spark
extra_pytest_args=
dataset-doctest%:
if [ "${*}" != 's-no-spark' ] && [ "${*}" != 's' ]; then \
echo "make: *** No rule to make target \`${@}\`. Stop."; \
exit 2; \
fi; \
\
# The ignored datasets below require complicated setup with cloud/database clients which is overkill for the doctest examples.
cd kedro-datasets && pytest kedro_datasets --doctest-modules --doctest-continue-on-failure --no-cov \
--ignore kedro_datasets/pandas/gbq_dataset.py \
--ignore kedro_datasets/partitions/partitioned_dataset.py \
--ignore kedro_datasets/redis/redis_dataset.py \
--ignore kedro_datasets/snowflake/snowpark_dataset.py \
--ignore kedro_datasets/spark/spark_hive_dataset.py \
--ignore kedro_datasets/spark/spark_jdbc_dataset.py \
$(extra_pytest_arg${*})
13 changes: 10 additions & 3 deletions kedro-airflow/kedro_airflow/grouping.py
@@ -4,14 +4,21 @@
from kedro.pipeline.node import Node
from kedro.pipeline.pipeline import Pipeline

try:
from kedro.io import CatalogProtocol
except ImportError: # pragma: no cover
pass


def _is_memory_dataset(catalog, dataset_name: str) -> bool:
if dataset_name not in catalog:
return True
return False


def get_memory_datasets(catalog: DataCatalog, pipeline: Pipeline) -> set[str]:
def get_memory_datasets(
catalog: CatalogProtocol | DataCatalog, pipeline: Pipeline
) -> set[str]:
"""Gather all datasets in the pipeline that are of type MemoryDataset, excluding 'parameters'."""
return {
dataset_name
@@ -21,7 +28,7 @@ def get_memory_datasets(catalog: DataCatalog, pipeline: Pipeline) -> set[str]:


def create_adjacency_list(
catalog: DataCatalog, pipeline: Pipeline
catalog: CatalogProtocol | DataCatalog, pipeline: Pipeline
) -> tuple[dict[str, set], dict[str, set]]:
"""
Builds adjacency list (adj_list) to search connected components - undirected graph,
@@ -48,7 +55,7 @@ create_adjacency_list(


def group_memory_nodes(
catalog: DataCatalog, pipeline: Pipeline
catalog: CatalogProtocol | DataCatalog, pipeline: Pipeline
) -> tuple[dict[str, list[Node]], dict[str, list[str]]]:
"""
Nodes that are connected through MemoryDatasets cannot be distributed across
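The widened signatures accept either Kedro's newer `CatalogProtocol` or the classic `DataCatalog`, with the guarded import keeping older Kedro versions working. As a minimal sketch of the membership semantics behind `_is_memory_dataset` (the dataset names here are made up):

```python
from kedro.io import DataCatalog, MemoryDataset


def is_memory_dataset(catalog, dataset_name: str) -> bool:
    # Same rule as the helper above: a dataset that is not registered in the
    # catalog only ever lives in memory while the pipeline runs.
    return dataset_name not in catalog


# Hypothetical catalog: "companies" is registered, "preprocessed_companies" is not.
catalog = DataCatalog({"companies": MemoryDataset(data={"id": [1, 2]})})

print(is_memory_dataset(catalog, "companies"))               # False
print(is_memory_dataset(catalog, "preprocessed_companies"))  # True
```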
9 changes: 9 additions & 0 deletions kedro-datasets/RELEASE.md
@@ -5,6 +5,8 @@
| Type | Description | Location |
|-------------------------------------|-----------------------------------------------------------|-----------------------------------------|
| `pytorch.PyTorchDataset` | A dataset for securely saving and loading PyTorch models | `kedro_datasets_experimental.pytorch` |
| `prophet.ProphetModelDataset` | A dataset for Meta's Prophet model for time series forecasting | `kedro_datasets_experimental.prophet` |


* Added the following new core datasets:

@@ -14,12 +16,19 @@

## Bug fixes and other changes
* Refactored all datasets to set `fs_args` defaults in the same way as `load_args` and `save_args` and not have hardcoded values in the save methods.
* Fixed bug related to loading/saving models from/to remote storage using `TensorFlowModelDataset`.
* Fixed the deprecated load and save approaches of `GBQTableDataset` and `GBQQueryDataset` by invoking load and save directly through the `pandas-gbq` library
* Fixed incorrect `pandas` optional dependency

## Breaking Changes
## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
* [Brandon Meek](https://github.com/bpmeek)
* [yury-fedotov](https://github.com/yury-fedotov)
* [gitgud5000](https://github.com/gitgud5000)
* [janickspirig](https://github.com/janickspirig)
* [Galen Seilis](https://github.com/galenseilis)
* [Mariusz Wojakowski](https://github.com/mariusz89016)


# Release 4.1.0
@@ -16,5 +16,6 @@ kedro_datasets_experimental
langchain.ChatOpenAIDataset
langchain.OpenAIEmbeddingsDataset
netcdf.NetCDFDataset
prophet.ProphetModelDataset
pytorch.PyTorchDataset
rioxarray.GeoTIFFDataset
2 changes: 2 additions & 0 deletions kedro-datasets/docs/source/conf.py
@@ -140,6 +140,8 @@
"xarray.core.dataset.Dataset",
"xarray.core.dataarray.DataArray",
"torch.nn.modules.module.Module",
"prophet.forecaster.Prophet",
"Prophet",
),
"py:data": (
"typing.Any",
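These entries add the Prophet types to the cross-reference targets the docs build deliberately skips. The equivalent in a plain Sphinx `conf.py` would be `nitpick_ignore` entries; a generic sketch, not the kedro-datasets configuration itself:

```python
# Generic Sphinx conf.py fragment: silence "reference target not found"
# warnings for classes that are documented outside this project.
nitpick_ignore = [
    ("py:class", "prophet.forecaster.Prophet"),
    ("py:class", "Prophet"),
]
```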
20 changes: 11 additions & 9 deletions kedro-datasets/kedro_datasets/pandas/gbq_dataset.py
@@ -10,6 +10,7 @@

import fsspec
import pandas as pd
import pandas_gbq as pd_gbq
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
from google.oauth2.credentials import Credentials
@@ -138,16 +139,17 @@ def _describe(self) -> dict[str, Any]:

def _load(self) -> pd.DataFrame:
sql = f"select * from {self._dataset}.{self._table_name}" # nosec
self._load_args.setdefault("query", sql)
return pd.read_gbq(
self._load_args.setdefault("query_or_table", sql)
return pd_gbq.read_gbq(
project_id=self._project_id,
credentials=self._credentials,
**self._load_args,
)

def _save(self, data: pd.DataFrame) -> None:
data.to_gbq(
f"{self._dataset}.{self._table_name}",
pd_gbq.to_gbq(
dataframe=data,
destination_table=f"{self._dataset}.{self._table_name}",
project_id=self._project_id,
credentials=self._credentials,
**self._save_args,
Expand Down Expand Up @@ -176,7 +178,7 @@ def _validate_location(self):

class GBQQueryDataset(AbstractDataset[None, pd.DataFrame]):
"""``GBQQueryDataset`` loads data from a provided SQL query from Google
BigQuery. It uses ``pandas.read_gbq`` which itself uses ``pandas-gbq``
BigQuery. It uses ``pandas_gbq.read_gbq`` which itself uses ``pandas-gbq``
internally to read from BigQuery table. Therefore it supports all allowed
pandas options on ``read_gbq``.
@@ -274,7 +276,7 @@ def __init__( # noqa: PLR0913

# load sql query from arg or from file
if sql:
self._load_args["query"] = sql
self._load_args["query_or_table"] = sql
self._filepath = None
else:
# filesystem for loading sql file
@@ -291,7 +293,7 @@ def _describe(self) -> dict[str, Any]:
def _describe(self) -> dict[str, Any]:
load_args = copy.deepcopy(self._load_args)
desc = {}
desc["sql"] = str(load_args.pop("query", None))
desc["sql"] = str(load_args.pop("query_or_table", None))
desc["filepath"] = str(self._filepath)
desc["load_args"] = str(load_args)

@@ -303,9 +305,9 @@ def _load(self) -> pd.DataFrame:
if self._filepath:
load_path = get_filepath_str(PurePosixPath(self._filepath), self._protocol)
with self._fs.open(load_path, mode="r") as fs_file:
load_args["query"] = fs_file.read()
load_args["query_or_table"] = fs_file.read()

return pd.read_gbq(
return pd_gbq.read_gbq(
project_id=self._project_id,
credentials=self._credentials,
**load_args,
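The underlying change is moving off the deprecated `pd.read_gbq`/`DataFrame.to_gbq` wrappers and onto `pandas-gbq` called directly, which also renames the `query` argument to `query_or_table`. A rough standalone sketch of the same calls (project and table names are placeholders):

```python
import pandas_gbq as pd_gbq

project_id = "my-gcp-project"   # placeholder GCP project
table = "my_dataset.my_table"   # placeholder dataset.table

# Deprecated pandas wrappers this commit moves away from:
#   df = pd.read_gbq("select * from my_dataset.my_table", project_id=project_id)
#   df.to_gbq(table, project_id=project_id)

# Direct pandas-gbq calls, mirroring the refactored datasets:
df = pd_gbq.read_gbq(
    query_or_table=f"select * from {table}",
    project_id=project_id,
)
pd_gbq.to_gbq(
    dataframe=df,
    destination_table=table,
    project_id=project_id,
    if_exists="replace",  # overwrite the destination table if it exists
)
```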
@@ -144,7 +144,7 @@ def _load(self) -> tf.keras.Model:
# We assume .keras
path = str(PurePath(tempdir) / TEMPORARY_KERAS_FILE) # noqa: PLW2901

self._fs.copy(load_path, path)
self._fs.get(load_path, path)

# Pass the local temporary directory/file path to keras.load_model
device_name = self._load_args.pop("tf_device", None)
@@ -169,7 +169,7 @@ def _save(self, data: tf.keras.Model) -> None:

# Use fsspec to take from local tempfile directory/file and
# put in ArbitraryFileSystem
self._fs.copy(path, save_path)
self._fs.put(path, save_path)

def _exists(self) -> bool:
try:
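The fix swaps `fs.copy` for `fs.get`/`fs.put` because, in fsspec, `copy` duplicates a file between two paths on the same filesystem, whereas `get` downloads from that filesystem to local disk and `put` uploads from local disk to it, which is what the temporary-file handling above needs. A hedged illustration (the bucket and paths are invented, and an `s3fs`-style backend is assumed):

```python
import fsspec

# Any fsspec-backed remote filesystem behaves the same; "s3" needs s3fs installed.
fs = fsspec.filesystem("s3")

remote_path = "my-bucket/models/model.keras"   # hypothetical remote artefact
local_path = "/tmp/model.keras"                # local temporary file

fs.get(remote_path, local_path)   # load path: remote object -> local temp file
fs.put(local_path, remote_path)   # save path: local temp file -> remote object

# fs.copy(remote_path, "my-bucket/models/backup.keras") would stay entirely
# within S3, which is why it could not bridge remote storage and the tempdir.
```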
11 changes: 11 additions & 0 deletions kedro-datasets/kedro_datasets_experimental/prophet/__init__.py
@@ -0,0 +1,11 @@
"""``JSONDataset`` implementation to load/save data from/to a Prophet model file."""

from typing import Any

import lazy_loader as lazy

ProphetDataset: Any

__getattr__, __dir__, __all__ = lazy.attach(
__name__, submod_attrs={"prophet_dataset": ["ProphetModelDataset"]}
)
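
`lazy_loader.attach` defers importing the `prophet_dataset` submodule (and with it Prophet) until `ProphetModelDataset` is first accessed, so importing the parent package stays cheap. The same pattern on a hypothetical package:

```python
# mypkg/__init__.py - a hypothetical package using the same lazy-import pattern.
from typing import Any

import lazy_loader as lazy

HeavyDataset: Any  # annotation only; the real object is resolved lazily

__getattr__, __dir__, __all__ = lazy.attach(
    __name__, submod_attrs={"heavy_module": ["HeavyDataset"]}
)

# Usage elsewhere:
#   import mypkg                # does not import mypkg.heavy_module yet
#   mypkg.HeavyDataset          # first access triggers the submodule import
```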