Merge branch 'feature/databricks_external_dataset' of https://github.com/MinuraPunchihewa/kedro-plugins into feature/databricks_external_dataset
MinuraPunchihewa committed Oct 4, 2024
2 parents e4d8ea1 + 6ec48b0 commit 4941ab5
Showing 43 changed files with 968 additions and 211 deletions.
28 changes: 10 additions & 18 deletions .gitpod.yml
@@ -1,6 +1,4 @@
# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart
image: gitpod/workspace-python-3.10:2023-04-20-16-32-37

image: gitpod/workspace-python-3.11

tasks:
# We want packages installed during the pre-build init steps to go to /workspace
@@ -12,22 +10,16 @@ tasks:
echo PIP_USER=no >> ~/.bashrc && export PIP_USER=no
init: |
make sign-off
pip install uv
uv venv
echo source .venv/bin/activate >> ~/.bashrc
source ~/.bashrc
make install-test-requirements plugin=kedro-datasets
command: |
pre-commit install --install-hooks
clear
github:
prebuilds:
# enable for the master/default branch (defaults to true)
master: true
# enable for all branches in this repo (defaults to false)
branches: true
# enable for pull requests coming from this repo (defaults to true)
pullRequests: true
# enable for pull requests coming from forks (defaults to false)
pullRequestsFromForks: true
# add a "Review in Gitpod" button as a comment to pull requests (defaults to true)
addComment: false
# add a "Review in Gitpod" button to pull requests (defaults to false)
addBadge: true
- name: system
init: |
sudo apt-get update && sudo apt-get install -y --no-install-recommends libgl1 make
sudo apt-get install -y --no-install-recommends libatk-bridge2.0-0 libcups2 ca-certificates fonts-liberation libasound2 libatk-bridge2.0-0 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgbm1 libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libnss3 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 lsb-release wget xdg-utils
68 changes: 26 additions & 42 deletions Makefile
@@ -5,13 +5,6 @@ package:
rm -Rf dist;\
python -m build

pypi:
python -m pip install twine -U
python -m twine upload $(plugin)/dist/*

install: package
cd $(plugin) && pip install -U dist/*.whl

install-pip-setuptools:
python -m pip install -U pip setuptools wheel

@@ -25,46 +18,14 @@ mypy:
test:
cd $(plugin) && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile

# Run test_tensorflow_model_dataset separately, because these tests are flaky when run as part of the full test-suite
dataset-tests: dataset-doctests
cd kedro-datasets && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile --ignore tests/tensorflow
cd kedro-datasets && pytest tests/tensorflow/test_tensorflow_model_dataset.py --no-cov

extra_pytest_args-no-spark=--ignore kedro_datasets/databricks --ignore kedro_datasets/spark
extra_pytest_args=
dataset-doctest%:
if [ "${*}" != 's-no-spark' ] && [ "${*}" != 's' ]; then \
echo "make: *** No rule to make target \`${@}\`. Stop."; \
exit 2; \
fi; \
\
# The ignored datasets below require complicated setup with cloud/database clients which is overkill for the doctest examples.
cd kedro-datasets && pytest kedro_datasets --doctest-modules --doctest-continue-on-failure --no-cov \
--ignore kedro_datasets/pandas/gbq_dataset.py \
--ignore kedro_datasets/partitions/partitioned_dataset.py \
--ignore kedro_datasets/redis/redis_dataset.py \
--ignore kedro_datasets/snowflake/snowpark_dataset.py \
--ignore kedro_datasets/spark/spark_hive_dataset.py \
--ignore kedro_datasets/spark/spark_jdbc_dataset.py \
$(extra_pytest_arg${*})

test-sequential:
cd $(plugin) && pytest tests --cov-config pyproject.toml

e2e-tests:
cd $(plugin) && behave

secret-scan:
trufflehog --max_depth 1 --exclude_paths trufflehog-ignore.txt .

clean:
cd $(plugin);\
rm -rf build dist pip-wheel-metadata .pytest_cache;\
find . -regex ".*/__pycache__" -exec rm -rf {} +;\
find . -regex ".*\.egg-info" -exec rm -rf {} +;\

install-test-requirements:
cd $(plugin) && pip install ".[test]"
cd $(plugin) && uv pip install ".[test]"

install-pre-commit:
pre-commit install --install-hooks
@@ -79,12 +40,12 @@ sign-off:
echo '--in-place "$$1"' >> .git/hooks/commit-msg
chmod +x .git/hooks/commit-msg

## kedro-datasets specific

# kedro-datasets related only
test-no-spark: dataset-doctests-no-spark
cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks --numprocesses 4 --dist loadfile

test-no-spark-sequential: dataset-doctests-no-spark
cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks

# kedro-datasets/snowflake tests skipped from default scope
test-snowflake-only:
@@ -93,3 +54,26 @@ test-snowflake-only:

check-datasets-docs:
cd kedro-datasets && python -m sphinx -WETan -j auto -D language=en -b linkcheck -d _build/doctrees docs/source _build/linkcheck

# Run test_tensorflow_model_dataset separately, because these tests are flaky when run as part of the full test-suite
dataset-tests: dataset-doctests
cd kedro-datasets && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile --ignore tests/tensorflow
cd kedro-datasets && pytest tests/tensorflow/test_tensorflow_model_dataset.py --no-cov

extra_pytest_args-no-spark=--ignore kedro_datasets/databricks --ignore kedro_datasets/spark
extra_pytest_args=
dataset-doctest%:
if [ "${*}" != 's-no-spark' ] && [ "${*}" != 's' ]; then \
echo "make: *** No rule to make target \`${@}\`. Stop."; \
exit 2; \
fi; \
\
# The ignored datasets below require complicated setup with cloud/database clients which is overkill for the doctest examples.
cd kedro-datasets && pytest kedro_datasets --doctest-modules --doctest-continue-on-failure --no-cov \
--ignore kedro_datasets/pandas/gbq_dataset.py \
--ignore kedro_datasets/partitions/partitioned_dataset.py \
--ignore kedro_datasets/redis/redis_dataset.py \
--ignore kedro_datasets/snowflake/snowpark_dataset.py \
--ignore kedro_datasets/spark/spark_hive_dataset.py \
--ignore kedro_datasets/spark/spark_jdbc_dataset.py \
$(extra_pytest_arg${*})
13 changes: 10 additions & 3 deletions kedro-airflow/kedro_airflow/grouping.py
@@ -4,14 +4,21 @@
from kedro.pipeline.node import Node
from kedro.pipeline.pipeline import Pipeline

try:
from kedro.io import CatalogProtocol
except ImportError: # pragma: no cover
pass


def _is_memory_dataset(catalog, dataset_name: str) -> bool:
if dataset_name not in catalog:
return True
return False


def get_memory_datasets(catalog: DataCatalog, pipeline: Pipeline) -> set[str]:
def get_memory_datasets(
catalog: CatalogProtocol | DataCatalog, pipeline: Pipeline
) -> set[str]:
"""Gather all datasets in the pipeline that are of type MemoryDataset, excluding 'parameters'."""
return {
dataset_name
@@ -21,7 +28,7 @@ def get_memory_datasets(catalog: DataCatalog, pipeline: Pipeline) -> set[str]:


def create_adjacency_list(
catalog: DataCatalog, pipeline: Pipeline
catalog: CatalogProtocol | DataCatalog, pipeline: Pipeline
) -> tuple[dict[str, set], dict[str, set]]:
"""
Builds adjacency list (adj_list) to search connected components - undirected graph,
@@ -48,7 +55,7 @@ create_adjacency_list(


def group_memory_nodes(
catalog: DataCatalog, pipeline: Pipeline
catalog: CatalogProtocol | DataCatalog, pipeline: Pipeline
) -> tuple[dict[str, list[Node]], dict[str, list[str]]]:
"""
Nodes that are connected through MemoryDatasets cannot be distributed across
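The widened signatures accept either Kedro's newer `CatalogProtocol` or the classic `DataCatalog`, with the guarded import keeping older Kedro versions working. As a minimal sketch of the membership semantics behind `_is_memory_dataset` (the dataset names here are made up):

```python
from kedro.io import DataCatalog, MemoryDataset


def is_memory_dataset(catalog, dataset_name: str) -> bool:
    # Same rule as the helper above: a dataset that is not registered in the
    # catalog only ever lives in memory while the pipeline runs.
    return dataset_name not in catalog


# Hypothetical catalog: "companies" is registered, "preprocessed_companies" is not.
catalog = DataCatalog({"companies": MemoryDataset(data={"id": [1, 2]})})

print(is_memory_dataset(catalog, "companies"))               # False
print(is_memory_dataset(catalog, "preprocessed_companies"))  # True
```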
9 changes: 9 additions & 0 deletions kedro-datasets/RELEASE.md
@@ -5,6 +5,8 @@
| Type | Description | Location |
|-------------------------------------|-----------------------------------------------------------|-----------------------------------------|
| `pytorch.PyTorchDataset` | A dataset for securely saving and loading PyTorch models | `kedro_datasets_experimental.pytorch` |
| `prophet.ProphetModelDataset` | A dataset for Meta's Prophet model for time series forecasting | `kedro_datasets_experimental.prophet` |


* Added the following new core datasets:

@@ -14,12 +16,19 @@

## Bug fixes and other changes
* Refactored all datasets to set `fs_args` defaults in the same way as `load_args` and `save_args` and not have hardcoded values in the save methods.
* Fixed bug related to loading/saving models from/to remote storage using `TensorFlowModelDataset`.
* Fixed the deprecated load and save approaches of `GBQTableDataset` and `GBQQueryDataset` by invoking load and save directly through the `pandas-gbq` library
* Fixed incorrect `pandas` optional dependency

## Breaking Changes
## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
* [Brandon Meek](https://github.com/bpmeek)
* [yury-fedotov](https://github.com/yury-fedotov)
* [gitgud5000](https://github.com/gitgud5000)
* [janickspirig](https://github.com/janickspirig)
* [Galen Seilis](https://github.com/galenseilis)
* [Mariusz Wojakowski](https://github.com/mariusz89016)


# Release 4.1.0
@@ -16,5 +16,6 @@ kedro_datasets_experimental
langchain.ChatOpenAIDataset
langchain.OpenAIEmbeddingsDataset
netcdf.NetCDFDataset
prophet.ProphetModelDataset
pytorch.PyTorchDataset
rioxarray.GeoTIFFDataset
2 changes: 2 additions & 0 deletions kedro-datasets/docs/source/conf.py
@@ -140,6 +140,8 @@
"xarray.core.dataset.Dataset",
"xarray.core.dataarray.DataArray",
"torch.nn.modules.module.Module",
"prophet.forecaster.Prophet",
"Prophet",
),
"py:data": (
"typing.Any",
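These entries add the Prophet types to the cross-reference targets the docs build deliberately skips. The equivalent in a plain Sphinx `conf.py` would be `nitpick_ignore` entries; a generic sketch, not the kedro-datasets configuration itself:

```python
# Generic Sphinx conf.py fragment: silence "reference target not found"
# warnings for classes that are documented outside this project.
nitpick_ignore = [
    ("py:class", "prophet.forecaster.Prophet"),
    ("py:class", "Prophet"),
]
```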
20 changes: 11 additions & 9 deletions kedro-datasets/kedro_datasets/pandas/gbq_dataset.py
@@ -10,6 +10,7 @@

import fsspec
import pandas as pd
import pandas_gbq as pd_gbq
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
from google.oauth2.credentials import Credentials
@@ -138,16 +139,17 @@ def _describe(self) -> dict[str, Any]:

def _load(self) -> pd.DataFrame:
sql = f"select * from {self._dataset}.{self._table_name}" # nosec
self._load_args.setdefault("query", sql)
return pd.read_gbq(
self._load_args.setdefault("query_or_table", sql)
return pd_gbq.read_gbq(
project_id=self._project_id,
credentials=self._credentials,
**self._load_args,
)

def _save(self, data: pd.DataFrame) -> None:
data.to_gbq(
f"{self._dataset}.{self._table_name}",
pd_gbq.to_gbq(
dataframe=data,
destination_table=f"{self._dataset}.{self._table_name}",
project_id=self._project_id,
credentials=self._credentials,
**self._save_args,
Expand Down Expand Up @@ -176,7 +178,7 @@ def _validate_location(self):

class GBQQueryDataset(AbstractDataset[None, pd.DataFrame]):
"""``GBQQueryDataset`` loads data from a provided SQL query from Google
BigQuery. It uses ``pandas.read_gbq`` which itself uses ``pandas-gbq``
BigQuery. It uses ``pandas_gbq.read_gbq`` which itself uses ``pandas-gbq``
internally to read from BigQuery table. Therefore it supports all allowed
pandas options on ``read_gbq``.
@@ -274,7 +276,7 @@ def __init__( # noqa: PLR0913

# load sql query from arg or from file
if sql:
self._load_args["query"] = sql
self._load_args["query_or_table"] = sql
self._filepath = None
else:
# filesystem for loading sql file
@@ -291,7 +293,7 @@ def _describe(self) -> dict[str, Any]:
def _describe(self) -> dict[str, Any]:
load_args = copy.deepcopy(self._load_args)
desc = {}
desc["sql"] = str(load_args.pop("query", None))
desc["sql"] = str(load_args.pop("query_or_table", None))
desc["filepath"] = str(self._filepath)
desc["load_args"] = str(load_args)

@@ -303,9 +305,9 @@ def _load(self) -> pd.DataFrame:
if self._filepath:
load_path = get_filepath_str(PurePosixPath(self._filepath), self._protocol)
with self._fs.open(load_path, mode="r") as fs_file:
load_args["query"] = fs_file.read()
load_args["query_or_table"] = fs_file.read()

return pd.read_gbq(
return pd_gbq.read_gbq(
project_id=self._project_id,
credentials=self._credentials,
**load_args,
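The underlying change is moving off the deprecated `pd.read_gbq`/`DataFrame.to_gbq` wrappers and onto `pandas-gbq` called directly, which also renames the `query` argument to `query_or_table`. A rough standalone sketch of the same calls (project and table names are placeholders):

```python
import pandas_gbq as pd_gbq

project_id = "my-gcp-project"   # placeholder GCP project
table = "my_dataset.my_table"   # placeholder dataset.table

# Deprecated pandas wrappers this commit moves away from:
#   df = pd.read_gbq("select * from my_dataset.my_table", project_id=project_id)
#   df.to_gbq(table, project_id=project_id)

# Direct pandas-gbq calls, mirroring the refactored datasets:
df = pd_gbq.read_gbq(
    query_or_table=f"select * from {table}",
    project_id=project_id,
)
pd_gbq.to_gbq(
    dataframe=df,
    destination_table=table,
    project_id=project_id,
    if_exists="replace",  # overwrite the destination table if it exists
)
```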
@@ -144,7 +144,7 @@ def _load(self) -> tf.keras.Model:
# We assume .keras
path = str(PurePath(tempdir) / TEMPORARY_KERAS_FILE) # noqa: PLW2901

self._fs.copy(load_path, path)
self._fs.get(load_path, path)

# Pass the local temporary directory/file path to keras.load_model
device_name = self._load_args.pop("tf_device", None)
@@ -169,7 +169,7 @@ def _save(self, data: tf.keras.Model) -> None:

# Use fsspec to take from local tempfile directory/file and
# put in ArbitraryFileSystem
self._fs.copy(path, save_path)
self._fs.put(path, save_path)

def _exists(self) -> bool:
try:
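The fix swaps `fs.copy` for `fs.get`/`fs.put` because, in fsspec, `copy` duplicates a file between two paths on the same filesystem, whereas `get` downloads from that filesystem to local disk and `put` uploads from local disk to it, which is what the temporary-file handling above needs. A hedged illustration (the bucket and paths are invented, and an `s3fs`-style backend is assumed):

```python
import fsspec

# Any fsspec-backed remote filesystem behaves the same; "s3" needs s3fs installed.
fs = fsspec.filesystem("s3")

remote_path = "my-bucket/models/model.keras"   # hypothetical remote artefact
local_path = "/tmp/model.keras"                # local temporary file

fs.get(remote_path, local_path)   # load path: remote object -> local temp file
fs.put(local_path, remote_path)   # save path: local temp file -> remote object

# fs.copy(remote_path, "my-bucket/models/backup.keras") would stay entirely
# within S3, which is why it could not bridge remote storage and the tempdir.
```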
11 changes: 11 additions & 0 deletions kedro-datasets/kedro_datasets_experimental/prophet/__init__.py
@@ -0,0 +1,11 @@
"""``JSONDataset`` implementation to load/save data from/to a Prophet model file."""

from typing import Any

import lazy_loader as lazy

ProphetDataset: Any

__getattr__, __dir__, __all__ = lazy.attach(
__name__, submod_attrs={"prophet_dataset": ["ProphetModelDataset"]}
)
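
`lazy_loader.attach` defers importing the `prophet_dataset` submodule (and with it Prophet) until `ProphetModelDataset` is first accessed, so importing the parent package stays cheap. The same pattern on a hypothetical package:

```python
# mypkg/__init__.py - a hypothetical package using the same lazy-import pattern.
from typing import Any

import lazy_loader as lazy

HeavyDataset: Any  # annotation only; the real object is resolved lazily

__getattr__, __dir__, __all__ = lazy.attach(
    __name__, submod_attrs={"heavy_module": ["HeavyDataset"]}
)

# Usage elsewhere:
#   import mypkg                # does not import mypkg.heavy_module yet
#   mypkg.HeavyDataset          # first access triggers the submodule import
```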