Commit

Merge branch 'main' into chore/deprecate-abstract-data-set
deepyaman authored Jul 6, 2023
2 parents 8c59dc3 + 9500d60 commit feef676
Showing 12 changed files with 662 additions and 63 deletions.
2 changes: 1 addition & 1 deletion CODEOWNERS
@@ -1,2 +1,2 @@
* @idanov
* @merelcht
docs/ @yetudada @astrojuanlu @stichbury
3 changes: 2 additions & 1 deletion RELEASE.md
@@ -3,7 +3,6 @@
## Major features and improvements

## Bug fixes and other changes
* Compare the protocol and delimiter in `PartitionedDataSet` so that the protocol can be passed to partitions whose paths start with the same characters as the protocol (e.g. `s3://s3-my-bucket`).

## Breaking changes to the API

@@ -12,6 +11,7 @@
# Upcoming Release 0.18.12

## Major features and improvements
* Added dataset factories feature which uses pattern matching to reduce the number of catalog entries.

## Bug fixes and other changes

@@ -35,6 +35,7 @@
## Bug fixes and other changes
* Reworked micropackaging workflow to use standard Python packaging practices.
* Make `kedro micropkg package` accept `--verbose`.
* Compare the protocol and delimiter in `PartitionedDataSet` so that the protocol can be passed to partitions whose paths start with the same characters as the protocol (e.g. `s3://s3-my-bucket`).

## Documentation changes
* Significant improvements to the documentation that covers working with Databricks and Kedro, including a new page for workspace-only development, and a guide to choosing the best workflow for your use case.
3 changes: 2 additions & 1 deletion dependency/requirements.txt
@@ -13,8 +13,9 @@ importlib_resources>=1.3 # The `files()` API was introduced in `importlib_resou
jmespath>=0.9.5, <1.0
more_itertools~=9.0
omegaconf~=2.3
parse~=1.19.0
pip-tools~=6.5
pluggy~=1.0.0
pluggy~=1.0
PyYAML>=4.2, <7.0
rich>=12.0, <14.0
rope>=0.21, <2.0 # subject to LGPLv3 license
3 changes: 0 additions & 3 deletions docs/source/conf.py
@@ -202,9 +202,6 @@
# to ignore or allow certain links.
html_extra_path = [str(here / "robots.txt")]

# Likewise for search console verification
html_extra_path = [str(here / "googlebce3ad2fda582ae8.html")]

# Removes, from all docs, the copyright footer.
html_show_copyright = False

207 changes: 206 additions & 1 deletion docs/source/data/data_catalog.md
@@ -404,7 +404,7 @@ CSVDataSet(
```


## Load multiple datasets with similar configuration
## Load multiple datasets with similar configuration using YAML anchors

Different datasets might use the same file format, load and save arguments, and be stored in the same folder. [YAML has a built-in syntax](https://yaml.org/spec/1.2.1/#Syntax) for factorising parts of a YAML file, which means that you can decide what is generalisable across your datasets, so that you need not spend time copying and pasting dataset configurations in the `catalog.yml` file.

@@ -461,6 +461,211 @@ airplanes:
In this example, the default `csv` configuration is inserted into `airplanes` and then the `load_args` block is overridden. Normally, that would replace the whole dictionary. In order to extend `load_args`, the defaults for that block are then re-inserted.
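
The collapsed example above follows roughly this shape (an illustrative sketch with made-up values, not the exact lines hidden by the diff):

```yaml
# Shared defaults live under a leading-underscore key, which the catalog
# does not treat as a dataset entry (the values here are placeholders).
_csv: &csv
  type: pandas.CSVDataSet
  load_args: &csv_load_args
    sep: ","
    na_values: ["#NA", "NA"]

airplanes:
  <<: *csv                            # insert the default csv configuration
  filepath: data/01_raw/airplanes.csv
  load_args:                          # overriding load_args replaces the whole block...
    <<: *csv_load_args                # ...so re-insert its defaults before extending them
    header: 0
```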

## Load multiple datasets with similar configuration using dataset factories
For catalog entries that share configuration details, you can also use the dataset factories introduced in Kedro 0.18.12. This syntax allows you to generalise the configuration and
reduce the number of similar catalog entries by matching the datasets used in your project's pipelines against dataset factory patterns.

### Example 1: Generalise datasets with similar names and types into one dataset factory
Consider the following catalog entries:
```yaml
factory_data:
  type: pandas.CSVDataSet
  filepath: data/01_raw/factory_data.csv
process_data:
  type: pandas.CSVDataSet
  filepath: data/01_raw/process_data.csv
```
The datasets in this catalog can be generalised to the following dataset factory:
```yaml
"{name}_data":
type: pandas.CSVDataSet
filepath: data/01_raw/{name}_data.csv
```
When `factory_data` or `process_data` is used in your pipeline, it is matched to the factory pattern `{name}_data`. The factory pattern must always be enclosed in
quotes to avoid YAML parsing errors.
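
For instance, a node that reads `factory_data` resolves through this pattern at run time. A minimal sketch follows; `preprocess_factory` is a hypothetical node function, not part of this diff:

```python
from kedro.pipeline import node, pipeline


def preprocess_factory(df):
    # Hypothetical node function, used only to make the snippet self-contained.
    return df


# "factory_data" matches the "{name}_data" pattern with name="factory", so Kedro
# loads it as a pandas.CSVDataSet from data/01_raw/factory_data.csv.
pipeline(
    [
        node(
            func=preprocess_factory,
            inputs="factory_data",
            outputs="preprocessed_factory_data",
            name="preprocess_factory_node",
        )
    ]
)
```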


### Example 2: Generalise datasets of the same type into one dataset factory
You can also combine all the datasets with the same type and configuration details. For example, consider the following
catalog with three datasets named `boats`, `cars` and `planes` of the type `pandas.CSVDataSet`:
```yaml
boats:
  type: pandas.CSVDataSet
  filepath: data/01_raw/boats.csv
cars:
  type: pandas.CSVDataSet
  filepath: data/01_raw/cars.csv
planes:
  type: pandas.CSVDataSet
  filepath: data/01_raw/planes.csv
```
These datasets can be combined into the following dataset factory:
```yaml
"{dataset_name}#csv":
type: pandas.CSVDataSet
filepath: data/01_raw/{dataset_name}.csv
```
You will then need to update the pipelines in your project, located at `src/<project_name>/pipelines/<pipeline_name>/pipeline.py`, to refer to these datasets as `boats#csv`,
`cars#csv` and `planes#csv`. Adding a suffix or a prefix, like `#csv` here, to the dataset names and the dataset factory pattern ensures that the dataset
names are matched with the intended pattern.
```python
from kedro.pipeline import Pipeline, node, pipeline

from .nodes import (
    create_model_input_table,
    preprocess_boats,
    preprocess_cars,
    preprocess_planes,
)


def create_pipeline(**kwargs) -> Pipeline:
    return pipeline(
        [
            node(
                func=preprocess_boats,
                inputs="boats#csv",
                outputs="preprocessed_boats",
                name="preprocess_boats_node",
            ),
            node(
                func=preprocess_cars,
                inputs="cars#csv",
                outputs="preprocessed_cars",
                name="preprocess_cars_node",
            ),
            node(
                func=preprocess_planes,
                inputs="planes#csv",
                outputs="preprocessed_planes",
                name="preprocess_planes_node",
            ),
            node(
                func=create_model_input_table,
                inputs=[
                    "preprocessed_boats",
                    "preprocessed_planes",
                    "preprocessed_cars",
                ],
                outputs="model_input_table",
                name="create_model_input_table_node",
            ),
        ]
    )
```
### Example 3: Generalise datasets using namespaces into one dataset factory
You can also generalise the catalog entries for datasets belonging to namespaced modular pipelines. Consider the
following pipeline which takes in a `model_input_table` and outputs two regressors belonging to the
`active_modelling_pipeline` and the `candidate_modelling_pipeline` namespaces:
```python
from kedro.pipeline import Pipeline, node
from kedro.pipeline.modular_pipeline import pipeline

from .nodes import evaluate_model, split_data, train_model


def create_pipeline(**kwargs) -> Pipeline:
    pipeline_instance = pipeline(
        [
            node(
                func=split_data,
                inputs=["model_input_table", "params:model_options"],
                outputs=["X_train", "y_train"],
                name="split_data_node",
            ),
            node(
                func=train_model,
                inputs=["X_train", "y_train"],
                outputs="regressor",
                name="train_model_node",
            ),
        ]
    )
    ds_pipeline_1 = pipeline(
        pipe=pipeline_instance,
        inputs="model_input_table",
        namespace="active_modelling_pipeline",
    )
    ds_pipeline_2 = pipeline(
        pipe=pipeline_instance,
        inputs="model_input_table",
        namespace="candidate_modelling_pipeline",
    )
    return ds_pipeline_1 + ds_pipeline_2
```
You can now have one dataset factory pattern in your catalog instead of two separate entries for `active_modelling_pipeline.regressor`
and `candidate_modelling_pipeline.regressor` as below:
```yaml
"{namespace}.regressor":
  type: pickle.PickleDataSet
  filepath: data/06_models/regressor_{namespace}.pkl
  versioned: true
```
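
With this pattern, the two namespaced outputs resolve as if the catalog contained the following entries, shown here only to illustrate how the `{namespace}` placeholder is filled in:

```yaml
active_modelling_pipeline.regressor:
  type: pickle.PickleDataSet
  filepath: data/06_models/regressor_active_modelling_pipeline.pkl
  versioned: true
candidate_modelling_pipeline.regressor:
  type: pickle.PickleDataSet
  filepath: data/06_models/regressor_candidate_modelling_pipeline.pkl
  versioned: true
```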
### Example 4: Generalise datasets of the same type in different layers into one dataset factory with multiple placeholders

You can use multiple placeholders in the same pattern. For example, consider the following catalog where the dataset
entries share `type`, `file_format` and `save_args`:
```yaml
processing.factory_data:
  type: spark.SparkDataSet
  filepath: data/processing/factory_data.pq
  file_format: parquet
  save_args:
    mode: overwrite
processing.process_data:
  type: spark.SparkDataSet
  filepath: data/processing/process_data.pq
  file_format: parquet
  save_args:
    mode: overwrite
modelling.metrics:
  type: spark.SparkDataSet
  filepath: data/modelling/metrics.pq
  file_format: parquet
  save_args:
    mode: overwrite
```
This could be generalised to the following pattern:
```yaml
"{layer}.{dataset_name}":
type: spark.SparkDataSet
filepath: data/{layer}/{dataset_name}.pq
file_format: parquet
save_args:
mode: overwrite
```
All the placeholders used in the catalog entry body must exist in the factory pattern name.
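
For example, a factory like the following cannot be resolved, because `{environment}` appears in the entry body but not in the pattern name (an illustrative invalid entry, not taken from this diff):

```yaml
# Invalid: the body uses {environment}, which the pattern name does not capture.
"{layer}.{dataset_name}":
  type: spark.SparkDataSet
  filepath: data/{environment}/{layer}/{dataset_name}.pq
  file_format: parquet
```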

### Example 5: Generalise datasets using multiple dataset factories
You can have multiple dataset factories in your catalog. For example:
```yaml
"{namespace}.{dataset_name}@spark":
type: spark.SparkDataSet
filepath: data/{namespace}/{dataset_name}.pq
file_format: parquet
"{dataset_name}@csv":
type: pandas.CSVDataSet
filepath: data/01_raw/{dataset_name}.csv
```

Having multiple dataset factories in your catalog can lead to a situation where a dataset name from your pipeline matches multiple patterns.
To resolve this, Kedro sorts all the patterns that match the dataset name and picks the best one.
The matches are ranked according to the following criteria (illustrated by the sketch after the list):
1. Number of exact character matches between the dataset name and the factory pattern. For example, a dataset named `factory_data$csv` would match `{dataset}_data$csv` over `{dataset_name}$csv`.
2. Number of placeholders. For example, the dataset `preprocessing.shuttles+csv` would match `{namespace}.{dataset}+csv` over `{dataset}+csv`.
3. Alphabetical order
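
The following standalone sketch illustrates the ranking with the `parse` library (added to the requirements in this PR). It is not Kedro's internal implementation, and the helper names are invented for the example:

```python
import re
from typing import List, Optional

from parse import parse


def _specificity(pattern: str) -> int:
    # Count the characters of the pattern that sit outside any {placeholder}.
    return len(re.sub(r"\{.*?\}", "", pattern))


def best_match(patterns: List[str], dataset_name: str) -> Optional[str]:
    # Keep only the patterns that actually match the dataset name.
    matching = [p for p in patterns if parse(p, dataset_name) is not None]
    if not matching:
        return None
    # Rank by most exact characters, then fewest placeholders, then alphabetical order.
    return sorted(matching, key=lambda p: (-_specificity(p), p.count("{"), p))[0]


print(best_match(["{dataset}_data$csv", "{dataset_name}$csv"], "factory_data$csv"))
# -> {dataset}_data$csv
print(best_match(["{namespace}.{dataset}+csv", "{dataset}+csv"], "preprocessing.shuttles+csv"))
# -> {namespace}.{dataset}+csv
```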

### Example 6: Generalise all datasets with a catch-all dataset factory to override the default `MemoryDataSet`
You can use dataset factories to define a catch-all pattern that overrides the default `MemoryDataSet` creation.
```yaml
"{default_dataset}":
type: pandas.CSVDataSet
filepath: data/{default_dataset}.csv
```
Kedro will now treat every dataset mentioned in your project's pipelines that does not match a more specific pattern or an explicit entry in your catalog
as a `pandas.CSVDataSet`.
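
For instance, under this catch-all pattern a dataset named `reviews` that appears in a pipeline but not in the catalog behaves as if the following entry existed (`reviews` is a hypothetical dataset name used only for illustration):

```yaml
reviews:
  type: pandas.CSVDataSet
  filepath: data/reviews.csv
```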

## Transcode datasets

1 change: 0 additions & 1 deletion docs/source/googlebce3ad2fda582ae8.html

This file was deleted.

4 changes: 2 additions & 2 deletions kedro/framework/cli/cli.py
@@ -131,7 +131,7 @@ def main(
        # subcommand, arguments and options. click doesn't store this information anywhere
        # so we have to re-do it.
        args = sys.argv[1:] if args is None else list(args)
        self._cli_hook_manager.hook.before_command_run(  # pylint: disable=no-member
        self._cli_hook_manager.hook.before_command_run(
            project_metadata=self._metadata, command_args=args
        )

@@ -146,7 +146,7 @@ def main(
        # click.core.main() method exits by default, we capture this and then
        # exit as originally intended
        except SystemExit as exc:
            self._cli_hook_manager.hook.after_command_run(  # pylint: disable=no-member
            self._cli_hook_manager.hook.after_command_run(
                project_metadata=self._metadata, command_args=args, exit_code=exc.code
            )
            sys.exit(exc.code)
14 changes: 8 additions & 6 deletions kedro/framework/session/session.py
@@ -272,9 +272,7 @@ def load_context(self) -> KedroContext:
            extra_params=extra_params,
            hook_manager=self._hook_manager,
        )
        self._hook_manager.hook.after_context_created(  # pylint: disable=no-member
            context=context
        )
        self._hook_manager.hook.after_context_created(context=context)

        return context

@@ -354,7 +352,6 @@ def run(  # pylint: disable=too-many-arguments,too-many-locals
        These are returned in a dictionary, where the keys are defined
        by the node outputs.
        """
        # pylint: disable=protected-access,no-member
        # Report project name
        self._logger.info("Kedro project %s", self._project_path.name)

@@ -409,15 +406,20 @@ def run(  # pylint: disable=too-many-arguments,too-many-locals
"runner": getattr(runner, "__name__", str(runner)),
}

catalog = context._get_catalog(
catalog = context._get_catalog( # pylint: disable=protected-access
save_version=save_version,
load_versions=load_versions,
)

# Run the runner
hook_manager = self._hook_manager
runner = runner or SequentialRunner()
hook_manager.hook.before_pipeline_run( # pylint: disable=no-member
        if not isinstance(runner, AbstractRunner):
            raise KedroSessionError(
                "KedroSession expects an instance of Runner instead of a class. "
                "Have you forgotten the `()` at the end of the statement?"
            )
        hook_manager.hook.before_pipeline_run(
            run_params=record_data, pipeline=filtered_pipeline, catalog=catalog
        )
