Skip to content

Commit

Permalink
Merge branch 'main' into dependabot/pip/more-itertools-gte-9-and-lt-11
Browse files Browse the repository at this point in the history
  • Loading branch information
merelcht authored Jul 28, 2023
2 parents 66ff3d0 + 12d5e35 commit 7070e10
Show file tree
Hide file tree
Showing 13 changed files with 260 additions and 16 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/all-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ on:
push:
branches:
- main
- develop
paths-ignore:
- "docs/**"
pull_request:
branches:
- main
- develop
paths-ignore:
- "docs/**"

Expand Down Expand Up @@ -42,3 +44,13 @@ jobs:
with:
os: ${{ matrix.os }}
python-version: ${{ matrix.python-version }}

pip-compile:
strategy:
matrix:
os: [ ubuntu-latest, windows-latest ]
python-version: [ "3.7", "3.8", "3.9", "3.10" ]
uses: ./.github/workflows/pip-compile.yml
with:
os: ${{ matrix.os }}
python-version: ${{ matrix.python-version }}
2 changes: 2 additions & 0 deletions .github/workflows/docs-only-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ on:
push:
branches:
- main
- develop
paths:
- "docs/**"
pull_request:
branches:
- main
- develop
paths:
- "docs/**"

Expand Down
1 change: 0 additions & 1 deletion .github/workflows/e2e-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ jobs:
key: ${{inputs.os}}-python-${{inputs.python-version}}
- name: Install dependencies
run: |
pip --version
make install-test-requirements
make install-pre-commit
- name: pip freeze
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ jobs:
run: |
make install-test-requirements
make install-pre-commit
pip freeze
- name: pip freeze
run: pip freeze
- name: Run linter
run: make lint
2 changes: 1 addition & 1 deletion .github/workflows/merge-gatekeeper.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ jobs:
token: ${{ secrets.GITHUB_TOKEN }}
timeout: 1800
interval: 30
ignored: 'ci/circleci: win_e2e_tests-3.7,ci/circleci: win_pip_compile-3.9,ci/circleci: win_e2e_tests-3.9,ci/circleci: win_pip_compile-3.8,ci/circleci: lint-3.7,ci/circleci: win_pip_compile-3.7,ci/circleci: pip_compile-3.7,ci/circleci: e2e_tests-3.7,ci/circleci: win_unit_tests-3.7,ci/circleci: win_unit_tests-3.9,ci/circleci: e2e_tests-3.8,ci/circleci: win_unit_tests-3.10,ci/circleci: win_pip_compile-3.10,ci/circleci: win_unit_tests-3.8,ci/circleci: e2e_tests-3.9,ci/circleci: unit_tests-3.10,ci/circleci: unit_tests-3.8,ci/circleci: e2e_tests-3.10,ci/circleci: lint-3.8,ci/circleci: unit_tests-3.9,ci/circleci: unit_tests-3.7,ci/circleci: win_e2e_tests-3.10,ci/circleci: pip_compile-3.8,ci/circleci: pip_compile-3.10,ci/circleci: win_e2e_tests-3.8,ci/circleci: lint-3.9,ci/circleci: pip_compile-3.9,ci/circleci: lint-3.10,build_code,ci/circlecici: check-updated-files,regular'
ignored: 'ci/circleci: win_e2e_tests-3.7,ci/circleci: win_pip_compile-3.9,ci/circleci: win_e2e_tests-3.9,ci/circleci: win_pip_compile-3.8,ci/circleci: lint-3.7,ci/circleci: win_pip_compile-3.7,ci/circleci: pip_compile-3.7,ci/circleci: e2e_tests-3.7,ci/circleci: win_unit_tests-3.7,ci/circleci: win_unit_tests-3.9,ci/circleci: e2e_tests-3.8,ci/circleci: win_unit_tests-3.10,ci/circleci: win_pip_compile-3.10,ci/circleci: win_unit_tests-3.8,ci/circleci: e2e_tests-3.9,ci/circleci: unit_tests-3.10,ci/circleci: unit_tests-3.8,ci/circleci: e2e_tests-3.10,ci/circleci: lint-3.8,ci/circleci: unit_tests-3.9,ci/circleci: unit_tests-3.7,ci/circleci: win_e2e_tests-3.10,ci/circleci: pip_compile-3.8,ci/circleci: pip_compile-3.10,ci/circleci: win_e2e_tests-3.8,ci/circleci: lint-3.9,ci/circleci: pip_compile-3.9,ci/circleci: lint-3.10,build_code,ci/circleci: check-updated-files,regular'
39 changes: 39 additions & 0 deletions .github/workflows/pip-compile.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Reusable workflow: regenerates pinned requirements via `make pip-compile`
# for one (os, python-version) matrix cell. Invoked from all-checks.yml
# through `workflow_call`.
name: Run pip-compile

on:
  workflow_call:
    inputs:
      os:
        type: string
      python-version:
        type: string

jobs:
  pip-compile:
    runs-on: ${{ inputs.os }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Set up Python ${{inputs.python-version}}
        uses: actions/setup-python@v3
        with:
          python-version: ${{inputs.python-version}}
      # Upgrade pip/setuptools before any other installs.
      - run: make install-pip-setuptools
      # The pip cache lives in a different location per OS, hence two
      # conditional cache steps keyed on the same os/python pair.
      - name: Cache python packages for Linux
        if: inputs.os == 'ubuntu-latest'
        uses: actions/cache@v3
        with:
          path: ~/.cache/pip
          key: ${{inputs.os}}-python-${{inputs.python-version}}
      - name: Cache python packages for Windows
        if: inputs.os == 'windows-latest'
        uses: actions/cache@v3
        with:
          path: ~\AppData\Local\pip\Cache
          key: ${{inputs.os}}-python-${{inputs.python-version}}
      - name: Install dependencies
        run: |
          make install-test-requirements
          make install-pre-commit
      - name: Run pip-compile
        run: make pip-compile
6 changes: 3 additions & 3 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,15 @@ jobs:
run: pip install tables
- name: pip freeze
run: pip freeze
- name: Run unit tests
- name: Run unit tests sequentially
if: inputs.os == 'ubuntu-latest' && inputs.python-version == '3.10'
run: make test-sequential
- name: Run unit tests
if: inputs.os == 'ubuntu-latest' && inputs.python-version != '3.10'
run: make test
- name: Run unit tests (Windows)
- name: Run unit tests without spark sequentially (Windows)
if: inputs.os == 'windows-latest' && inputs.python-version == '3.10'
run: make test-no-spark-sequential
- name: Run unit tests (Windows)
- name: Run unit tests without spark (Windows)
if: inputs.os == 'windows-latest' && inputs.python-version != '3.10'
run: make test-no-spark
7 changes: 5 additions & 2 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@
## Major features and improvements
* Added dataset factories feature which uses pattern matching to reduce the number of catalog entries.
* Activated all built-in resolvers by default for `OmegaConfigLoader` except for `oc.env`.
* Added `kedro catalog rank` CLI command that ranks dataset factories in the catalog by matching priority.

## Bug fixes and other changes
* Consolidated dependencies and optional dependencies in `pyproject.toml`.
* Pin `pip<23.2` for CI due to a breaking change. See https://github.com/kedro-org/kedro/pull/2813
* Made validation of unique node outputs much faster.
* Updated `kedro catalog list` to show datasets generated with factories.
* Pinned `pip<23.2` for CI due to a breaking change. See https://github.com/kedro-org/kedro/pull/2813

## Documentation changes
- Recommended `ruff` as the linter and remove mentions of `pylint`, `isort`, `flake8`.
- Recommended `ruff` as the linter and removed mentions of `pylint`, `isort`, `flake8`.

## Breaking changes to the API

Expand Down
7 changes: 7 additions & 0 deletions docs/source/_static/css/theme-overrides.css
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,10 @@ img[alt^="mermaid-"] {
.rst-content .important .admonition-title {
background-color: #f0b37e;
}

/* Ensure the section title is visible when linked via a hash in the URL */
:target:before {
content: "";
display: block;
height: 80px;
}
9 changes: 9 additions & 0 deletions docs/source/development/commands_reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Here is a list of Kedro CLI commands, as a shortcut to the descriptions below. P
* [`kedro build-docs`](#build-the-project-documentation) (deprecated from version 0.19.0)
* [`kedro build-reqs`](#build-the-projects-dependency-tree) (deprecated from version 0.19.0)
* [`kedro catalog list`](#list-datasets-per-pipeline-per-type)
* [`kedro catalog rank`](#rank-dataset-factories-in-the-catalog)
* [`kedro catalog create`](#create-a-data-catalog-yaml-configuration-file)
* [`kedro ipython`](#notebooks)
* [`kedro jupyter convert`](#copy-tagged-cells) (deprecated from version 0.19.0)
Expand Down Expand Up @@ -491,6 +492,14 @@ The command also accepts an optional `--pipeline` argument that allows you to sp
kedro catalog list --pipeline=ds,de
```

##### Rank dataset factories in the catalog

```bash
kedro catalog rank
```

The output lists any [dataset factories](../data/data_catalog.md#load-multiple-datasets-with-similar-configuration-using-dataset-factories) defined in the catalog, ranked by the priority with which they are matched against dataset names.

#### Data Catalog

##### Create a Data Catalog YAML configuration file
Expand Down
44 changes: 39 additions & 5 deletions kedro/framework/cli/catalog.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""A collection of CLI commands for working with Kedro catalog."""
from collections import defaultdict
from itertools import chain

import click
import yaml
Expand Down Expand Up @@ -32,7 +33,7 @@ def catalog():
"""Commands for working with catalog."""


# noqa: too-many-locals
# noqa: too-many-locals,protected-access
@catalog.command("list")
@env_option
@click.option(
Expand All @@ -50,11 +51,14 @@ def list_datasets(metadata: ProjectMetadata, pipeline, env):
title = "Datasets in '{}' pipeline"
not_mentioned = "Datasets not mentioned in pipeline"
mentioned = "Datasets mentioned in pipeline"
factories = "Datasets generated from factories"

session = _create_session(metadata.package_name, env=env)
context = session.load_context()
datasets_meta = context.catalog._data_sets # noqa: protected-access
catalog_ds = set(context.catalog.list())

data_catalog = context.catalog
datasets_meta = data_catalog._data_sets
catalog_ds = set(data_catalog.list())

target_pipelines = pipeline or pipelines.keys()

Expand All @@ -73,15 +77,30 @@ def list_datasets(metadata: ProjectMetadata, pipeline, env):
default_ds = pipeline_ds - catalog_ds
used_ds = catalog_ds - unused_ds

# resolve any factory datasets in the pipeline
factory_ds_by_type = defaultdict(list)
for ds_name in default_ds:
matched_pattern = data_catalog._match_pattern(
data_catalog._dataset_patterns, ds_name
)
if matched_pattern:
ds_config = data_catalog._resolve_config(ds_name, matched_pattern)
factory_ds_by_type[ds_config["type"]].append(ds_name)

default_ds = default_ds - set(chain.from_iterable(factory_ds_by_type.values()))

unused_by_type = _map_type_to_datasets(unused_ds, datasets_meta)
used_by_type = _map_type_to_datasets(used_ds, datasets_meta)

if default_ds:
used_by_type["DefaultDataset"].extend(default_ds)

data = ((not_mentioned, dict(unused_by_type)), (mentioned, dict(used_by_type)))
data = (
(mentioned, dict(used_by_type)),
(factories, dict(factory_ds_by_type)),
(not_mentioned, dict(unused_by_type)),
)
result[title.format(pipe)] = {key: value for key, value in data if value}

secho(yaml.dump(result))


Expand Down Expand Up @@ -174,3 +193,18 @@ def _add_missing_datasets_to_catalog(missing_ds, catalog_path):
catalog_path.parent.mkdir(exist_ok=True)
with catalog_path.open(mode="w") as catalog_file:
yaml.safe_dump(catalog_config, catalog_file, default_flow_style=False)


@catalog.command("rank")
@env_option
@click.pass_obj
def rank_catalog_factories(metadata: ProjectMetadata, env):
    """List all dataset factories in the catalog, ranked by priority by which they are matched."""
    # Load the project context for the requested environment to reach its catalog.
    context = _create_session(metadata.package_name, env=env).load_context()

    # The pattern mapping preserves matching-priority order, so dumping the
    # keys as-is yields the ranking. noqa: protected-access
    patterns = context.catalog._dataset_patterns
    if not patterns:
        click.echo("There are no dataset factories in the catalog.")
        return
    click.echo(yaml.dump(list(patterns.keys())))
7 changes: 4 additions & 3 deletions kedro/pipeline/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,11 +480,12 @@ def _validate_inputs(self, func, inputs):
) from exc

def _validate_unique_outputs(self):
diff = Counter(self.outputs) - Counter(set(self.outputs))
cnt = Counter(self.outputs)
diff = {k for k in cnt if cnt[k] > 1}
if diff:
raise ValueError(
f"Failed to create node {self} due to duplicate"
f" output(s) {set(diff.keys())}.\nNode outputs must be unique."
f"Failed to create node {self} due to duplicate "
f"output(s) {diff}.\nNode outputs must be unique."
)

def _validate_inputs_dif_than_outputs(self):
Expand Down
Loading

0 comments on commit 7070e10

Please sign in to comment.