Skip to content

Commit

Permalink
Merge branch 'main' into dependabot/pip/more-itertools-gte-9-and-lt-11
Browse files Browse the repository at this point in the history
  • Loading branch information
merelcht authored Jul 28, 2023
2 parents 66ff3d0 + 12d5e35 commit 7070e10
Show file tree
Hide file tree
Showing 13 changed files with 260 additions and 16 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/all-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ on:
push:
branches:
- main
- develop
paths-ignore:
- "docs/**"
pull_request:
branches:
- main
- develop
paths-ignore:
- "docs/**"

Expand Down Expand Up @@ -42,3 +44,13 @@ jobs:
with:
os: ${{ matrix.os }}
python-version: ${{ matrix.python-version }}

pip-compile:
strategy:
matrix:
os: [ ubuntu-latest, windows-latest ]
python-version: [ "3.7", "3.8", "3.9", "3.10" ]
uses: ./.github/workflows/pip-compile.yml
with:
os: ${{ matrix.os }}
python-version: ${{ matrix.python-version }}
2 changes: 2 additions & 0 deletions .github/workflows/docs-only-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ on:
push:
branches:
- main
- develop
paths:
- "docs/**"
pull_request:
branches:
- main
- develop
paths:
- "docs/**"

Expand Down
1 change: 0 additions & 1 deletion .github/workflows/e2e-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ jobs:
key: ${{inputs.os}}-python-${{inputs.python-version}}
- name: Install dependencies
run: |
pip --version
make install-test-requirements
make install-pre-commit
- name: pip freeze
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ jobs:
run: |
make install-test-requirements
make install-pre-commit
pip freeze
- name: pip freeze
run: pip freeze
- name: Run linter
run: make lint
2 changes: 1 addition & 1 deletion .github/workflows/merge-gatekeeper.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ jobs:
token: ${{ secrets.GITHUB_TOKEN }}
timeout: 1800
interval: 30
ignored: 'ci/circleci: win_e2e_tests-3.7,ci/circleci: win_pip_compile-3.9,ci/circleci: win_e2e_tests-3.9,ci/circleci: win_pip_compile-3.8,ci/circleci: lint-3.7,ci/circleci: win_pip_compile-3.7,ci/circleci: pip_compile-3.7,ci/circleci: e2e_tests-3.7,ci/circleci: win_unit_tests-3.7,ci/circleci: win_unit_tests-3.9,ci/circleci: e2e_tests-3.8,ci/circleci: win_unit_tests-3.10,ci/circleci: win_pip_compile-3.10,ci/circleci: win_unit_tests-3.8,ci/circleci: e2e_tests-3.9,ci/circleci: unit_tests-3.10,ci/circleci: unit_tests-3.8,ci/circleci: e2e_tests-3.10,ci/circleci: lint-3.8,ci/circleci: unit_tests-3.9,ci/circleci: unit_tests-3.7,ci/circleci: win_e2e_tests-3.10,ci/circleci: pip_compile-3.8,ci/circleci: pip_compile-3.10,ci/circleci: win_e2e_tests-3.8,ci/circleci: lint-3.9,ci/circleci: pip_compile-3.9,ci/circleci: lint-3.10,build_code,ci/circlecici: check-updated-files,regular'
ignored: 'ci/circleci: win_e2e_tests-3.7,ci/circleci: win_pip_compile-3.9,ci/circleci: win_e2e_tests-3.9,ci/circleci: win_pip_compile-3.8,ci/circleci: lint-3.7,ci/circleci: win_pip_compile-3.7,ci/circleci: pip_compile-3.7,ci/circleci: e2e_tests-3.7,ci/circleci: win_unit_tests-3.7,ci/circleci: win_unit_tests-3.9,ci/circleci: e2e_tests-3.8,ci/circleci: win_unit_tests-3.10,ci/circleci: win_pip_compile-3.10,ci/circleci: win_unit_tests-3.8,ci/circleci: e2e_tests-3.9,ci/circleci: unit_tests-3.10,ci/circleci: unit_tests-3.8,ci/circleci: e2e_tests-3.10,ci/circleci: lint-3.8,ci/circleci: unit_tests-3.9,ci/circleci: unit_tests-3.7,ci/circleci: win_e2e_tests-3.10,ci/circleci: pip_compile-3.8,ci/circleci: pip_compile-3.10,ci/circleci: win_e2e_tests-3.8,ci/circleci: lint-3.9,ci/circleci: pip_compile-3.9,ci/circleci: lint-3.10,build_code,ci/circleci: check-updated-files,regular'
39 changes: 39 additions & 0 deletions .github/workflows/pip-compile.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Reusable workflow: regenerates pinned requirements via `make pip-compile`
# for one (os, python-version) matrix cell. Invoked from all-checks.yml
# through `workflow_call`.
name: Run pip-compile

on:
  workflow_call:
    inputs:
      os:
        type: string
      python-version:
        type: string

jobs:
  pip-compile:
    runs-on: ${{ inputs.os }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Set up Python ${{inputs.python-version}}
        uses: actions/setup-python@v3
        with:
          python-version: ${{inputs.python-version}}
      # Upgrade pip/setuptools before any other installs.
      - run: make install-pip-setuptools
      # The pip cache lives in a different location per OS, hence two
      # conditional cache steps keyed on the same os/python pair.
      - name: Cache python packages for Linux
        if: inputs.os == 'ubuntu-latest'
        uses: actions/cache@v3
        with:
          path: ~/.cache/pip
          key: ${{inputs.os}}-python-${{inputs.python-version}}
      - name: Cache python packages for Windows
        if: inputs.os == 'windows-latest'
        uses: actions/cache@v3
        with:
          path: ~\AppData\Local\pip\Cache
          key: ${{inputs.os}}-python-${{inputs.python-version}}
      - name: Install dependencies
        run: |
          make install-test-requirements
          make install-pre-commit
      - name: Run pip-compile
        run: make pip-compile
6 changes: 3 additions & 3 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,15 @@ jobs:
run: pip install tables
- name: pip freeze
run: pip freeze
- name: Run unit tests
- name: Run unit tests sequentially
if: inputs.os == 'ubuntu-latest' && inputs.python-version == '3.10'
run: make test-sequential
- name: Run unit tests
if: inputs.os == 'ubuntu-latest' && inputs.python-version != '3.10'
run: make test
- name: Run unit tests (Windows)
- name: Run unit tests without spark sequentially (Windows)
if: inputs.os == 'windows-latest' && inputs.python-version == '3.10'
run: make test-no-spark-sequential
- name: Run unit tests (Windows)
- name: Run unit tests without spark (Windows)
if: inputs.os == 'windows-latest' && inputs.python-version != '3.10'
run: make test-no-spark
7 changes: 5 additions & 2 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@
## Major features and improvements
* Added dataset factories feature which uses pattern matching to reduce the number of catalog entries.
* Activated all built-in resolvers by default for `OmegaConfigLoader` except for `oc.env`.
* Added `kedro catalog rank` CLI command that ranks dataset factories in the catalog by matching priority.

## Bug fixes and other changes
* Consolidated dependencies and optional dependencies in `pyproject.toml`.
* Pin `pip<23.2` for CI due to a breaking change. See https://github.com/kedro-org/kedro/pull/2813
* Made validation of unique node outputs much faster.
* Updated `kedro catalog list` to show datasets generated with factories.
* Pinned `pip<23.2` for CI due to a breaking change. See https://github.com/kedro-org/kedro/pull/2813

## Documentation changes
- Recommended `ruff` as the linter and remove mentions of `pylint`, `isort`, `flake8`.
- Recommended `ruff` as the linter and removed mentions of `pylint`, `isort`, `flake8`.

## Breaking changes to the API

Expand Down
7 changes: 7 additions & 0 deletions docs/source/_static/css/theme-overrides.css
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,10 @@ img[alt^="mermaid-"] {
.rst-content .important .admonition-title {
background-color: #f0b37e;
}

/* Ensure the section title is visible when linked via a hash in the URL */
:target:before {
content: "";
display: block;
height: 80px;
}
9 changes: 9 additions & 0 deletions docs/source/development/commands_reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Here is a list of Kedro CLI commands, as a shortcut to the descriptions below. P
* [`kedro build-docs`](#build-the-project-documentation) (deprecated from version 0.19.0)
* [`kedro build-reqs`](#build-the-projects-dependency-tree) (deprecated from version 0.19.0)
* [`kedro catalog list`](#list-datasets-per-pipeline-per-type)
* [`kedro catalog rank`](#rank-dataset-factories-in-the-catalog)
* [`kedro catalog create`](#create-a-data-catalog-yaml-configuration-file)
* [`kedro ipython`](#notebooks)
* [`kedro jupyter convert`](#copy-tagged-cells) (deprecated from version 0.19.0)
Expand Down Expand Up @@ -491,6 +492,14 @@ The command also accepts an optional `--pipeline` argument that allows you to sp
kedro catalog list --pipeline=ds,de
```

##### Rank dataset factories in the catalog

```bash
kedro catalog rank
```

The output lists any [dataset factories](../data/data_catalog.md#load-multiple-datasets-with-similar-configuration-using-dataset-factories) defined in the catalog, ranked by the priority with which they are matched against dataset names.

#### Data Catalog

##### Create a Data Catalog YAML configuration file
Expand Down
44 changes: 39 additions & 5 deletions kedro/framework/cli/catalog.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""A collection of CLI commands for working with Kedro catalog."""
from collections import defaultdict
from itertools import chain

import click
import yaml
Expand Down Expand Up @@ -32,7 +33,7 @@ def catalog():
"""Commands for working with catalog."""


# noqa: too-many-locals
# noqa: too-many-locals,protected-access
@catalog.command("list")
@env_option
@click.option(
Expand All @@ -50,11 +51,14 @@ def list_datasets(metadata: ProjectMetadata, pipeline, env):
title = "Datasets in '{}' pipeline"
not_mentioned = "Datasets not mentioned in pipeline"
mentioned = "Datasets mentioned in pipeline"
factories = "Datasets generated from factories"

session = _create_session(metadata.package_name, env=env)
context = session.load_context()
datasets_meta = context.catalog._data_sets # noqa: protected-access
catalog_ds = set(context.catalog.list())

data_catalog = context.catalog
datasets_meta = data_catalog._data_sets
catalog_ds = set(data_catalog.list())

target_pipelines = pipeline or pipelines.keys()

Expand All @@ -73,15 +77,30 @@ def list_datasets(metadata: ProjectMetadata, pipeline, env):
default_ds = pipeline_ds - catalog_ds
used_ds = catalog_ds - unused_ds

# resolve any factory datasets in the pipeline
factory_ds_by_type = defaultdict(list)
for ds_name in default_ds:
matched_pattern = data_catalog._match_pattern(
data_catalog._dataset_patterns, ds_name
)
if matched_pattern:
ds_config = data_catalog._resolve_config(ds_name, matched_pattern)
factory_ds_by_type[ds_config["type"]].append(ds_name)

default_ds = default_ds - set(chain.from_iterable(factory_ds_by_type.values()))

unused_by_type = _map_type_to_datasets(unused_ds, datasets_meta)
used_by_type = _map_type_to_datasets(used_ds, datasets_meta)

if default_ds:
used_by_type["DefaultDataset"].extend(default_ds)

data = ((not_mentioned, dict(unused_by_type)), (mentioned, dict(used_by_type)))
data = (
(mentioned, dict(used_by_type)),
(factories, dict(factory_ds_by_type)),
(not_mentioned, dict(unused_by_type)),
)
result[title.format(pipe)] = {key: value for key, value in data if value}

secho(yaml.dump(result))


Expand Down Expand Up @@ -174,3 +193,18 @@ def _add_missing_datasets_to_catalog(missing_ds, catalog_path):
catalog_path.parent.mkdir(exist_ok=True)
with catalog_path.open(mode="w") as catalog_file:
yaml.safe_dump(catalog_config, catalog_file, default_flow_style=False)


@catalog.command("rank")
@env_option
@click.pass_obj
def rank_catalog_factories(metadata: ProjectMetadata, env):
    """List all dataset factories in the catalog, ranked by priority by which they are matched."""
    # Load the project context for the requested environment to reach its catalog.
    context = _create_session(metadata.package_name, env=env).load_context()

    # The pattern mapping preserves matching-priority order, so dumping the
    # keys as-is yields the ranking. noqa: protected-access
    patterns = context.catalog._dataset_patterns
    if not patterns:
        click.echo("There are no dataset factories in the catalog.")
        return
    click.echo(yaml.dump(list(patterns.keys())))
7 changes: 4 additions & 3 deletions kedro/pipeline/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,11 +480,12 @@ def _validate_inputs(self, func, inputs):
) from exc

def _validate_unique_outputs(self):
diff = Counter(self.outputs) - Counter(set(self.outputs))
cnt = Counter(self.outputs)
diff = {k for k in cnt if cnt[k] > 1}
if diff:
raise ValueError(
f"Failed to create node {self} due to duplicate"
f" output(s) {set(diff.keys())}.\nNode outputs must be unique."
f"Failed to create node {self} due to duplicate "
f"output(s) {diff}.\nNode outputs must be unique."
)

def _validate_inputs_dif_than_outputs(self):
Expand Down
Loading

0 comments on commit 7070e10

Please sign in to comment.