Skip to content

Commit

Permalink
Add kedro catalog rank command (#2848)
Browse files Browse the repository at this point in the history
* Add command

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

* Add positive test

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

* Add negative test

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

* Lint

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

* Add changes to RELEASE.md

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

* Add changes to documentation

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

* Add detail to release note

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

* Make lint

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

---------

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>
  • Loading branch information
AhdraMeraliQB authored Jul 27, 2023
1 parent 3bcf2b1 commit 9cd8a5b
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 0 deletions.
1 change: 1 addition & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
## Major features and improvements
* Added dataset factories feature which uses pattern matching to reduce the number of catalog entries.
* Activated all built-in resolvers by default for `OmegaConfigLoader` except for `oc.env`.
* Added `kedro catalog rank` CLI command that ranks dataset factories in the catalog by matching priority.

## Bug fixes and other changes
* Consolidated dependencies and optional dependencies in `pyproject.toml`.
Expand Down
9 changes: 9 additions & 0 deletions docs/source/development/commands_reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Here is a list of Kedro CLI commands, as a shortcut to the descriptions below. P
* [`kedro build-docs`](#build-the-project-documentation) (deprecated from version 0.19.0)
* [`kedro build-reqs`](#build-the-projects-dependency-tree) (deprecated from version 0.19.0)
* [`kedro catalog list`](#list-datasets-per-pipeline-per-type)
* [`kedro catalog rank`](#rank-dataset-factories-in-the-catalog)
* [`kedro catalog create`](#create-a-data-catalog-yaml-configuration-file)
* [`kedro ipython`](#notebooks)
* [`kedro jupyter convert`](#copy-tagged-cells) (deprecated from version 0.19.0)
Expand Down Expand Up @@ -491,6 +492,14 @@ The command also accepts an optional `--pipeline` argument that allows you to sp
kedro catalog list --pipeline=ds,de
```

##### Rank dataset factories in the catalog

```bash
kedro catalog rank
```

The output lists any [dataset factories](../data/data_catalog.md#load-multiple-datasets-with-similar-configuration-using-dataset-factories) in the catalog, ranked in the order of priority in which they are matched.

#### Data Catalog

##### Create a Data Catalog YAML configuration file
Expand Down
15 changes: 15 additions & 0 deletions kedro/framework/cli/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,3 +193,18 @@ def _add_missing_datasets_to_catalog(missing_ds, catalog_path):
catalog_path.parent.mkdir(exist_ok=True)
with catalog_path.open(mode="w") as catalog_file:
yaml.safe_dump(catalog_config, catalog_file, default_flow_style=False)


@catalog.command("rank")
@env_option
@click.pass_obj
def rank_catalog_factories(metadata: ProjectMetadata, env):
    """List all dataset factories in the catalog, ranked by priority by which they are matched."""
    # NOTE(review): the docstring above presumably doubles as the CLI help
    # text for `kedro catalog rank` (click's usual behaviour), so it is kept
    # verbatim.
    kedro_session = _create_session(metadata.package_name, env=env)
    project_context = kedro_session.load_context()

    # The patterns are printed in the order the catalog stores them; this
    # command does no sorting of its own.
    factory_patterns = project_context.catalog._dataset_patterns
    if not factory_patterns:
        click.echo("There are no dataset factories in the catalog.")
        return

    # Only the pattern names are emitted, as a YAML list.
    click.echo(yaml.dump(list(factory_patterns)))
84 changes: 84 additions & 0 deletions tests/framework/cli/test_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,33 @@ def fake_catalog_config():
return config


@pytest.fixture
def fake_catalog_with_overlapping_factories():
    """Return a catalog config with one explicit dataset and four factory patterns.

    The patterns deliberately overlap (several could match the same dataset
    name), and they are inserted in an order that differs from the ranking
    the tests expect.
    """
    # Insertion order matters: it is the order of keys in the returned dict.
    overlapping_patterns = (
        "an_example_{placeholder}",
        "an_example_{place}_{holder}",
        "on_{example_placeholder}",
        "an_{example_placeholder}",
    )

    # The only entry whose name contains no placeholder.
    catalog_config = {
        "an_example_dataset": {
            "type": "pandas.CSVDataSet",
            "filepath": "dummy_filepath",
        },
    }
    for pattern in overlapping_patterns:
        # A fresh dict per pattern so the entries never share state.
        catalog_config[pattern] = {
            "type": "dummy_type",
            "filepath": "dummy_filepath",
        }
    return catalog_config


@pytest.mark.usefixtures(
"chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
)
Expand Down Expand Up @@ -360,3 +387,60 @@ def test_bad_env(self, fake_project_cli, fake_metadata):

assert result.exit_code
assert "Unable to instantiate Kedro session" in result.output


@pytest.mark.usefixtures(
    "chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
)
def test_rank_catalog_factories(
    fake_project_cli,
    fake_metadata,
    mocker,
    fake_load_context,
    fake_catalog_with_overlapping_factories,
):
    """`kedro catalog rank` passes the ranked factory patterns to `yaml.dump`."""
    # The order the command is expected to report. The explicit
    # `an_example_dataset` entry is not a factory and must be absent.
    ranked_patterns = [
        "an_example_{place}_{holder}",
        "an_example_{placeholder}",
        "an_{example_placeholder}",
        "on_{example_placeholder}",
    ]

    # Intercept yaml.dump so the list handed to it can be inspected directly.
    dump_spy = mocker.patch("yaml.dump", return_value="Result YAML")
    context_mock = fake_load_context.return_value
    context_mock.catalog = DataCatalog.from_config(
        fake_catalog_with_overlapping_factories
    )

    cli_result = CliRunner().invoke(
        fake_project_cli, ["catalog", "rank"], obj=fake_metadata
    )

    assert not cli_result.exit_code
    assert dump_spy.call_count == 1
    # Only the first positional argument of the single call is checked.
    assert dump_spy.call_args[0][0] == ranked_patterns


@pytest.mark.usefixtures(
    "chdir_to_dummy_project",
    "fake_load_context",
)
def test_rank_catalog_factories_with_no_factories(
    fake_project_cli, fake_metadata, fake_load_context
):
    """`kedro catalog rank` prints a notice when the catalog has no factories."""
    # A catalog built from concrete datasets only, with no pattern entries.
    concrete_data_sets = {
        "iris_data": CSVDataSet("test.csv"),
        "intermediate": MemoryDataset(),
        "not_used": CSVDataSet("test2.csv"),
    }
    context_mock = fake_load_context.return_value
    context_mock.catalog = DataCatalog(data_sets=concrete_data_sets)

    cli_result = CliRunner().invoke(
        fake_project_cli, ["catalog", "rank"], obj=fake_metadata
    )

    assert not cli_result.exit_code
    no_factories_message = "There are no dataset factories in the catalog."
    assert no_factories_message in cli_result.output

0 comments on commit 9cd8a5b

Please sign in to comment.