diff --git a/RELEASE.md b/RELEASE.md
index 1940bc28a9..33beecd840 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -13,6 +13,7 @@
 ## Major features and improvements
 * Added dataset factories feature which uses pattern matching to reduce the number of catalog entries.
 * Activated all built-in resolvers by default for `OmegaConfigLoader` except for `oc.env`.
+* Added `kedro catalog rank` CLI command that ranks dataset factories in the catalog by matching priority.
 
 ## Bug fixes and other changes
 * Consolidated dependencies and optional dependencies in `pyproject.toml`.
diff --git a/docs/source/development/commands_reference.md b/docs/source/development/commands_reference.md
index 99f6892bdc..6d65cf7321 100644
--- a/docs/source/development/commands_reference.md
+++ b/docs/source/development/commands_reference.md
@@ -62,6 +62,7 @@ Here is a list of Kedro CLI commands, as a shortcut to the descriptions below. P
   * [`kedro build-docs`](#build-the-project-documentation) (deprecated from version 0.19.0)
   * [`kedro build-reqs`](#build-the-projects-dependency-tree) (deprecated from version 0.19.0)
   * [`kedro catalog list`](#list-datasets-per-pipeline-per-type)
+  * [`kedro catalog rank`](#rank-dataset-factories-in-the-catalog)
   * [`kedro catalog create`](#create-a-data-catalog-yaml-configuration-file)
   * [`kedro ipython`](#notebooks)
   * [`kedro jupyter convert`](#copy-tagged-cells) (deprecated from version 0.19.0)
@@ -491,6 +492,14 @@ The command also accepts an optional `--pipeline` argument that allows you to sp
 kedro catalog list --pipeline=ds,de
 ```
 
+##### Rank dataset factories in the catalog
+
+```bash
+kedro catalog rank
+```
+
+The output includes a list of any [dataset factories](../data/data_catalog.md#load-multiple-datasets-with-similar-configuration-using-dataset-factories) in the catalog, ranked by the priority with which they are matched against dataset names.
+
 #### Data Catalog
 
 ##### Create a Data Catalog YAML configuration file
diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py
index bf818c7269..7323a65cc6 100644
--- a/kedro/framework/cli/catalog.py
+++ b/kedro/framework/cli/catalog.py
@@ -193,3 +193,18 @@ def _add_missing_datasets_to_catalog(missing_ds, catalog_path):
     catalog_path.parent.mkdir(exist_ok=True)
     with catalog_path.open(mode="w") as catalog_file:
         yaml.safe_dump(catalog_config, catalog_file, default_flow_style=False)
+
+
+@catalog.command("rank")
+@env_option
+@click.pass_obj
+def rank_catalog_factories(metadata: ProjectMetadata, env):
+    """List all dataset factories in the catalog, ranked by the priority with which they are matched."""
+    session = _create_session(metadata.package_name, env=env)
+    context = session.load_context()
+
+    catalog_factories = context.catalog._dataset_patterns
+    if catalog_factories:
+        click.echo(yaml.dump(list(catalog_factories.keys())))
+    else:
+        click.echo("There are no dataset factories in the catalog.")
diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py
index 3ac10e5f52..0296fe4814 100644
--- a/tests/framework/cli/test_catalog.py
+++ b/tests/framework/cli/test_catalog.py
@@ -42,6 +42,33 @@ def fake_catalog_config():
     return config
 
 
+@pytest.fixture
+def fake_catalog_with_overlapping_factories():
+    config = {
+        "an_example_dataset": {
+            "type": "pandas.CSVDataSet",
+            "filepath": "dummy_filepath",
+        },
+        "an_example_{placeholder}": {
+            "type": "dummy_type",
+            "filepath": "dummy_filepath",
+        },
+        "an_example_{place}_{holder}": {
+            "type": "dummy_type",
+            "filepath": "dummy_filepath",
+        },
+        "on_{example_placeholder}": {
+            "type": "dummy_type",
+            "filepath": "dummy_filepath",
+        },
+        "an_{example_placeholder}": {
+            "type": "dummy_type",
+            "filepath": "dummy_filepath",
+        },
+    }
+    return config
+
+
 @pytest.mark.usefixtures(
     "chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
 )
@@ -360,3 +387,60 @@ def test_bad_env(self, fake_project_cli, fake_metadata):
 
         assert result.exit_code
         assert "Unable to instantiate Kedro session" in result.output
+
+
+@pytest.mark.usefixtures(
+    "chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
+)
+def test_rank_catalog_factories(
+    fake_project_cli,
+    fake_metadata,
+    mocker,
+    fake_load_context,
+    fake_catalog_with_overlapping_factories,
+):
+    yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
+    mocked_context = fake_load_context.return_value
+    mocked_context.catalog = DataCatalog.from_config(
+        fake_catalog_with_overlapping_factories
+    )
+
+    result = CliRunner().invoke(
+        fake_project_cli, ["catalog", "rank"], obj=fake_metadata
+    )
+    assert not result.exit_code
+
+    expected_patterns_sorted = [
+        "an_example_{place}_{holder}",
+        "an_example_{placeholder}",
+        "an_{example_placeholder}",
+        "on_{example_placeholder}",
+    ]
+
+    assert yaml_dump_mock.call_count == 1
+    assert yaml_dump_mock.call_args[0][0] == expected_patterns_sorted
+
+
+@pytest.mark.usefixtures(
+    "chdir_to_dummy_project",
+    "fake_load_context",
+)
+def test_rank_catalog_factories_with_no_factories(
+    fake_project_cli, fake_metadata, fake_load_context
+):
+    mocked_context = fake_load_context.return_value
+
+    catalog_data_sets = {
+        "iris_data": CSVDataSet("test.csv"),
+        "intermediate": MemoryDataset(),
+        "not_used": CSVDataSet("test2.csv"),
+    }
+    mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
+
+    result = CliRunner().invoke(
+        fake_project_cli, ["catalog", "rank"], obj=fake_metadata
+    )
+
+    assert not result.exit_code
+    expected_output = "There are no dataset factories in the catalog."
+    assert expected_output in result.output
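A note on the ordering the new command prints: `context.catalog._dataset_patterns` is already sorted when the catalog is constructed, so `rank_catalog_factories` only echoes its keys. The sketch below is an illustrative reconstruction of that ranking heuristic (more literal characters outside `{}` first, then more placeholders, then alphabetical), assuming plain string patterns; `rank_patterns` and `_specificity` are hypothetical helpers, not code from this patch.

```python
import re
from typing import List


def _specificity(pattern: str) -> int:
    """Count the literal characters outside {placeholder} brackets."""
    return len(re.sub(r"\{.*?\}", "", pattern))


def rank_patterns(patterns: List[str]) -> List[str]:
    """Order factory patterns so the most specific one is matched first."""
    return sorted(
        patterns,
        key=lambda p: (
            -_specificity(p),  # more literal characters -> higher priority
            -p.count("{"),     # more placeholders breaks specificity ties
            p,                 # alphabetical as the final tie-break
        ),
    )


if __name__ == "__main__":
    print(rank_patterns([
        "an_example_{placeholder}",
        "an_example_{place}_{holder}",
        "on_{example_placeholder}",
        "an_{example_placeholder}",
    ]))
    # ['an_example_{place}_{holder}', 'an_example_{placeholder}',
    #  'an_{example_placeholder}', 'on_{example_placeholder}']
```

Running it on the patterns from `fake_catalog_with_overlapping_factories` reproduces `expected_patterns_sorted` from the test above.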