Add kedro catalog rank command (#2848)

* Add command
  Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>
* Add positive test
  Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>
* Add negative test
  Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>
* Lint
  Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>
* Add changes to RELEASE.md
  Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>
* Add changes to documentation
  Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>
* Add detail to release note
  Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>
* Make lint
  Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

---------

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>
kedro-org · Jul 27, 2023 · 9cd8a5b · 9cd8a5b
1 parent 3bcf2b1
commit 9cd8a5b
Show file tree

Hide file tree

Showing 4 changed files with 109 additions and 0 deletions.
diff --git a/RELEASE.md b/RELEASE.md
@@ -13,6 +13,7 @@
 ## Major features and improvements
 * Added dataset factories feature which uses pattern matching to reduce the number of catalog entries.
 * Activated all built-in resolvers by default for `OmegaConfigLoader` except for `oc.env`.
+* Added `kedro catalog rank` CLI command that ranks dataset factories in the catalog by matching priority.
 
 ## Bug fixes and other changes
 * Consolidated dependencies and optional dependencies in `pyproject.toml`.

diff --git a/docs/source/development/commands_reference.md b/docs/source/development/commands_reference.md
@@ -62,6 +62,7 @@ Here is a list of Kedro CLI commands, as a shortcut to the descriptions below. P
   * [`kedro build-docs`](#build-the-project-documentation) (deprecated from version 0.19.0)
   * [`kedro build-reqs`](#build-the-projects-dependency-tree) (deprecated from version 0.19.0)
   * [`kedro catalog list`](#list-datasets-per-pipeline-per-type)
+  * [`kedro catalog rank`](#rank-dataset-factories-in-the-catalog)
   * [`kedro catalog create`](#create-a-data-catalog-yaml-configuration-file)
   * [`kedro ipython`](#notebooks)
   * [`kedro jupyter convert`](#copy-tagged-cells) (deprecated from version 0.19.0)
@@ -491,6 +492,14 @@ The command also accepts an optional `--pipeline` argument that allows you to sp
 kedro catalog list --pipeline=ds,de
 ```
 
+##### Rank dataset factories in the catalog
+
+```bash
+kedro catalog rank
+```
+
+The output includes a list of any [dataset factories](../data/data_catalog.md#load-multiple-datasets-with-similar-configuration-using-dataset-factories) in the catalog, ranked by the priority in which they are matched.
+
 #### Data Catalog
 
 ##### Create a Data Catalog YAML configuration file

diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py
@@ -193,3 +193,18 @@ def _add_missing_datasets_to_catalog(missing_ds, catalog_path):
     catalog_path.parent.mkdir(exist_ok=True)
     with catalog_path.open(mode="w") as catalog_file:
         yaml.safe_dump(catalog_config, catalog_file, default_flow_style=False)
+
+
+@catalog.command("rank")
+@env_option
+@click.pass_obj
+def rank_catalog_factories(metadata: ProjectMetadata, env):
+    """List all dataset factories in the catalog, ranked by priority by which they are matched."""
+    # NOTE: click uses the docstring above as this command's `--help` text, so it
+    # is kept as a single user-facing sentence; details live in these comments.
+    #
+    # Build a session for the requested environment and load the project context
+    # to reach the configured catalog.
+    session = _create_session(metadata.package_name, env=env)
+    context = session.load_context()
+
+    # `_dataset_patterns` is a private catalog attribute whose keys are the
+    # factory patterns. They are printed in the mapping's own iteration order;
+    # presumably the catalog already stores them sorted by matching priority
+    # (confirm against DataCatalog before relying on it).
+    catalog_factories = context.catalog._dataset_patterns
+    if catalog_factories:
+        # Emit only the pattern names (not their configurations) as a YAML list.
+        click.echo(yaml.dump(list(catalog_factories.keys())))
+    else:
+        click.echo("There are no dataset factories in the catalog.")
diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py
@@ -42,6 +42,33 @@ def fake_catalog_config():
     return config
 
 
+@pytest.fixture
+def fake_catalog_with_overlapping_factories():
+    """Return a catalog config with one explicit dataset and four factory patterns.
+
+    Several patterns can match the same dataset name, and they are declared in
+    an order that differs from the ranking the `catalog rank` test expects.
+    """
+    config = {
+        # Explicit entry (no `{...}` placeholder): not expected in the ranking.
+        "an_example_dataset": {
+            "type": "pandas.CSVDataSet",
+            "filepath": "dummy_filepath",
+        },
+        # Factory patterns, deliberately listed out of ranked order.
+        "an_example_{placeholder}": {
+            "type": "dummy_type",
+            "filepath": "dummy_filepath",
+        },
+        "an_example_{place}_{holder}": {
+            "type": "dummy_type",
+            "filepath": "dummy_filepath",
+        },
+        "on_{example_placeholder}": {
+            "type": "dummy_type",
+            "filepath": "dummy_filepath",
+        },
+        "an_{example_placeholder}": {
+            "type": "dummy_type",
+            "filepath": "dummy_filepath",
+        },
+    }
+    return config
+
+
 @pytest.mark.usefixtures(
     "chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
 )
@@ -360,3 +387,60 @@ def test_bad_env(self, fake_project_cli, fake_metadata):
 
         assert result.exit_code
         assert "Unable to instantiate Kedro session" in result.output
+
+
+@pytest.mark.usefixtures(
+    "chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
+)
+def test_rank_catalog_factories(
+    fake_project_cli,
+    fake_metadata,
+    mocker,
+    fake_load_context,
+    fake_catalog_with_overlapping_factories,
+):
+    """Check that `kedro catalog rank` hands the factory patterns to `yaml.dump` in ranked order."""
+    # Patch `yaml.dump` so the test can inspect the exact list passed to it
+    # instead of parsing the CLI's textual output.
+    yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
+    # Swap the mocked context's catalog for a real DataCatalog built from the
+    # overlapping-factories config.
+    mocked_context = fake_load_context.return_value
+    mocked_context.catalog = DataCatalog.from_config(
+        fake_catalog_with_overlapping_factories
+    )
+
+    result = CliRunner().invoke(
+        fake_project_cli, ["catalog", "rank"], obj=fake_metadata
+    )
+    assert not result.exit_code
+
+    # Only the patterns are expected (the explicit `an_example_dataset` entry is
+    # absent), and in a different order from the one the fixture declares.
+    expected_patterns_sorted = [
+        "an_example_{place}_{holder}",
+        "an_example_{placeholder}",
+        "an_{example_placeholder}",
+        "on_{example_placeholder}",
+    ]
+
+    # The first positional argument of the single `yaml.dump` call is the ranked list.
+    assert yaml_dump_mock.call_count == 1
+    assert yaml_dump_mock.call_args[0][0] == expected_patterns_sorted
+
+
+@pytest.mark.usefixtures(
+    "chdir_to_dummy_project",
+    "fake_load_context",
+)
+def test_rank_catalog_factories_with_no_factories(
+    fake_project_cli, fake_metadata, fake_load_context
+):
+    """Check that `kedro catalog rank` prints a notice when the catalog has no factory patterns."""
+    mocked_context = fake_load_context.return_value
+
+    # Catalog built directly from dataset instances, so it holds only explicit
+    # entries and no factory patterns.
+    catalog_data_sets = {
+        "iris_data": CSVDataSet("test.csv"),
+        "intermediate": MemoryDataset(),
+        "not_used": CSVDataSet("test2.csv"),
+    }
+    mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
+
+    result = CliRunner().invoke(
+        fake_project_cli, ["catalog", "rank"], obj=fake_metadata
+    )
+
+    # The command should still exit cleanly and print the fallback message.
+    assert not result.exit_code
+    expected_output = "There are no dataset factories in the catalog."
+    assert expected_output in result.output