
Add kedro catalog resolve command #2891

Merged 35 commits on Aug 18, 2023
Commits
1a81700
Implement working draft
Aug 3, 2023
0f5b79f
Remove comments
Aug 3, 2023
b70f8e6
Merge branch 'main' into feat/add-kedro-catalog-resolve-command
AhdraMeraliQB Aug 3, 2023
868671e
Remove raw_config
Aug 4, 2023
c3aec4c
Use config loader to access catalog config
Aug 4, 2023
2f956da
Use dict comprehension
Aug 4, 2023
38a0394
Merge branch 'main' into feat/add-kedro-catalog-resolve-command
AhdraMeraliQB Aug 8, 2023
04dc325
Remove pipeline filtering
Aug 9, 2023
c30c565
Prevent overwrite or param inclusion
Aug 9, 2023
1df1c3d
Trim filepath
Aug 9, 2023
5758060
Appease linter
Aug 9, 2023
473ddf1
Add test for resolve
Aug 11, 2023
1e387d4
Add test to exclude params
Aug 11, 2023
e88f695
Add test for overwrite (not working)
Aug 11, 2023
96b84de
Fix test
Aug 11, 2023
dd3117c
Remove print
Aug 11, 2023
97d85ee
Merge branch 'main' into feat/add-kedro-catalog-resolve-command
AhdraMeraliQB Aug 11, 2023
ac2cfc4
Add changes to RELEASE.md
Aug 11, 2023
92d588f
Trim unreachable code
Aug 11, 2023
09339c3
Make helper function private
Aug 11, 2023
3fac8b3
Refactor
Aug 11, 2023
7d7036f
Add changes to the docs
Aug 11, 2023
7caa1af
Refactor 2
Aug 11, 2023
271236c
Fix docs build
Aug 11, 2023
c19707d
Add suggestions from code review (docs)
Aug 15, 2023
8784a37
Add suggestions from code review (code)
Aug 15, 2023
259c5e5
Merge branch 'main' into feat/add-kedro-catalog-resolve-command
AhdraMeraliQB Aug 15, 2023
779a286
Rename catalog_config variable to explicit_datasets
Aug 17, 2023
6041ea5
Apply suggestions from code review
AhdraMeraliQB Aug 17, 2023
3dec9f4
Fix mocking in test
Aug 17, 2023
b026bfe
Change test fixture name
Aug 17, 2023
64366c8
Lint
Aug 17, 2023
a10feee
Merge branch 'main' into feat/add-kedro-catalog-resolve-command
AhdraMeraliQB Aug 18, 2023
f3884df
Merge branch 'main' into feat/add-kedro-catalog-resolve-command
astrojuanlu Aug 18, 2023
365d1d5
Merge branch 'main' into feat/add-kedro-catalog-resolve-command
AhdraMeraliQB Aug 18, 2023
1 change: 1 addition & 0 deletions RELEASE.md
@@ -13,6 +13,7 @@
## Major features and improvements
* Allowed registering of custom resolvers to `OmegaConfigLoader` through `CONFIG_LOADER_ARGS`.
* Added support for Python 3.11. This includes tackling challenges like dependency pinning and test adjustments to ensure a smooth experience. Detailed migration tips are provided below for further context.
* Added `kedro catalog resolve` CLI command that resolves dataset factories in the catalog with any explicit entries in the project pipeline.

## Bug fixes and other changes
* Updated `kedro pipeline create` and `kedro catalog create` to use new `/conf` file structure.
9 changes: 9 additions & 0 deletions docs/source/development/commands_reference.md
@@ -62,6 +62,7 @@ Here is a list of Kedro CLI commands, as a shortcut to the descriptions below. P
* [`kedro build-docs`](#build-the-project-documentation) (deprecated from version 0.19.0)
* [`kedro build-reqs`](#build-the-projects-dependency-tree) (deprecated from version 0.19.0)
* [`kedro catalog list`](#list-datasets-per-pipeline-per-type)
* [`kedro catalog resolve`](#resolve-dataset-factories-in-the-catalog)
* [`kedro catalog rank`](#rank-dataset-factories-in-the-catalog)
* [`kedro catalog create`](#create-a-data-catalog-yaml-configuration-file)
* [`kedro ipython`](#notebooks)
@@ -492,6 +493,14 @@ The command also accepts an optional `--pipeline` argument that allows you to sp

```bash
kedro catalog list --pipeline=ds,de
```

##### Resolve dataset factories in the catalog

```bash
kedro catalog resolve
```

This command resolves dataset factories in the catalog against the datasets used in the project's pipelines. The output includes all datasets explicitly declared in your catalog files, plus an entry for every pipeline dataset that matches a dataset factory pattern.
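As an illustration of what "matching a factory pattern" means here, the following standalone sketch (my own simplified stand-in, not Kedro's actual implementation; the function name `matches_pattern` is hypothetical) shows how a pattern such as `{name}_data` can be matched against a concrete pipeline dataset name:

```python
import re


def matches_pattern(pattern: str, ds_name: str) -> bool:
    """Return True if a dataset name matches a factory pattern.

    Each "{placeholder}" in the pattern is treated as a wildcard;
    the literal segments in between must match exactly.
    """
    literal_parts = re.split(r"\{[^{}]*\}", pattern)
    regex = "(.+?)".join(re.escape(part) for part in literal_parts)
    return re.fullmatch(regex, ds_name) is not None


print(matches_pattern("{name}_data", "iris_data"))  # True
print(matches_pattern("csv_{name}", "iris_data"))   # False
```

In this simplified view, `iris_data` matches `{name}_data` because `iris` fills the placeholder and the literal `_data` suffix matches exactly.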

##### Rank dataset factories in the catalog

```bash
kedro catalog rank
```
48 changes: 48 additions & 0 deletions kedro/framework/cli/catalog.py
@@ -207,3 +207,51 @@ def rank_catalog_factories(metadata: ProjectMetadata, env):
        click.echo(yaml.dump(list(catalog_factories.keys())))
    else:
        click.echo("There are no dataset factories in the catalog.")


@catalog.command("resolve")
@env_option
@click.pass_obj
def resolve_patterns(metadata: ProjectMetadata, env):
"""Resolve catalog factories against pipeline datasets"""

session = _create_session(metadata.package_name, env=env)
context = session.load_context()

data_catalog = context.catalog
catalog_config = context.config_loader["catalog"]

catalog_config = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a bit confusing that these variables have the exact name. In general it is best practice not to use the same variable as input and reassign the value in the same step. Maybe the second catalog_config can be re-named to explicit_dataset. That's what's stored there right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep! Changed in 779a286

ds_name: ds_config
for ds_name, ds_config in catalog_config.items()
if not data_catalog._is_pattern(ds_name)
}

target_pipelines = pipelines.keys()
datasets = set()

for pipe in target_pipelines:
AhdraMeraliQB marked this conversation as resolved.
Show resolved Hide resolved
pl_obj = pipelines.get(pipe)
if pl_obj:
datasets.update(pl_obj.data_sets())

for ds_name in datasets:
is_param = ds_name.startswith("params:") or ds_name == "parameters"
if ds_name in catalog_config or is_param:
continue

matched_pattern = data_catalog._match_pattern(
data_catalog._dataset_patterns, ds_name
)
if matched_pattern:
ds_config = data_catalog._resolve_config(ds_name, matched_pattern)
ds_config["filepath"] = _trim_filepath(
str(context.project_path) + "/", ds_config["filepath"]
)
catalog_config[ds_name] = ds_config

secho(yaml.dump(catalog_config))


def _trim_filepath(project_path: str, file_path: str):
return file_path.replace(project_path, "", 1)
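The command's overall flow can be sketched without any Kedro internals. The block below is a dependency-free approximation (the names `resolve_patterns_sketch`, `_is_pattern`, and `_pattern_matches` are my own stand-ins for the private `DataCatalog` helpers, and substitution of matched placeholder values into the config, as `_resolve_config` does, is omitted):

```python
import re


def _is_pattern(name: str) -> bool:
    # A catalog entry whose name contains "{...}" is a factory pattern.
    return "{" in name


def _pattern_matches(pattern: str, ds_name: str) -> bool:
    # Treat each "{placeholder}" as a wildcard between literal segments.
    parts = re.split(r"\{[^{}]*\}", pattern)
    return re.fullmatch("(.+?)".join(map(re.escape, parts)), ds_name) is not None


def resolve_patterns_sketch(catalog_config: dict, pipeline_datasets: set) -> dict:
    """Mimic the command's flow: keep explicit entries untouched, then
    materialise an entry for each pipeline dataset matching a pattern."""
    resolved = {k: v for k, v in catalog_config.items() if not _is_pattern(k)}
    patterns = {k: v for k, v in catalog_config.items() if _is_pattern(k)}

    for ds_name in pipeline_datasets:
        # Skip parameters and datasets the catalog already names explicitly,
        # so explicit entries are never overwritten.
        is_param = ds_name.startswith("params:") or ds_name == "parameters"
        if ds_name in resolved or is_param:
            continue
        for pattern, ds_config in patterns.items():
            if _pattern_matches(pattern, ds_name):
                resolved[ds_name] = dict(ds_config)  # copy, don't share
                break
    return resolved
```

For example, given a catalog containing `{name}_data` plus an explicit `explicit_ds` entry, and a pipeline using `iris_data`, `explicit_ds`, and `params:ratio`, the sketch returns entries for `iris_data` and `explicit_ds` only, with `explicit_ds` keeping its original config.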
135 changes: 135 additions & 0 deletions tests/framework/cli/test_catalog.py
@@ -67,6 +67,23 @@ def fake_catalog_with_overlapping_factories():
    return config


@pytest.fixture
def fake_catalog_config_with_overwrite():
    config = {
        "parquet_{factory_pattern}": {
            "type": "pandas.ParquetDataSet",
            "filepath": "test.pq",
        },
        "csv_{factory_pattern}": {"type": "pandas.CSVDataSet", "filepath": "test.csv"},
        "explicit_ds": {"type": "pandas.CSVDataSet", "filepath": "test.csv"},
        "{factory_pattern}_ds": {
            "type": "pandas.ParquetDataSet",
            "filepath": "test.pq",
        },
    }
    return config

Review comment (Member): What do you mean with "config_with_overwrite"?

Reply (Author): In this context this means a catalog configuration where one of the explicit entries in the catalog also matches a pattern in the catalog, and the test is ensuring the explicit entry is not overwritten. Perhaps fake_catalog_config_with_overwritable_dataset() could be more intuitive? Or fake_catalog_config_with_resolvable_dataset()?

Review comment (Member): I think fake_catalog_config_with_resolvable_dataset makes more sense to me.


@pytest.mark.usefixtures(
    "chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
)
@@ -441,3 +458,121 @@ def test_rank_catalog_factories_with_no_factories(
    assert not result.exit_code
    expected_output = "There are no dataset factories in the catalog."
    assert expected_output in result.output


@pytest.mark.usefixtures(
    "chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
)
def test_catalog_resolve(
    fake_project_cli,
    fake_metadata,
    fake_load_context,
    mocker,
    mock_pipelines,
    fake_catalog_config,
):
    """Test that dataset factories are correctly resolved to the explicit datasets in the pipeline."""
    yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
    mocked_context = fake_load_context.return_value
    mocked_context.catalog = DataCatalog.from_config(fake_catalog_config)

    placeholder_ds = mocked_context.catalog._data_sets.keys()
    explicit_ds = {"csv_example", "parquet_example"}

    mocker.patch.object(
        mock_pipelines[PIPELINE_NAME],
        "data_sets",
        return_value=explicit_ds,
    )

    result = CliRunner().invoke(
        fake_project_cli, ["catalog", "resolve"], obj=fake_metadata
    )

    assert not result.exit_code
    assert yaml_dump_mock.call_count == 1

    output = yaml_dump_mock.call_args[0][0]

    for ds in placeholder_ds:
        assert ds not in output

    for ds in explicit_ds:
        assert ds in output


@pytest.mark.usefixtures(
    "chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
)
def test_no_overwrite(
    fake_project_cli,
    fake_metadata,
    fake_load_context,
    mocker,
    mock_pipelines,
    fake_catalog_config_with_overwrite,
):
    """Test that explicit catalog entries are not overwritten by factory config."""
    yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
    mocked_context = fake_load_context.return_value

    mocked_context.config_loader = {"catalog": fake_catalog_config_with_overwrite}
    mocked_context.catalog = DataCatalog.from_config(fake_catalog_config_with_overwrite)

    mocker.patch.object(
        mock_pipelines[PIPELINE_NAME],
        "data_sets",
        return_value=mocked_context.catalog._data_sets.keys()
        | {"csv_example", "parquet_example"},
    )

    result = CliRunner().invoke(
        fake_project_cli, ["catalog", "resolve"], obj=fake_metadata
    )

    assert not result.exit_code
    assert yaml_dump_mock.call_count == 1

    assert (
        yaml_dump_mock.call_args[0][0]["explicit_ds"]
        == fake_catalog_config_with_overwrite["explicit_ds"]
    )


@pytest.mark.usefixtures(
    "chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
)
def test_no_param_datasets_in_resolve(
    fake_project_cli, fake_metadata, fake_load_context, mocker, mock_pipelines
):

    yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
    mocked_context = fake_load_context.return_value
    catalog_data_sets = {
        "iris_data": CSVDataSet("test.csv"),
        "intermediate": MemoryDataset(),
        "parameters": MemoryDataset(),
        "params:data_ratio": MemoryDataset(),
    }

    mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
    mocker.patch.object(
        mock_pipelines[PIPELINE_NAME],
        "data_sets",
        return_value=catalog_data_sets.keys(),
    )

    result = CliRunner().invoke(
        fake_project_cli,
        ["catalog", "resolve"],
        obj=fake_metadata,
    )

    assert not result.exit_code
    assert yaml_dump_mock.call_count == 1

    # 'parameters' and 'params:data_ratio' should not appear in the output
    output = yaml_dump_mock.call_args[0][0]

    assert "parameters" not in output.keys()
    assert "params:data_ratio" not in output.keys()
Review comment (Member) on lines +592 to +593: To demonstrate the right datasets are included in output it might be worth adding two more asserts for iris_data and intermediate.

Reply (Author): Added in 3dec9f4