Commit 0d9c58f

Merge branch 'main' into 3741-improve-docs-seo-manually-create-sitemap

ElenaKhaustova authored Oct 9, 2024
2 parents c4014f7 + 6adbed9 commit 0d9c58f
Showing 81 changed files with 2,098 additions and 866 deletions.
2 changes: 2 additions & 0 deletions .github/styles/Kedro/ignore.txt
@@ -44,3 +44,5 @@ transcoding
transcode
Claypot
ethanknights
+Aneira
+Printify
6 changes: 3 additions & 3 deletions .github/workflows/all-checks.yml
@@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
os: [ windows-latest, ubuntu-latest ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
uses: ./.github/workflows/unit-tests.yml
with:
os: ${{ matrix.os }}
@@ -36,7 +36,7 @@ jobs:
strategy:
matrix:
os: [ windows-latest, ubuntu-latest ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
uses: ./.github/workflows/e2e-tests.yml
with:
os: ${{ matrix.os }}
@@ -59,7 +59,7 @@ jobs:
strategy:
matrix:
os: [ windows-latest, ubuntu-latest ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
uses: ./.github/workflows/pip-compile.yml
with:
os: ${{ matrix.os }}
59 changes: 59 additions & 0 deletions .github/workflows/benchmark-performance.yml
@@ -0,0 +1,59 @@
name: ASV Benchmark

on:
push:
branches:
- main # Run benchmarks on every commit to the main branch
workflow_dispatch:


jobs:

benchmark:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
path: "kedro"

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.11'

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install asv # Install ASV
- name: Run ASV benchmarks
run: |
cd kedro
asv machine --machine=github-actions
asv run -v --machine=github-actions
- name: Set git email and name
run: |
git config --global user.email "kedro@kedro.com"
git config --global user.name "Kedro"
- name: Checkout target repository
uses: actions/checkout@v4
with:
repository: kedro-org/kedro-benchmark-results
token: ${{ secrets.GH_TAGGING_TOKEN }}
ref: 'main'
path: "kedro-benchmark-results"

- name: Copy files to target repository
run: |
cp -r /home/runner/work/kedro/kedro/kedro/.asv /home/runner/work/kedro/kedro/kedro-benchmark-results/
- name: Commit and Push changes to kedro-org/kedro-benchmark-results
run: |
cd kedro-benchmark-results
git add .
git commit -m "Add results"
git push
2 changes: 1 addition & 1 deletion .github/workflows/docs-only-checks.yml
@@ -21,7 +21,7 @@ jobs:
strategy:
matrix:
os: [ ubuntu-latest ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
uses: ./.github/workflows/lint.yml
with:
os: ${{ matrix.os }}
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,7 +1,7 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks

-default_stages: [commit, manual]
+default_stages: [pre-commit, manual]

repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
2 changes: 1 addition & 1 deletion README.md
@@ -6,7 +6,7 @@
</picture>
</p>

-[![Python version](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue.svg)](https://pypi.org/project/kedro/)
+[![Python version](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue.svg)](https://pypi.org/project/kedro/)
[![PyPI version](https://badge.fury.io/py/kedro.svg)](https://pypi.org/project/kedro/)
[![Conda version](https://img.shields.io/conda/vn/conda-forge/kedro.svg)](https://anaconda.org/conda-forge/kedro)
[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/kedro-org/kedro/blob/main/LICENSE.md)
21 changes: 21 additions & 0 deletions RELEASE.md
@@ -1,12 +1,29 @@
# Upcoming Release

## Major features and improvements
* Dropped Python 3.8 support.
* Implemented `KedroDataCatalog`, which replicates `DataCatalog` functionality with a few API enhancements:
  * Removed `_FrozenDatasets`; datasets are now accessed as properties;
  * Added the ability to get a dataset by name;
  * Simplified `add_feed_dict()` to only add raw data;
  * Moved dataset initialisation out of the `from_config()` method and into the constructor.
* Moved development requirements from `requirements.txt` to a dedicated section in `pyproject.toml` for the project template.
* Implemented a `Protocol` abstraction for the current `DataCatalog` and for adding new catalog implementations.
* Refactored the `kedro run` and `kedro catalog` commands.
* Moved pattern resolution logic from `DataCatalog` to a separate component, `CatalogConfigResolver`, and updated `DataCatalog` to use it internally.
* Made packaged Kedro projects return the `session.run()` output so it can be used when running them in an interactive environment.
* Enhanced `OmegaConfigLoader` configuration validation to detect duplicate keys at all parameter levels, ensuring comprehensive nested key checking.

**Note:** ``KedroDataCatalog`` is an experimental feature under active development, so it is possible we'll introduce breaking changes to this class; be mindful of that if you decide to use it now. Let us know if you have any feedback about the ``KedroDataCatalog`` or ideas for new features.
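
For illustration, here is a minimal sketch of how these enhancements might be used; the import path and method names below are assumptions inferred from the notes above, not a confirmed API:

```python
# Sketch only: names and signatures are inferred from the release notes above.
from kedro.io import KedroDataCatalog, MemoryDataset

catalog = KedroDataCatalog(datasets={"reviews": MemoryDataset(data=[5, 4, 3])})

# Datasets are accessed as properties rather than via a _FrozenDatasets wrapper.
reviews_dataset = catalog.datasets["reviews"]  # assumed access pattern

# Get a dataset by name.
same_dataset = catalog.get_dataset("reviews")

# add_feed_dict() is simplified to only add raw data.
catalog.add_feed_dict({"raw_scores": [0.2, 0.5]})
```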

## Bug fixes and other changes
* Fixed bug where using dataset factories breaks with `ThreadRunner`.
* Fixed a bug where `SharedMemoryDataset.exists` would not call the underlying `MemoryDataset`.
* Fixed template projects example tests.
* Made credentials loading consistent between `KedroContext._get_catalog()` and `resolve_patterns` so that both use `_get_config_credentials()`.

## Breaking changes to the API
* Removed `ShelveStore` to address a security vulnerability.

## Documentation changes
* Fixed the logo on the PyPI page.
@@ -15,6 +32,10 @@
## Community contributions
* [Puneet](https://github.com/puneeter)
* [ethanknights](https://github.com/ethanknights)
+* [Manezki](https://github.com/Manezki)
+* [MigQ2](https://github.com/MigQ2)
+* [Felix Scherz](https://github.com/felixscherz)
+* [Yu-Sheng Li](https://github.com/kevin1kevin1k)

# Release 0.19.8

12 changes: 12 additions & 0 deletions asv.conf.json
@@ -0,0 +1,12 @@
{
"version": 1,
"project": "Kedro",
"project_url": "https://kedro.org/",
"repo": ".",
"install_command": ["pip install -e ."],
"branches": ["main"],
"environment_type": "virtualenv",
"show_commit_url": "http://github.com/kedro-org/kedro/commit/",
"results_dir": ".asv/results",
"html_dir": ".asv/html"
}
Empty file added benchmarks/__init__.py
16 changes: 16 additions & 0 deletions benchmarks/benchmark_dummy.py
@@ -0,0 +1,16 @@
# Write the benchmarking functions here.
# See "Writing benchmarks" in the asv docs for more information.


class TimeSuite:
"""
A dummy benchmark suite to test with asv framework.
"""
def setup(self):
self.d = {}
for x in range(500):
self.d[x] = None

def time_keys(self):
for key in self.d.keys():
pass
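
ASV discovers benchmarks by name prefix: `time_*` methods are timed, `mem_*` methods report the size of the returned object, and a class-level `params` list re-runs each benchmark once per parameter value. As a hypothetical next step (not part of this commit), a parameterised suite might look like:

```python
# Hypothetical example of common asv patterns; not part of this commit.
class TimeDictSuite:
    # asv runs each benchmark once per value in `params`.
    params = [100, 1_000, 10_000]
    param_names = ["size"]

    def setup(self, size):
        # setup() runs before each measurement and is excluded from timing.
        self.d = {x: None for x in range(size)}

    def time_iterate_keys(self, size):
        for _ in self.d:
            pass

    def mem_dict(self, size):
        # mem_* benchmarks report the memory footprint of the returned object.
        return self.d
```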

This file was deleted.

8 changes: 7 additions & 1 deletion docs/source/conf.py
@@ -70,7 +70,7 @@
intersphinx_mapping = {
"kedro-viz": ("https://docs.kedro.org/projects/kedro-viz/en/v6.6.1/", None),
"kedro-datasets": ("https://docs.kedro.org/projects/kedro-datasets/en/kedro-datasets-2.0.0/", None),
"cpython": ("https://docs.python.org/3.8/", None),
"cpython": ("https://docs.python.org/3.9/", None),
"ipython": ("https://ipython.readthedocs.io/en/8.21.0/", None),
"mlflow": ("https://www.mlflow.org/docs/2.12.1/", None),
"kedro-mlflow": ("https://kedro-mlflow.readthedocs.io/en/0.12.2/", None),
@@ -127,11 +127,14 @@
"typing.Type",
"typing.Set",
"kedro.config.config.ConfigLoader",
"kedro.io.catalog_config_resolver.CatalogConfigResolver",
"kedro.io.core.AbstractDataset",
"kedro.io.core.AbstractVersionedDataset",
"kedro.io.core.CatalogProtocol",
"kedro.io.core.DatasetError",
"kedro.io.core.Version",
"kedro.io.data_catalog.DataCatalog",
"kedro.io.kedro_data_catalog.KedroDataCatalog",
"kedro.io.memory_dataset.MemoryDataset",
"kedro.io.partitioned_dataset.PartitionedDataset",
"kedro.pipeline.pipeline.Pipeline",
@@ -168,6 +171,9 @@
"D[k] if k in D, else d. d defaults to None.",
"None. Update D from mapping/iterable E and F.",
"Patterns",
"CatalogConfigResolver",
"CatalogProtocol",
"KedroDataCatalog",
),
"py:data": (
"typing.Any",
4 changes: 2 additions & 2 deletions docs/source/contribution/technical_steering_committee.md
@@ -61,10 +61,10 @@ We look for commitment markers who can do the following:
| [Huong Nguyen](https://github.com/Huongg) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
| [Ivan Danov](https://github.com/idanov) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
| [Jitendra Gundaniya](https://github.com/jitu5) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
-| [Joel Schwarzmann](https://github.com/datajoely) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
+| [Joel Schwarzmann](https://github.com/datajoely) | [Aneira Health](https://www.aneira.health) |
| [Juan Luis Cano](https://github.com/astrojuanlu) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
| [Laura Couto](https://github.com/lrcouto) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
-| [Marcin Zabłocki](https://github.com/marrrcin) | [Printify, Inc.](https://printify.com/) |
+| [Marcin Zabłocki](https://github.com/marrrcin) | [Printify, Inc.](https://printify.com/) |
| [Merel Theisen](https://github.com/merelcht) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
| [Nok Lam Chan](https://github.com/noklam) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
| [Rashida Kanchwala](https://github.com/rashidakanchwala) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
34 changes: 17 additions & 17 deletions docs/source/data/how_to_create_a_custom_dataset.md
@@ -4,7 +4,7 @@

## AbstractDataset

-If you are a contributor and would like to submit a new dataset, you must extend the {py:class}`~kedro.io.AbstractDataset` interface or {py:class}`~kedro.io.AbstractVersionedDataset` interface if you plan to support versioning. It requires subclasses to override the `_load` and `_save` and provides `load` and `save` methods that enrich the corresponding private methods with uniform error handling. It also requires subclasses to override `_describe`, which is used in logging the internal information about the instances of your custom `AbstractDataset` implementation.
+If you are a contributor and would like to submit a new dataset, you must extend the {py:class}`~kedro.io.AbstractDataset` interface or {py:class}`~kedro.io.AbstractVersionedDataset` interface if you plan to support versioning. It requires subclasses to implement the `load` and `save` methods while providing wrappers that enrich the corresponding methods with uniform error handling. It also requires subclasses to override `_describe`, which is used in logging the internal information about the instances of your custom `AbstractDataset` implementation.
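
Conceptually, the public `load` wrapper behaves roughly like the sketch below; this is a simplified illustration of the uniform error handling described above, not Kedro's actual source:

```python
# Simplified sketch, not Kedro's actual implementation.
from kedro.io.core import DatasetError

class _SketchDataset:
    def load(self):
        try:
            return self._load_implementation()  # hypothetical subclass hook
        except DatasetError:
            raise  # already a well-formed dataset error
        except Exception as exc:
            # Any other failure is wrapped uniformly for consistent reporting.
            raise DatasetError(
                f"Failed while loading data from dataset {self}."
            ) from exc
```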


## Scenario
@@ -31,8 +31,8 @@ Consult the [Pillow documentation](https://pillow.readthedocs.io/en/stable/insta

At the minimum, a valid Kedro dataset needs to subclass the base {py:class}`~kedro.io.AbstractDataset` and provide an implementation for the following abstract methods:

-* `_load`
-* `_save`
+* `load`
+* `save`
* `_describe`

`AbstractDataset` is generically typed with an input data type for saving data, and an output data type for loading data.
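
For instance, a dataset that saves and loads `pandas` DataFrames would be declared along these lines (a minimal sketch; the `ImageDataset` below uses `np.ndarray` for both parameters):

```python
import pandas as pd

from kedro.io import AbstractDataset


# First type parameter: what save() accepts; second: what load() returns.
class DataFrameDataset(AbstractDataset[pd.DataFrame, pd.DataFrame]):
    def load(self) -> pd.DataFrame: ...

    def save(self, data: pd.DataFrame) -> None: ...

    def _describe(self) -> dict: ...
```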
@@ -70,15 +70,15 @@ class ImageDataset(AbstractDataset[np.ndarray, np.ndarray]):
"""
self._filepath = filepath

-def _load(self) -> np.ndarray:
+def load(self) -> np.ndarray:
"""Loads data from the image file.
Returns:
Data from the image file as a numpy array.
"""
...

-def _save(self, data: np.ndarray) -> None:
+def save(self, data: np.ndarray) -> None:
"""Saves image data to the specified filepath"""
...

@@ -96,11 +96,11 @@ src/kedro_pokemon/datasets
└── image_dataset.py
```

-## Implement the `_load` method with `fsspec`
+## Implement the `load` method with `fsspec`

Many of the built-in Kedro datasets rely on [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) as a consistent interface to different data sources, as described earlier in the section about the [Data Catalog](../data/data_catalog.md#dataset-filepath). In this example, it's particularly convenient to use `fsspec` in conjunction with `Pillow` to read image data, since it allows the dataset to work flexibly with different image locations and formats.
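
For instance, Kedro's `get_protocol_and_path` helper splits a single filepath string into an `fsspec` protocol and a path (the S3 location below is illustrative):

```python
import fsspec

from kedro.io.core import get_protocol_and_path

protocol, path = get_protocol_and_path("s3://my-bucket/pokemon/pikachu.png")
# protocol == "s3"; path == "my-bucket/pokemon/pikachu.png"
fs = fsspec.filesystem(protocol)
```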

-Here is the implementation of the `_load` method using `fsspec` and `Pillow` to read the data of a single image into a `numpy` array:
+Here is the implementation of the `load` method using `fsspec` and `Pillow` to read the data of a single image into a `numpy` array:

<details>
<summary><b>Click to expand</b></summary>
@@ -130,7 +130,7 @@ class ImageDataset(AbstractDataset[np.ndarray, np.ndarray]):
self._filepath = PurePosixPath(path)
self._fs = fsspec.filesystem(self._protocol)

-def _load(self) -> np.ndarray:
+def load(self) -> np.ndarray:
"""Loads data from the image file.
Returns:
@@ -168,14 +168,14 @@ In [2]: from PIL import Image
In [3]: Image.fromarray(image).show()
```

-## Implement the `_save` method with `fsspec`
+## Implement the `save` method with `fsspec`

Similarly, we can implement the `_save` method as follows:


```python
class ImageDataset(AbstractDataset[np.ndarray, np.ndarray]):
-def _save(self, data: np.ndarray) -> None:
+def save(self, data: np.ndarray) -> None:
"""Saves image data to the specified filepath."""
# using get_filepath_str ensures that the protocol and path are appended correctly for different filesystems
save_path = get_filepath_str(self._filepath, self._protocol)
@@ -243,7 +243,7 @@ class ImageDataset(AbstractDataset[np.ndarray, np.ndarray]):
self._filepath = PurePosixPath(path)
self._fs = fsspec.filesystem(self._protocol)

-def _load(self) -> np.ndarray:
+def load(self) -> np.ndarray:
"""Loads data from the image file.
Returns:
@@ -254,7 +254,7 @@ class ImageDataset(AbstractDataset[np.ndarray, np.ndarray]):
image = Image.open(f).convert("RGBA")
return np.asarray(image)

-def _save(self, data: np.ndarray) -> None:
+def save(self, data: np.ndarray) -> None:
"""Saves image data to the specified filepath."""
save_path = get_filepath_str(self._filepath, self._protocol)
with self._fs.open(save_path, mode="wb") as f:
@@ -312,7 +312,7 @@ To add versioning support to the new dataset we need to extend the
{py:class}`~kedro.io.AbstractVersionedDataset` to:

* Accept a `version` keyword argument as part of the constructor
-* Adapt the `_load` and `_save` method to use the versioned data path obtained from `_get_load_path` and `_get_save_path` respectively
+* Adapt the `load` and `save` method to use the versioned data path obtained from `_get_load_path` and `_get_save_path` respectively
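
For example, instantiating the versioned dataset directly might look like the sketch below; `Version(load=None, save=None)` loads the latest available version and saves a new timestamped one:

```python
from kedro.io.core import Version

dataset = ImageDataset(
    filepath="data/01_raw/pokemon-images-and-types/images/images/pikachu.png",
    version=Version(load=None, save=None),
)
```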

The following amends the full implementation of our basic `ImageDataset`. It now loads and saves data to and from a versioned subfolder (`data/01_raw/pokemon-images-and-types/images/images/pikachu.png/<version>/pikachu.png` with `version` being a datetime-formatted string `YYYY-MM-DDThh.mm.ss.sssZ` by default):

@@ -359,7 +359,7 @@ class ImageDataset(AbstractVersionedDataset[np.ndarray, np.ndarray]):
glob_function=self._fs.glob,
)

-def _load(self) -> np.ndarray:
+def load(self) -> np.ndarray:
"""Loads data from the image file.
Returns:
@@ -370,7 +370,7 @@ class ImageDataset(AbstractVersionedDataset[np.ndarray, np.ndarray]):
image = Image.open(f).convert("RGBA")
return np.asarray(image)

-def _save(self, data: np.ndarray) -> None:
+def save(self, data: np.ndarray) -> None:
"""Saves image data to the specified filepath."""
save_path = get_filepath_str(self._get_save_path(), self._protocol)
with self._fs.open(save_path, mode="wb") as f:
@@ -435,7 +435,7 @@ The difference between the original `ImageDataset` and the versioned `ImageDatas
+ glob_function=self._fs.glob,
+ )
+
-def _load(self) -> np.ndarray:
+def load(self) -> np.ndarray:
"""Loads data from the image file.

Returns:
@@ -447,7 +447,7 @@ The difference between the original `ImageDataset` and the versioned `ImageDatas
image = Image.open(f).convert("RGBA")
return np.asarray(image)

-def _save(self, data: np.ndarray) -> None:
+def save(self, data: np.ndarray) -> None:
"""Saves image data to the specified filepath."""
- save_path = get_filepath_str(self._filepath, self._protocol)
+ save_path = get_filepath_str(self._get_save_path(), self._protocol)
2 changes: 1 addition & 1 deletion docs/source/deployment/aws_step_functions.md
@@ -156,7 +156,7 @@ This file acts as the handler for each Lambda function in our pipeline, receives
```Dockerfile
# Define global args
ARG FUNCTION_DIR="/home/app/"
ARG RUNTIME_VERSION="3.8"
ARG RUNTIME_VERSION="3.9"

# Stage 1 - bundle base image + runtime
# Grab a fresh copy of the image and install GCC