From 09f9123c55aa964e3fae13617a6dd3cdb7ad6db4 Mon Sep 17 00:00:00 2001
From: Ankita Katiyar <ankitakatiyar2401@gmail.com>
Date: Mon, 21 Oct 2024 11:17:17 +0100
Subject: [PATCH 1/5] Update DataCatalog benchmark tests

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>
---
 benchmarks/benchmark_datacatalog.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_datacatalog.py b/benchmarks/benchmark_datacatalog.py
index 15de4ef310..4d00c97ba4 100644
--- a/benchmarks/benchmark_datacatalog.py
+++ b/benchmarks/benchmark_datacatalog.py
@@ -62,11 +62,15 @@ def time_release(self):
 
     def time_add_all(self):
         """Benchmark the time to add all datasets"""
-        self.catalog.add_all(self.datasets)
+        # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError
+        catalog = DataCatalog.from_config(base_catalog)
+        catalog.add_all(self.datasets)
 
     def time_feed_dict(self):
         """Benchmark the time to add feed dict"""
-        self.catalog.add_feed_dict(self.feed_dict)
+        # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError
+        catalog = DataCatalog.from_config(base_catalog)
+        catalog.add_feed_dict(self.feed_dict)
 
     def time_list(self):
         """Benchmark the time to list all datasets"""

From 70c7b7eb01f9c1f22fd9f8b37df22954d6c1ce81 Mon Sep 17 00:00:00 2001
From: Ankita Katiyar <ankitakatiyar2401@gmail.com>
Date: Mon, 21 Oct 2024 11:40:03 +0100
Subject: [PATCH 2/5] Add tests for KedroDataCatalog first pass

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>
---
 benchmarks/benchmark_kedrodatacatalog.py | 86 ++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 benchmarks/benchmark_kedrodatacatalog.py

diff --git a/benchmarks/benchmark_kedrodatacatalog.py b/benchmarks/benchmark_kedrodatacatalog.py
new file mode 100644
index 0000000000..f9e41547c0
--- /dev/null
+++ b/benchmarks/benchmark_kedrodatacatalog.py
@@ -0,0 +1,86 @@
+import pandas as pd
+from kedro_datasets.pandas import CSVDataset
+
+from kedro.io import KedroDataCatalog
+
+base_catalog = {
+    f"dataset_{i}": {
+        "type": "pandas.CSVDataset",
+        "filepath": f"data_{i}.csv",
+    } for i in range(1, 1001)
+}
+# Add datasets with the same filepath for loading
+base_catalog.update({
+    f"dataset_load_{i}": {
+        "type": "pandas.CSVDataset",
+        "filepath": "data.csv",
+    } for i in range(1, 1001)
+})
+# Add a factory pattern
+base_catalog.update({
+    "dataset_factory_{placeholder}": {
+        "type": "pandas.CSVDataset",
+        "filepath": "data_{placeholder}.csv",
+    }
+})
+
+class TimeKedroDataCatalog:
+    def setup(self):
+        self.catalog = KedroDataCatalog.from_config(base_catalog)
+        self.dataframe = pd.DataFrame({"column": [1, 2, 3]})
+        self.dataframe.to_csv("data.csv", index=False)
+        self.datasets = {
+            f"dataset_new_{i}": CSVDataset(filepath="data.csv") for i in range(1, 1001)
+        }
+        self.feed_dict = {
+            f"param_{i}": i for i in range(1, 1001)
+        }
+
+    def time_init(self):
+        """Benchmark the time to initialize the catalog"""
+        KedroDataCatalog.from_config(base_catalog)
+
+    def time_save(self):
+        """Benchmark the time to save datasets"""
+        for i in range(1,1001):
+            self.catalog.save(f"dataset_{i}", self.dataframe)
+
+    def time_load(self):
+        """Benchmark the time to load datasets"""
+        for i in range(1,1001):
+            self.catalog.load(f"dataset_load_{i}")
+
+    def time_exists(self):
+        """Benchmark the time to check if datasets exist"""
+        for i in range(1,1001):
+            self.catalog.exists(f"dataset_{i}")
+
+    def time_release(self):
+        """Benchmark the time to release datasets"""
+        for i in range(1,1001):
+            self.catalog.release(f"dataset_{i}")
+
+    def time_add_all(self):
+        """Benchmark the time to add all datasets"""
+        # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError
+        catalog = KedroDataCatalog.from_config(base_catalog)
+        catalog.add_all(self.datasets)
+
+    def time_feed_dict(self):
+        """Benchmark the time to add feed dict"""
+        # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError
+        catalog = KedroDataCatalog.from_config(base_catalog)
+        catalog.add_feed_dict(self.feed_dict)
+
+    def time_list(self):
+        """Benchmark the time to list all datasets"""
+        self.catalog.list()
+
+    def time_shallow_copy(self):
+        """Benchmark the time to shallow copy the catalog"""
+        self.catalog.shallow_copy()
+
+    def time_resolve_factory(self):
+        """Benchmark the time to resolve factory"""
+        for i in range(1,1001):
+            self.catalog._get_dataset(f"dataset_factory_{i}")

From ca3f4337cf170b4f058510ed20c8060150e931d3 Mon Sep 17 00:00:00 2001
From: Ankita Katiyar <ankitakatiyar2401@gmail.com>
Date: Mon, 21 Oct 2024 11:59:19 +0100
Subject: [PATCH 3/5] Add benchmarks for KedroDataCatalog

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>
---
 benchmarks/benchmark_kedrodatacatalog.py | 33 +++++++++++++++---------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/benchmarks/benchmark_kedrodatacatalog.py b/benchmarks/benchmark_kedrodatacatalog.py
index f9e41547c0..7a4b7a1ab0 100644
--- a/benchmarks/benchmark_kedrodatacatalog.py
+++ b/benchmarks/benchmark_kedrodatacatalog.py
@@ -40,6 +40,26 @@ def time_init(self):
         """Benchmark the time to initialize the catalog"""
         KedroDataCatalog.from_config(base_catalog)
 
+    def time_contains(self):
+        """Benchmark the time to check if a dataset exists"""
+        for i in range(1,1001):
+            f"dataset_{i}" in self.catalog
+
+    def time_getitem(self):
+        """Benchmark the time to get a dataset"""
+        for i in range(1,1001):
+            self.catalog[f"dataset_{i}"]
+
+    def time_setitem(self):
+        """Benchmark the time to set a dataset"""
+        for i in range(1,1001):
+            self.catalog[f"dataset_new_{i}"] = CSVDataset(filepath="data.csv")
+
+    def time_getdataset(self):
+        """Benchmark the time to get a dataset"""
+        for i in range(1,1001):
+            self.catalog.get_dataset(f"dataset_{i}")
+
     def time_save(self):
         """Benchmark the time to save datasets"""
         for i in range(1,1001):
@@ -60,24 +80,13 @@ def time_release(self):
         for i in range(1,1001):
             self.catalog.release(f"dataset_{i}")
 
-    def time_add_all(self):
-        """Benchmark the time to add all datasets"""
-        # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError
-        catalog = KedroDataCatalog.from_config(base_catalog)
-        catalog.add_all(self.datasets)
-
-    def time_feed_dict(self):
-        """Benchmark the time to add feed dict"""
-        # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError
-        catalog = KedroDataCatalog.from_config(base_catalog)
-        catalog.add_feed_dict(self.feed_dict)
-
     def time_list(self):
         """Benchmark the time to list all datasets"""
         self.catalog.list()
 
     def time_shallow_copy(self):
         """Benchmark the time to shallow copy the catalog"""
+        # Will be removed
         self.catalog.shallow_copy()
 
     def time_resolve_factory(self):

From 1dd9347614d6902c75612054be757c207de98945 Mon Sep 17 00:00:00 2001
From: Ankita Katiyar <ankitakatiyar2401@gmail.com>
Date: Mon, 21 Oct 2024 15:50:43 +0100
Subject: [PATCH 4/5] Add suggested benchmarks for get, iteration, keys/values/items and raw setitem

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>
---
 benchmarks/benchmark_kedrodatacatalog.py | 29 +++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/benchmarks/benchmark_kedrodatacatalog.py b/benchmarks/benchmark_kedrodatacatalog.py
index 7a4b7a1ab0..95c0c6333b 100644
--- a/benchmarks/benchmark_kedrodatacatalog.py
+++ b/benchmarks/benchmark_kedrodatacatalog.py
@@ -50,15 +50,38 @@ def time_getitem(self):
         for i in range(1,1001):
             self.catalog[f"dataset_{i}"]
 
+
+    def time_get(self):
+        """Benchmark the time to get a dataset"""
+        for i in range(1,1001):
+            self.catalog.get(f"dataset_{i}")
+
+    def time_iter(self):
+        """Benchmark the time to iterate over the catalog"""
+        for dataset in self.catalog:
+            pass
+
+    def time_keys(self):
+        """Benchmark the time to get the keys of the catalog"""
+        self.catalog.keys()
+
+    def time_values(self):
+        """Benchmark the time to get the values of the catalog"""
+        self.catalog.values()
+
+    def time_items(self):
+        """Benchmark the time to get the items of the catalog"""
+        self.catalog.items()
+
     def time_setitem(self):
         """Benchmark the time to set a dataset"""
         for i in range(1,1001):
             self.catalog[f"dataset_new_{i}"] = CSVDataset(filepath="data.csv")
 
-    def time_getdataset(self):
-        """Benchmark the time to get a dataset"""
+    def time_setitem_raw(self):
+        """Benchmark the time to add a memory dataset"""
         for i in range(1,1001):
-            self.catalog.get_dataset(f"dataset_{i}")
+            self.catalog[f"param_{i}"] = self.feed_dict[f"param_{i}"]
 
     def time_save(self):
         """Benchmark the time to save datasets"""

From 1e1a0c86759eec66b8152a349c98bc716f1c9f34 Mon Sep 17 00:00:00 2001
From: Ankita Katiyar <ankitakatiyar2401@gmail.com>
Date: Mon, 21 Oct 2024 17:09:13 +0100
Subject: [PATCH 5/5] Add suggested runtime-patterns benchmark and resolve factory via get()

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>
---
 benchmarks/benchmark_kedrodatacatalog.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_kedrodatacatalog.py b/benchmarks/benchmark_kedrodatacatalog.py
index 95c0c6333b..6844f8eda5 100644
--- a/benchmarks/benchmark_kedrodatacatalog.py
+++ b/benchmarks/benchmark_kedrodatacatalog.py
@@ -24,6 +24,13 @@
     }
 })
 
+runtime_patterns = {
+    "{placeholder}": {
+        "type": "pandas.CSVDataset",
+        "filepath": "{placeholder}.csv",
+    }
+}
+
 class TimeKedroDataCatalog:
     def setup(self):
         self.catalog = KedroDataCatalog.from_config(base_catalog)
@@ -115,4 +122,9 @@ def time_shallow_copy(self):
     def time_resolve_factory(self):
         """Benchmark the time to resolve factory"""
         for i in range(1,1001):
-            self.catalog._get_dataset(f"dataset_factory_{i}")
+            self.catalog.get(f"dataset_factory_{i}")
+
+    def time_add_runtime_patterns(self):
+        """Benchmark the time to add runtime patterns"""
+        for i in range(1,1001):
+            self.catalog.config_resolver.add_runtime_patterns(runtime_patterns)