From 09f9123c55aa964e3fae13617a6dd3cdb7ad6db4 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Mon, 21 Oct 2024 11:17:17 +0100 Subject: [PATCH 1/5] Update DataCatalog benchmark tests Signed-off-by: Ankita Katiyar --- benchmarks/benchmark_datacatalog.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_datacatalog.py b/benchmarks/benchmark_datacatalog.py index 15de4ef310..4d00c97ba4 100644 --- a/benchmarks/benchmark_datacatalog.py +++ b/benchmarks/benchmark_datacatalog.py @@ -62,11 +62,15 @@ def time_release(self): def time_add_all(self): """Benchmark the time to add all datasets""" - self.catalog.add_all(self.datasets) + # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError + catalog = DataCatalog.from_config(base_catalog) + catalog.add_all(self.datasets) def time_feed_dict(self): """Benchmark the time to add feed dict""" - self.catalog.add_feed_dict(self.feed_dict) + # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError + catalog = DataCatalog.from_config(base_catalog) + catalog.add_feed_dict(self.feed_dict) def time_list(self): """Benchmark the time to list all datasets""" From 70c7b7eb01f9c1f22fd9f8b37df22954d6c1ce81 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Mon, 21 Oct 2024 11:40:03 +0100 Subject: [PATCH 2/5] Add tests for KedroDataCatalog first pass Signed-off-by: Ankita Katiyar --- benchmarks/benchmark_kedrodatacatalog.py | 86 ++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 benchmarks/benchmark_kedrodatacatalog.py diff --git a/benchmarks/benchmark_kedrodatacatalog.py b/benchmarks/benchmark_kedrodatacatalog.py new file mode 100644 index 0000000000..f9e41547c0 --- /dev/null +++ b/benchmarks/benchmark_kedrodatacatalog.py @@ -0,0 +1,86 @@ +import pandas as pd +from kedro_datasets.pandas import CSVDataset + +from kedro.io import KedroDataCatalog + +base_catalog = { + f"dataset_{i}": { + "type": "pandas.CSVDataset", + "filepath": f"data_{i}.csv", + } for i in range(1, 1001) +} +# Add datasets with the same filepath for loading +base_catalog.update({ + f"dataset_load_{i}": { + "type": "pandas.CSVDataset", + "filepath": "data.csv", + } for i in range(1, 1001) +}) +# Add a factory pattern +base_catalog.update({ + "dataset_factory_{placeholder}": { + "type": "pandas.CSVDataset", + "filepath": "data_{placeholder}.csv", + } +}) + +class TimeKedroDataCatalog: + def setup(self): + self.catalog = KedroDataCatalog.from_config(base_catalog) + self.dataframe = pd.DataFrame({"column": [1, 2, 3]}) + self.dataframe.to_csv("data.csv", index=False) + self.datasets = { + f"dataset_new_{i}": CSVDataset(filepath="data.csv") for i in range(1, 1001) + } + self.feed_dict = { + f"param_{i}": i for i in range(1, 1001) + } + + def time_init(self): + """Benchmark the time to initialize the catalog""" + KedroDataCatalog.from_config(base_catalog) + + def time_save(self): + """Benchmark the time to save datasets""" + for i in range(1,1001): + self.catalog.save(f"dataset_{i}", self.dataframe) + + def time_load(self): + """Benchmark the time to load datasets""" + for i in range(1,1001): + self.catalog.load(f"dataset_load_{i}") + + def time_exists(self): + """Benchmark the time to check if datasets exist""" + for i in range(1,1001): + self.catalog.exists(f"dataset_{i}") + + def time_release(self): + """Benchmark the time to release datasets""" + for i in range(1,1001): + self.catalog.release(f"dataset_{i}") + + def time_add_all(self): + """Benchmark the time to add all datasets""" + # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError + catalog = KedroDataCatalog.from_config(base_catalog) + catalog.add_all(self.datasets) + + def time_feed_dict(self): + """Benchmark the time to add feed dict""" + # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError + catalog = KedroDataCatalog.from_config(base_catalog) + catalog.add_feed_dict(self.feed_dict) + + def time_list(self): + """Benchmark the time to list all datasets""" + self.catalog.list() + + def time_shallow_copy(self): + """Benchmark the time to shallow copy the catalog""" + self.catalog.shallow_copy() + + def time_resolve_factory(self): + """Benchmark the time to resolve factory""" + for i in range(1,1001): + self.catalog._get_dataset(f"dataset_factory_{i}") From ca3f4337cf170b4f058510ed20c8060150e931d3 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Mon, 21 Oct 2024 11:59:19 +0100 Subject: [PATCH 3/5] Add benchmarks for KedroDataCatalog Signed-off-by: Ankita Katiyar --- benchmarks/benchmark_kedrodatacatalog.py | 33 +++++++++++++++--------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/benchmarks/benchmark_kedrodatacatalog.py b/benchmarks/benchmark_kedrodatacatalog.py index f9e41547c0..7a4b7a1ab0 100644 --- a/benchmarks/benchmark_kedrodatacatalog.py +++ b/benchmarks/benchmark_kedrodatacatalog.py @@ -40,6 +40,26 @@ def time_init(self): """Benchmark the time to initialize the catalog""" KedroDataCatalog.from_config(base_catalog) + def time_contains(self): + """Benchmark the time to check if a dataset exists""" + for i in range(1,1001): + f"dataset_{i}" in self.catalog + + def time_getitem(self): + """Benchmark the time to get a dataset""" + for i in range(1,1001): + self.catalog[f"dataset_{i}"] + + def time_setitem(self): + """Benchmark the time to set a dataset""" + for i in range(1,1001): + self.catalog[f"dataset_new_{i}"] = CSVDataset(filepath="data.csv") + + def time_getdataset(self): + """Benchmark the time to get a dataset""" + for i in range(1,1001): + self.catalog.get_dataset(f"dataset_{i}") + def time_save(self): """Benchmark the time to save datasets""" for i in range(1,1001): @@ -60,24 +80,13 @@ def time_release(self): for i in range(1,1001): self.catalog.release(f"dataset_{i}") - def time_add_all(self): - """Benchmark the time to add all datasets""" - # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError - catalog = KedroDataCatalog.from_config(base_catalog) - catalog.add_all(self.datasets) - - def time_feed_dict(self): - """Benchmark the time to add feed dict""" - # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError - catalog = KedroDataCatalog.from_config(base_catalog) - catalog.add_feed_dict(self.feed_dict) - def time_list(self): """Benchmark the time to list all datasets""" self.catalog.list() def time_shallow_copy(self): """Benchmark the time to shallow copy the catalog""" + # Will be removed self.catalog.shallow_copy() def time_resolve_factory(self): From 1dd9347614d6902c75612054be757c207de98945 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Mon, 21 Oct 2024 15:50:43 +0100 Subject: [PATCH 4/5] Add suggested tests Signed-off-by: Ankita Katiyar --- benchmarks/benchmark_kedrodatacatalog.py | 29 +++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_kedrodatacatalog.py b/benchmarks/benchmark_kedrodatacatalog.py index 7a4b7a1ab0..95c0c6333b 100644 --- a/benchmarks/benchmark_kedrodatacatalog.py +++ b/benchmarks/benchmark_kedrodatacatalog.py @@ -50,15 +50,38 @@ def time_getitem(self): for i in range(1,1001): self.catalog[f"dataset_{i}"] + + def time_get(self): + """Benchmark the time to get a dataset""" + for i in range(1,1001): + self.catalog.get(f"dataset_{i}") + + def time_iter(self): + """Benchmark the time to iterate over the catalog""" + for dataset in self.catalog: + pass + + def time_keys(self): + """Benchmark the time to get the keys of the catalog""" + self.catalog.keys() + + def time_values(self): + """Benchmark the time to get the items of the catalog""" + self.catalog.values() + + def time_items(self): + """Benchmark the time to get the items of the catalog""" + self.catalog.items() + def time_setitem(self): """Benchmark the time to set a dataset""" for i in range(1,1001): self.catalog[f"dataset_new_{i}"] = CSVDataset(filepath="data.csv") - def time_getdataset(self): - """Benchmark the time to get a dataset""" + def time_setitem_raw(self): + """Benchmark the time to add a memory dataset""" for i in range(1,1001): - self.catalog.get_dataset(f"dataset_{i}") + self.catalog[f"param_{i}"] = self.feed_dict[f"param_{i}"] def time_save(self): """Benchmark the time to save datasets""" From 1e1a0c86759eec66b8152a349c98bc716f1c9f34 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Mon, 21 Oct 2024 17:09:13 +0100 Subject: [PATCH 5/5] Add suggested tests Signed-off-by: Ankita Katiyar --- benchmarks/benchmark_kedrodatacatalog.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_kedrodatacatalog.py b/benchmarks/benchmark_kedrodatacatalog.py index 95c0c6333b..6844f8eda5 100644 --- a/benchmarks/benchmark_kedrodatacatalog.py +++ b/benchmarks/benchmark_kedrodatacatalog.py @@ -24,6 +24,13 @@ } }) +runtime_patterns = { + "{placeholder}": { + "type": "pandas.CSVDataset", + "filepath": "{placeholder}.csv", + } +} + class TimeKedroDataCatalog: def setup(self): self.catalog = KedroDataCatalog.from_config(base_catalog) @@ -115,4 +122,9 @@ def time_shallow_copy(self): def time_resolve_factory(self): """Benchmark the time to resolve factory""" for i in range(1,1001): - self.catalog._get_dataset(f"dataset_factory_{i}") + self.catalog.get(f"dataset_factory_{i}") + + def time_add_runtime_patterns(self): + """Benchmark the time to add runtime patterns""" + for i in range(1,1001): + self.catalog.config_resolver.add_runtime_patterns(runtime_patterns)