Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add benchmarks for KedroDataCatalog and fix tests for DataCatalog #4246

Merged
merged 6 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions benchmarks/benchmark_datacatalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,15 @@ def time_release(self):

def time_add_all(self):
    """Benchmark the time to add all datasets.

    The datasets are added to a freshly built catalog on every call rather
    than to the shared ``self.catalog``.
    """
    # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError
    catalog = DataCatalog.from_config(base_catalog)
    catalog.add_all(self.datasets)

def time_feed_dict(self):
    """Benchmark the time to add feed dict.

    The feed dict is added to a freshly built catalog on every call rather
    than to the shared ``self.catalog``.
    """
    # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError
    catalog = DataCatalog.from_config(base_catalog)
    catalog.add_feed_dict(self.feed_dict)

def time_list(self):
"""Benchmark the time to list all datasets"""
Expand Down
130 changes: 130 additions & 0 deletions benchmarks/benchmark_kedrodatacatalog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import pandas as pd
from kedro_datasets.pandas import CSVDataset

from kedro.io import KedroDataCatalog

def _csv_entry(filepath):
    """Return a fresh catalog config entry for a pandas CSV dataset at *filepath*."""
    return {"type": "pandas.CSVDataset", "filepath": filepath}


# Catalog configuration shared by the benchmarks in this module.
base_catalog = {
    # 1000 datasets, each with its own file path.
    **{f"dataset_{i}": _csv_entry(f"data_{i}.csv") for i in range(1, 1001)},
    # Add datasets with the same filepath for loading
    **{f"dataset_load_{i}": _csv_entry("data.csv") for i in range(1, 1001)},
    # Add a factory pattern
    "dataset_factory_{placeholder}": _csv_entry("data_{placeholder}.csv"),
}

# A single catch-all pattern, kept separate from ``base_catalog``.
runtime_patterns = {"{placeholder}": _csv_entry("{placeholder}.csv")}

class TimeKedroDataCatalog:
    """Timing benchmarks for ``KedroDataCatalog`` operations."""

    def setup(self):
        """Build the catalog, sample data and inputs the ``time_*`` methods use."""
        self.catalog = KedroDataCatalog.from_config(base_catalog)

        # Write a small CSV to "data.csv", the path used by the
        # ``dataset_load_*`` entries of ``base_catalog``.
        frame = pd.DataFrame({"column": [1, 2, 3]})
        self.dataframe = frame
        frame.to_csv("data.csv", index=False)

        self.datasets = {
            f"dataset_new_{idx}": CSVDataset(filepath="data.csv")
            for idx in range(1, 1001)
        }
        self.feed_dict = {f"param_{idx}": idx for idx in range(1, 1001)}

ElenaKhaustova marked this conversation as resolved.
Show resolved Hide resolved
def time_init(self):
    """Benchmark building a catalog from the ``base_catalog`` configuration."""
    # The new catalog is discarded on purpose; only construction is timed.
    KedroDataCatalog.from_config(base_catalog)

def time_contains(self):
    """Benchmark membership checks (``name in catalog``) for 1000 names."""
    for idx in range(1, 1001):
        # Bare expression: the boolean result is intentionally unused.
        f"dataset_{idx}" in self.catalog

def time_getitem(self):
    """Benchmark subscript access (``catalog[name]``) for 1000 names."""
    for idx in range(1, 1001):
        # Bare expression: the looked-up dataset is intentionally unused.
        self.catalog[f"dataset_{idx}"]


def time_get(self):
    """Benchmark ``catalog.get(name)`` for 1000 names."""
    for idx in range(1, 1001):
        self.catalog.get(f"dataset_{idx}")

def time_iter(self):
    """Benchmark one full iteration pass over the catalog."""
    for _ in self.catalog:
        # Nothing to do per item; only the iteration itself is timed.
        continue

def time_keys(self):
    """Benchmark a single ``catalog.keys()`` call."""
    # The returned keys are discarded; only the call is timed.
    self.catalog.keys()

def time_values(self):
    """Benchmark the time to get the values of the catalog"""
    # The returned values are discarded; only the call is timed.
    self.catalog.values()

def time_items(self):
    """Benchmark a single ``catalog.items()`` call."""
    # The returned pairs are discarded; only the call is timed.
    self.catalog.items()

def time_setitem(self):
    """Benchmark the time to set a dataset

    Uses the dataset instances prepared in ``setup`` (``self.datasets``, keyed
    ``dataset_new_<i>``) so that only the assignment into the catalog is
    timed, not the construction of each dataset object.
    """
    for name, dataset in self.datasets.items():
        self.catalog[name] = dataset
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we please also add a benchmark for setting raw data, so that this part of the setter is covered:

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a separate test for this


def time_setitem_raw(self):
    """Benchmark assigning the raw ``self.feed_dict`` values into the catalog."""
    # NOTE(review): presumably the catalog stores each raw value as a memory
    # dataset, as the original description says — confirm.
    for idx in range(1, 1001):
        key = f"param_{idx}"
        self.catalog[key] = self.feed_dict[key]

def time_save(self):
    """Benchmark saving ``self.dataframe`` to each of 1000 datasets."""
    for idx in range(1, 1001):
        self.catalog.save(f"dataset_{idx}", self.dataframe)

def time_load(self):
    """Benchmark loading each of the 1000 ``dataset_load_*`` datasets."""
    for idx in range(1, 1001):
        # The loaded data is discarded; only the load call is timed.
        self.catalog.load(f"dataset_load_{idx}")

def time_exists(self):
    """Benchmark ``catalog.exists(name)`` for 1000 dataset names."""
    for idx in range(1, 1001):
        self.catalog.exists(f"dataset_{idx}")

def time_release(self):
    """Benchmark ``catalog.release(name)`` for 1000 dataset names."""
    for idx in range(1, 1001):
        self.catalog.release(f"dataset_{idx}")

def time_list(self):
    """Benchmark a single ``catalog.list()`` call."""
    # The returned listing is discarded; only the call is timed.
    self.catalog.list()

def time_shallow_copy(self):
    """Benchmark a single ``catalog.shallow_copy()`` call."""
    # Will be removed
    self.catalog.shallow_copy()

def time_resolve_factory(self):
    """Benchmark ``catalog.get`` for 1000 ``dataset_factory_<i>`` names.

    These names are not listed individually in ``base_catalog``; they follow
    the shape of its ``dataset_factory_{placeholder}`` entry.
    """
    for idx in range(1, 1001):
        self.catalog.get(f"dataset_factory_{idx}")

def time_add_runtime_patterns(self):
    """Benchmark adding ``runtime_patterns`` to the config resolver 1000 times."""
    # The same patterns mapping is passed on every iteration.
    for _ in range(1, 1001):
        self.catalog.config_resolver.add_runtime_patterns(runtime_patterns)