diff --git a/docs/source/data/data_catalog.md b/docs/source/data/data_catalog.md index 568e66ee4f..3edb94632a 100644 --- a/docs/source/data/data_catalog.md +++ b/docs/source/data/data_catalog.md @@ -200,7 +200,7 @@ cars: In this example, `filepath` is used as the basis of a folder that stores versions of the `cars` dataset. Each time a new version is created by a pipeline run it is stored within `data/01_raw/company/cars.csv/<version>/cars.csv`, where `<version>` corresponds to a version string formatted as `YYYY-MM-DDThh.mm.ss.sssZ`. -By default, `kedro run` loads the latest version of the dataset. However, you can also specify a particular versioned data set with `--load-version` flag as follows: +By default, `kedro run` loads the latest version of the dataset. However, you can also specify a particular versioned dataset with the `--load-versions` flag as follows: ```bash kedro run --load-versions=cars:YYYY-MM-DDThh.mm.ss.sssZ diff --git a/docs/source/integrations/mlflow.md b/docs/source/integrations/mlflow.md index e2d06a0295..78d3df6c69 100644 --- a/docs/source/integrations/mlflow.md +++ b/docs/source/integrations/mlflow.md @@ -134,7 +134,7 @@ and you would be able to preview it in the MLflow web UI: ``` :::{warning} -If you get a `Failed while saving data to data set MlflowMatplotlibWriter` error, +If you get a `Failed while saving data to dataset MlflowMatplotlibWriter` error, it's probably because you had already executed `kedro run` while the dataset was marked as `versioned: true`. The solution is to cleanup the old `data/08_reporting/dummy_confusion_matrix.png` directory. ::: diff --git a/docs/source/nodes_and_pipelines/run_a_pipeline.md b/docs/source/nodes_and_pipelines/run_a_pipeline.md index 4eaa06c296..2bf1a99383 100644 --- a/docs/source/nodes_and_pipelines/run_a_pipeline.md +++ b/docs/source/nodes_and_pipelines/run_a_pipeline.md @@ -70,13 +70,13 @@ class DryRunner(AbstractRunner): """ def create_default_dataset(self, ds_name: str) -> AbstractDataset: - """Factory method for creating the default data set for the runner. + """Factory method for creating the default dataset for the runner. Args: - ds_name: Name of the missing data set + ds_name: Name of the missing dataset Returns: An instance of an implementation of AbstractDataset to be used - for all unregistered data sets. + for all unregistered datasets. """ return MemoryDataset() diff --git a/docs/source/tutorial/spaceflights_tutorial_faqs.md b/docs/source/tutorial/spaceflights_tutorial_faqs.md index ff09d0ae91..ab6d7b8020 100644 --- a/docs/source/tutorial/spaceflights_tutorial_faqs.md +++ b/docs/source/tutorial/spaceflights_tutorial_faqs.md @@ -7,11 +7,11 @@ If you can't find the answer you need here, [ask the Kedro community for help](h ## How do I resolve these common errors? ### Dataset errors -#### DatasetError: Failed while loading data from data set +#### DatasetError: Failed while loading data from dataset You're [testing whether Kedro can load the raw test data](./set_up_data.md#test-that-kedro-can-load-the-data) and see the following: ```python -DatasetError: Failed while loading data from data set +DatasetError: Failed while loading data from dataset CSVDataset(filepath=...). [Errno 2] No such file or directory: '.../companies.csv' ``` The above exception was the direct cause of the following exception: Traceback (most recent call last): ...
raise DatasetError(message) from exc -kedro.io.core.DatasetError: Failed while loading data from data set CSVDataset(filepath=data/03_primary/model_input_table.csv, save_args={'index': False}). +kedro.io.core.DatasetError: Failed while loading data from dataset CSVDataset(filepath=data/03_primary/model_input_table.csv, save_args={'index': False}). [Errno 2] File b'data/03_primary/model_input_table.csv' does not exist: b'data/03_primary/model_input_table.csv' ``` diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml index 62280524bd..32da2376b3 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml @@ -1,11 +1,11 @@ -# Here you can define all your data sets by using simple YAML syntax. +# Here you can define all your datasets by using simple YAML syntax. # # Documentation for this file format can be found in "The Data Catalog" # Link: https://docs.kedro.org/en/stable/data/data_catalog.html # # We support interacting with a variety of data stores including local file systems, cloud, network and HDFS # -# An example data set definition can look as follows: +# An example dataset definition can look as follows: # #bikes: # type: pandas.CSVDataset @@ -39,7 +39,7 @@ # (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here: # https://docs.kedro.org/en/stable/data/data_catalog.html # -# This is a data set used by the "Hello World" example pipeline provided with the project +# This is a dataset used by the "Hello World" example pipeline provided with the project # template. Please feel free to remove it once you remove the example pipeline. example_iris_data: diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/local/credentials.yml b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/local/credentials.yml index 7fce832f6a..753fe237ed 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/local/credentials.yml +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/local/credentials.yml @@ -1,4 +1,4 @@ -# Here you can define credentials for different data sets and environment. +# Here you can define credentials for different datasets and environment. # # # Example: diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/nodes.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/nodes.py index 024ea394ed..c492614c33 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/nodes.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/nodes.py @@ -11,7 +11,7 @@ def split_data(data: pd.DataFrame, example_test_data_ratio: float) -> dict[str, Any]: - """Node for splitting the classical Iris data set into training and test + """Node for splitting the classical Iris dataset into training and test sets, each split into features and labels. The split ratio parameter is taken from conf/project/parameters.yml. 
The data and the parameters will be loaded and provided to your function diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 9697e1bd35..6384fd6138 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -1,5 +1,5 @@ """``kedro.io`` provides functionality to read and write to a -number of data sets. At the core of the library is the ``AbstractDataset`` class. +number of datasets. At the core of the library is the ``AbstractDataset`` class. """ from __future__ import annotations diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index dc55d18b3c..f722bedb6e 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -90,7 +90,7 @@ def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> An The set of requested credentials. Raises: - KeyError: When a data set with the given name has not yet been + KeyError: When a dataset with the given name has not yet been registered. """ diff --git a/kedro/io/core.py b/kedro/io/core.py index 53b660835c..981e81ccd7 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -71,7 +71,7 @@ class DatasetError(Exception): class DatasetNotFoundError(DatasetError): """``DatasetNotFoundError`` raised by ``DataCatalog`` class in case of - trying to use a non-existing data set. + trying to use a non-existing dataset. """ pass @@ -79,7 +79,7 @@ class DatasetNotFoundError(DatasetError): class DatasetAlreadyExistsError(DatasetError): """``DatasetAlreadyExistsError`` raised by ``DataCatalog`` class in case - of trying to add a data set which already exists in the ``DataCatalog``. + of trying to add a dataset which already exists in the ``DataCatalog``. """ pass @@ -87,7 +87,7 @@ class DatasetAlreadyExistsError(DatasetError): class VersionNotFoundError(DatasetError): """``VersionNotFoundError`` raised by ``AbstractVersionedDataset`` implementations - in case of no load versions available for the data set. + in case of no load versions available for the dataset. """ pass @@ -98,9 +98,9 @@ class VersionNotFoundError(DatasetError): class AbstractDataset(abc.ABC, Generic[_DI, _DO]): - """``AbstractDataset`` is the base class for all data set implementations. + """``AbstractDataset`` is the base class for all dataset implementations. - All data set implementations should extend this abstract class + All dataset implementations should extend this abstract class and implement the methods marked as abstract. If a specific dataset implementation cannot be used in conjunction with the ``ParallelRunner``, such user-defined dataset should have the @@ -156,23 +156,23 @@ def from_config( load_version: str | None = None, save_version: str | None = None, ) -> AbstractDataset: - """Create a data set instance using the configuration provided. + """Create a dataset instance using the configuration provided. Args: name: Data set name. config: Data set config dictionary. load_version: Version string to be used for ``load`` operation if - the data set is versioned. Has no effect on the data set + the dataset is versioned. Has no effect on the dataset if versioning was not enabled. save_version: Version string to be used for ``save`` operation if - the data set is versioned. Has no effect on the data set + the dataset is versioned. Has no effect on the dataset if versioning was not enabled. Returns: An instance of an ``AbstractDataset`` subclass. 
Raises: - DatasetError: When the function fails to create the data set + DatasetError: When the function fails to create the dataset from its config. """ @@ -245,9 +245,9 @@ def load(self: Self) -> _DO: except DatasetError: raise except Exception as exc: - # This exception handling is by design as the composed data sets + # This exception handling is by design as the composed datasets # can throw any type of exception. - message = f"Failed while loading data from data set {self!s}.\n{exc!s}" + message = f"Failed while loading data from dataset {self!s}.\n{exc!s}" raise DatasetError(message) from exc load.__annotations__["return"] = load_func.__annotations__.get("return") @@ -271,7 +271,7 @@ def save(self: Self, data: _DI) -> None: except (DatasetError, FileNotFoundError, NotADirectoryError): raise except Exception as exc: - message = f"Failed while saving data to data set {self!s}.\n{exc!s}" + message = f"Failed while saving data to dataset {self!s}.\n{exc!s}" raise DatasetError(message) from exc save.__annotations__["data"] = save_func.__annotations__.get("data", Any) @@ -377,7 +377,7 @@ def _describe(self) -> dict[str, Any]: ) def exists(self) -> bool: - """Checks whether a data set's output already exists by calling + """Checks whether a dataset's output already exists by calling the provided _exists() method. Returns: @@ -391,7 +391,7 @@ def exists(self) -> bool: self._logger.debug("Checking whether target of %s exists", str(self)) return self._exists() except Exception as exc: - message = f"Failed during exists check for data set {self!s}.\n{exc!s}" + message = f"Failed during exists check for dataset {self!s}.\n{exc!s}" raise DatasetError(message) from exc def _exists(self) -> bool: @@ -412,7 +412,7 @@ def release(self) -> None: self._logger.debug("Releasing %s", str(self)) self._release() except Exception as exc: - message = f"Failed during release for data set {self!s}.\n{exc!s}" + message = f"Failed during release for dataset {self!s}.\n{exc!s}" raise DatasetError(message) from exc def _release(self) -> None: @@ -438,7 +438,7 @@ def generate_timestamp() -> str: class Version(namedtuple("Version", ["load", "save"])): """This namedtuple is used to provide load and save versions for versioned - data sets. If ``Version.load`` is None, then the latest available version + datasets. If ``Version.load`` is None, then the latest available version is loaded. If ``Version.save`` is None, then save version is formatted as YYYY-MM-DDThh.mm.ss.sssZ of the current timestamp. """ @@ -450,7 +450,7 @@ class Version(namedtuple("Version", ["load", "save"])): "Save version '{}' did not match load version '{}' for {}. This is strongly " "discouraged due to inconsistencies it may cause between 'save' and " "'load' operations. Please refrain from setting exact load version for " - "intermediate data sets where possible to avoid this warning." + "intermediate datasets where possible to avoid this warning." ) _DEFAULT_PACKAGES = ["kedro.io.", "kedro_datasets.", ""] @@ -467,10 +467,10 @@ def parse_dataset_definition( config: Data set config dictionary. It *must* contain the `type` key with fully qualified class name or the class object. load_version: Version string to be used for ``load`` operation if - the data set is versioned. Has no effect on the data set + the dataset is versioned. Has no effect on the dataset if versioning was not enabled. save_version: Version string to be used for ``save`` operation if - the data set is versioned. Has no effect on the data set + the dataset is versioned. 
Has no effect on the dataset if versioning was not enabled. Raises: @@ -522,14 +522,14 @@ def parse_dataset_definition( if not issubclass(class_obj, AbstractDataset): raise DatasetError( f"Dataset type '{class_obj.__module__}.{class_obj.__qualname__}' " - f"is invalid: all data set types must extend 'AbstractDataset'." + f"is invalid: all dataset types must extend 'AbstractDataset'." ) if VERSION_KEY in config: # remove "version" key so that it's not passed - # to the "unversioned" data set constructor + # to the "unversioned" dataset constructor message = ( - "'%s' attribute removed from data set configuration since it is a " + "'%s' attribute removed from dataset configuration since it is a " "reserved word and cannot be directly specified" ) logging.getLogger(__name__).warning(message, VERSION_KEY) @@ -579,10 +579,10 @@ def _local_exists(local_filepath: str) -> bool: # SKIP_IF_NO_SPARK class AbstractVersionedDataset(AbstractDataset[_DI, _DO], abc.ABC): """ - ``AbstractVersionedDataset`` is the base class for all versioned data set + ``AbstractVersionedDataset`` is the base class for all versioned dataset implementations. - All data sets that implement versioning should extend this + All datasets that implement versioning should extend this abstract class and implement the methods marked as abstract. Example: @@ -764,7 +764,7 @@ def save(self: Self, data: _DI) -> None: return save def exists(self) -> bool: - """Checks whether a data set's output already exists by calling + """Checks whether a dataset's output already exists by calling the provided _exists() method. Returns: @@ -780,7 +780,7 @@ def exists(self) -> bool: except VersionNotFoundError: return False except Exception as exc: # SKIP_IF_NO_SPARK - message = f"Failed during exists check for data set {self!s}.\n{exc!s}" + message = f"Failed during exists check for dataset {self!s}.\n{exc!s}" raise DatasetError(message) from exc def _release(self) -> None: @@ -938,7 +938,7 @@ def add_feed_dict(self, datasets: dict[str, Any], replace: bool = False) -> None ... def exists(self, name: str) -> bool: - """Checks whether registered data set exists by calling its `exists()` method.""" + """Checks whether registered dataset exists by calling its `exists()` method.""" ... def release(self, name: str) -> None: diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index a010f3e852..6f9a678272 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -2,7 +2,7 @@ provide ``load`` and ``save`` capabilities from anywhere in the program. To use a ``DataCatalog``, you need to instantiate it with a dictionary of data sets. Then it will act as a single point of reference for your calls, -relaying load and save functions to the underlying data sets. +relaying load and save functions to the underlying datasets. """ from __future__ import annotations @@ -35,10 +35,10 @@ def _sub_nonword_chars(dataset_name: str) -> str: - """Replace non-word characters in data set names since Kedro 0.16.2. + """Replace non-word characters in dataset names since Kedro 0.16.2. Args: - dataset_name: The data set name registered in the data catalog. + dataset_name: The dataset name registered in the data catalog. Returns: The name used in `DataCatalog.datasets`. @@ -102,9 +102,9 @@ class DataCatalog: """``DataCatalog`` stores instances of ``AbstractDataset`` implementations to provide ``load`` and ``save`` capabilities from anywhere in the program. To use a ``DataCatalog``, you need to instantiate it with - a dictionary of data sets. 
Then it will act as a single point of reference + a dictionary of datasets. Then it will act as a single point of reference for your calls, relaying load and save functions - to the underlying data sets. + to the underlying datasets. """ def __init__( # noqa: PLR0913 @@ -120,15 +120,15 @@ def __init__( # noqa: PLR0913 """``DataCatalog`` stores instances of ``AbstractDataset`` implementations to provide ``load`` and ``save`` capabilities from anywhere in the program. To use a ``DataCatalog``, you need to - instantiate it with a dictionary of data sets. Then it will act as a + instantiate it with a dictionary of datasets. Then it will act as a single point of reference for your calls, relaying load and save - functions to the underlying data sets. + functions to the underlying datasets. Args: - datasets: A dictionary of data set names and data set instances. + datasets: A dictionary of dataset names and dataset instances. feed_dict: A feed dict with data to be added in memory. - dataset_patterns: A dictionary of data set factory patterns - and corresponding data set configuration. When fetched from catalog configuration + dataset_patterns: A dictionary of dataset factory patterns + and corresponding dataset configuration. When fetched from catalog configuration these patterns will be sorted by: 1. Decreasing specificity (number of characters outside the curly brackets) 2. Decreasing number of placeholders (number of curly bracket pairs) @@ -137,10 +137,10 @@ def __init__( # noqa: PLR0913 pattern provided through the runners if it comes before "default" in the alphabet. Such an overwriting pattern will emit a warning. The `"{default}"` name will not emit a warning. - load_versions: A mapping between data set names and versions - to load. Has no effect on data sets without enabled versioning. + load_versions: A mapping between dataset names and versions + to load. Has no effect on datasets without enabled versioning. save_version: Version string to be used for ``save`` operations - by all data sets with enabled versioning. It must: a) be a + by all datasets with enabled versioning. It must: a) be a case-insensitive string that conforms with operating system filename limitations, b) always return the latest version when sorted in lexicographical order. @@ -216,28 +216,28 @@ def from_config( ``DataCatalog`` with configuration parsed from configuration files. Args: - catalog: A dictionary whose keys are the data set names and + catalog: A dictionary whose keys are the dataset names and the values are dictionaries with the constructor arguments - for classes implementing ``AbstractDataset``. The data set + for classes implementing ``AbstractDataset``. The dataset class to be loaded is specified with the key ``type`` and their - fully qualified class name. All ``kedro.io`` data set can be + fully qualified class name. All ``kedro.io`` dataset can be specified by their class name only, i.e. their module name can be omitted. credentials: A dictionary containing credentials for different - data sets. Use the ``credentials`` key in a ``AbstractDataset`` + datasets. Use the ``credentials`` key in a ``AbstractDataset`` to refer to the appropriate credentials as shown in the example below. load_versions: A mapping between dataset names and versions - to load. Has no effect on data sets without enabled versioning. + to load. Has no effect on datasets without enabled versioning. save_version: Version string to be used for ``save`` operations - by all data sets with enabled versioning. 
It must: a) be a + by all datasets with enabled versioning. It must: a) be a case-insensitive string that conforms with operating system filename limitations, b) always return the latest version when sorted in lexicographical order. Returns: An instantiated ``DataCatalog`` containing all specified - data sets, created and ready to use. + datasets, created and ready to use. Raises: DatasetError: When the method fails to create any of the data @@ -356,10 +356,10 @@ def _get_dataset( return dataset def load(self, name: str, version: str | None = None) -> Any: - """Loads a registered data set. + """Loads a registered dataset. Args: - name: A data set to be loaded. + name: A dataset to be loaded. version: Optional argument for concrete data version to be loaded. Works only with versioned datasets. @@ -367,7 +367,7 @@ def load(self, name: str, version: str | None = None) -> Any: The loaded data as configured. Raises: - DatasetNotFoundError: When a data set with the given name + DatasetNotFoundError: When a dataset with the given name has not yet been registered. Example: @@ -398,15 +398,15 @@ def load(self, name: str, version: str | None = None) -> Any: return result def save(self, name: str, data: Any) -> None: - """Save data to a registered data set. + """Save data to a registered dataset. Args: - name: A data set to be saved to. + name: A dataset to be saved to. data: A data object to be saved as configured in the registered - data set. + dataset. Raises: - DatasetNotFoundError: When a data set with the given name + DatasetNotFoundError: When a dataset with the given name has not yet been registered. Example: @@ -438,15 +438,15 @@ def save(self, name: str, data: Any) -> None: dataset.save(data) def exists(self, name: str) -> bool: - """Checks whether registered data set exists by calling its `exists()` + """Checks whether registered dataset exists by calling its `exists()` method. Raises a warning and returns False if `exists()` is not implemented. Args: - name: A data set to be checked. + name: A dataset to be checked. Returns: - Whether the data set output exists. + Whether the dataset output exists. """ try: @@ -456,13 +456,13 @@ def exists(self, name: str) -> bool: return dataset.exists() def release(self, name: str) -> None: - """Release any cached data associated with a data set + """Release any cached data associated with a dataset Args: - name: A data set to be checked. + name: A dataset to be checked. Raises: - DatasetNotFoundError: When a data set with the given name + DatasetNotFoundError: When a dataset with the given name has not yet been registered. """ dataset = self._get_dataset(name) @@ -477,15 +477,15 @@ def add( """Adds a new ``AbstractDataset`` object to the ``DataCatalog``. Args: - dataset_name: A unique data set name which has not been + dataset_name: A unique dataset name which has not been registered yet. - dataset: A data set object to be associated with the given data + dataset: A dataset object to be associated with the given data set name. replace: Specifies whether to replace an existing dataset with the same name is allowed. Raises: - DatasetAlreadyExistsError: When a data set with the same name + DatasetAlreadyExistsError: When a dataset with the same name has already been registered. Example: @@ -514,7 +514,7 @@ def add_all( datasets: dict[str, AbstractDataset], replace: bool = False, ) -> None: - """Adds a group of new data sets to the ``DataCatalog``. + """Adds a group of new datasets to the ``DataCatalog``. 
Args: datasets: A dictionary of dataset names and dataset @@ -523,7 +523,7 @@ def add_all( with the same name is allowed. Raises: - DatasetAlreadyExistsError: When a data set with the same name + DatasetAlreadyExistsError: When a dataset with the same name has already been registered. Example: @@ -597,10 +597,10 @@ def list(self, regex_search: str | None = None) -> list[str]: Args: regex_search: An optional regular expression which can be provided - to limit the data sets returned by a particular pattern. + to limit the datasets returned by a particular pattern. Returns: A list of dataset names available which match the - `regex_search` criteria (if provided). All data set names are returned + `regex_search` criteria (if provided). All dataset names are returned by default. Raises: @@ -610,11 +610,11 @@ def list(self, regex_search: str | None = None) -> list[str]: :: >>> catalog = DataCatalog() - >>> # get data sets where the substring 'raw' is present + >>> # get datasets where the substring 'raw' is present >>> raw_data = catalog.list(regex_search='raw') - >>> # get data sets which start with 'prm' or 'feat' + >>> # get datasets which start with 'prm' or 'feat' >>> feat_eng_data = catalog.list(regex_search='^(prm|feat)') - >>> # get data sets which end with 'time_series' + >>> # get datasets which end with 'time_series' >>> models = catalog.list(regex_search='.+time_series$') """ @@ -622,7 +622,7 @@ def list(self, regex_search: str | None = None) -> list[str]: return list(self._datasets.keys()) if not regex_search.strip(): - self._logger.warning("The empty string will not match any data sets") + self._logger.warning("The empty string will not match any datasets") return [] try: diff --git a/kedro/io/lambda_dataset.py b/kedro/io/lambda_dataset.py index 043bb67737..d120f74ed2 100644 --- a/kedro/io/lambda_dataset.py +++ b/kedro/io/lambda_dataset.py @@ -11,11 +11,11 @@ class LambdaDataset(AbstractDataset): - """``LambdaDataset`` loads and saves data to a data set. + """``LambdaDataset`` loads and saves data to a dataset. It relies on delegating to specific implementation such as csv, sql, etc. ``LambdaDataset`` class captures Exceptions while performing operations on - composed ``Dataset`` implementations. The composed data set is + composed ``Dataset`` implementations. The composed dataset is responsible for providing information on how to resolve the issue when possible. This information should be available through str(error). @@ -53,7 +53,7 @@ def _to_str(func: Any) -> str | None: def _load(self) -> Any: if not self.__load: raise DatasetError( - "Cannot load data set. No 'load' function " + "Cannot load dataset. No 'load' function " "provided when LambdaDataset was created." ) return self.__load() @@ -61,7 +61,7 @@ def _load(self) -> Any: def _save(self, data: Any) -> None: if not self.__save: raise DatasetError( - "Cannot save to data set. No 'save' function " + "Cannot save to dataset. No 'save' function " "provided when LambdaDataset was created." ) self.__save(data) @@ -86,11 +86,11 @@ def __init__( metadata: dict[str, Any] | None = None, ): """Creates a new instance of ``LambdaDataset`` with references to the - required input/output data set methods. + required input/output dataset methods. Args: - load: Method to load data from a data set. - save: Method to save data to a data set. + load: Method to load data from a dataset. + save: Method to save data to a dataset. exists: Method to check whether output data already exists. release: Method to release any cached information. 
metadata: Any arbitrary metadata. diff --git a/kedro/io/memory_dataset.py b/kedro/io/memory_dataset.py index 1b4bb8a371..1e8eef8452 100644 --- a/kedro/io/memory_dataset.py +++ b/kedro/io/memory_dataset.py @@ -1,4 +1,4 @@ -"""``MemoryDataset`` is a data set implementation which handles in-memory data.""" +"""``MemoryDataset`` is a dataset implementation which handles in-memory data.""" from __future__ import annotations diff --git a/kedro/pipeline/node.py b/kedro/pipeline/node.py index b382bee8cf..a303546279 100644 --- a/kedro/pipeline/node.py +++ b/kedro/pipeline/node.py @@ -59,7 +59,7 @@ def __init__( # noqa: PLR0913 contain only letters, digits, hyphens, underscores and/or fullstops. confirms: Optional name or the list of the names of the datasets that should be confirmed. This will result in calling - ``confirm()`` method of the corresponding data set instance. + ``confirm()`` method of the corresponding dataset instance. Specified dataset names do not necessarily need to be present in the node ``inputs`` or ``outputs``. namespace: Optional node namespace. @@ -601,7 +601,7 @@ def node( # noqa: PLR0913 tags: Optional set of tags to be applied to the node. confirms: Optional name or the list of the names of the datasets that should be confirmed. This will result in calling ``confirm()`` - method of the corresponding data set instance. Specified dataset + method of the corresponding dataset instance. Specified dataset names do not necessarily need to be present in the node ``inputs`` or ``outputs``. namespace: Optional node namespace. diff --git a/kedro/pipeline/pipeline.py b/kedro/pipeline/pipeline.py index ab7365a154..749eea8548 100644 --- a/kedro/pipeline/pipeline.py +++ b/kedro/pipeline/pipeline.py @@ -93,8 +93,8 @@ def __init__( >>> from kedro.pipeline import node >>> >>> # In the following scenario first_ds and second_ds - >>> # are data sets provided by io. Pipeline will pass these - >>> # data sets to first_node function and provides the result + >>> # are datasets provided by io. Pipeline will pass these + >>> # datasets to first_node function and provides the result >>> # to the second_node as input. >>> >>> def first_node(first_ds, second_ds): @@ -247,11 +247,11 @@ def outputs(self) -> set[str]: return self._remove_intermediates(self.all_outputs()) def datasets(self) -> set[str]: - """The names of all data sets used by the ``Pipeline``, + """The names of all datasets used by the ``Pipeline``, including inputs and outputs. Returns: - The set of all pipeline data sets. + The set of all pipeline datasets. """ return self.all_outputs() | self.all_inputs() diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 7626bf8679..4bbcdc9ec5 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -43,7 +43,7 @@ class ParallelRunnerManager(SyncManager): """``ParallelRunnerManager`` is used to create shared ``MemoryDataset`` - objects as default data sets in a pipeline. + objects as default datasets in a pipeline. """ @@ -171,8 +171,8 @@ def _validate_nodes(cls, nodes: Iterable[Node]) -> None: @classmethod def _validate_catalog(cls, catalog: CatalogProtocol, pipeline: Pipeline) -> None: - """Ensure that all data sets are serialisable and that we do not have - any non proxied memory data sets being used as outputs as their content + """Ensure that all datasets are serialisable and that we do not have + any non proxied memory datasets being used as outputs as their content will not be synchronized across threads. 
""" @@ -190,9 +190,9 @@ def _validate_catalog(cls, catalog: CatalogProtocol, pipeline: Pipeline) -> None if unserialisable: raise AttributeError( - f"The following data sets cannot be used with multiprocessing: " + f"The following datasets cannot be used with multiprocessing: " f"{sorted(unserialisable)}\nIn order to utilize multiprocessing you " - f"need to make sure all data sets are serialisable, i.e. data sets " + f"need to make sure all datasets are serialisable, i.e. datasets " f"should not make use of lambda functions, nested functions, closures " f"etc.\nIf you are using custom decorators ensure they are correctly " f"decorated using functools.wraps()." @@ -209,7 +209,7 @@ def _validate_catalog(cls, catalog: CatalogProtocol, pipeline: Pipeline) -> None if memory_datasets: raise AttributeError( - f"The following data sets are memory data sets: " + f"The following datasets are memory datasets: " f"{sorted(memory_datasets)}\n" f"ParallelRunner does not support output to externally created " f"MemoryDatasets" diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index c888e737cf..57a7aef17f 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -81,7 +81,7 @@ def _run( self._suggest_resume_scenario(pipeline, done_nodes, catalog) raise - # decrement load counts and release any data sets we've finished with + # decrement load counts and release any datasets we've finished with for dataset in node.inputs: load_counts[dataset] -= 1 if load_counts[dataset] < 1 and dataset not in pipeline.inputs(): diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml index be73adae2a..789fc96fd1 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml @@ -1,4 +1,4 @@ -# Here you can define all your data sets by using simple YAML syntax. +# Here you can define all your datasets by using simple YAML syntax. # # Documentation for this file format can be found in "The Data Catalog" # Link: https://docs.kedro.org/en/stable/data/data_catalog.html diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/local/credentials.yml b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/local/credentials.yml index b2db154dbc..b9a9cea667 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/local/credentials.yml +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/local/credentials.yml @@ -1,4 +1,4 @@ -# Here you can define credentials for different data sets and environment. +# Here you can define credentials for different datasets and environment. 
# # # Example: diff --git a/tests/io/test_core.py b/tests/io/test_core.py index 4128ad6da2..286a7142fd 100644 --- a/tests/io/test_core.py +++ b/tests/io/test_core.py @@ -359,7 +359,7 @@ def test_version_str_repr(self, load_version, save_version): def test_save_and_load(self, my_versioned_dataset, dummy_data): """Test that saved and reloaded data matches the original one for - the versioned data set.""" + the versioned dataset.""" my_versioned_dataset.save(dummy_data) reloaded = my_versioned_dataset.load() assert dummy_data == reloaded @@ -398,14 +398,14 @@ def test_exists_general_exception(self): my_other_versioned_dataset.exists() def test_exists(self, my_versioned_dataset, dummy_data): - """Test `exists` method invocation for versioned data set.""" + """Test `exists` method invocation for versioned dataset.""" assert not my_versioned_dataset.exists() my_versioned_dataset.save(dummy_data) assert my_versioned_dataset.exists() shutil.rmtree(my_versioned_dataset._filepath) def test_prevent_overwrite(self, my_versioned_dataset, dummy_data): - """Check the error when attempting to override the data set if the + """Check the error when attempting to override the dataset if the corresponding json file for a given save version already exists.""" my_versioned_dataset.save(dummy_data) pattern = ( @@ -550,7 +550,7 @@ def test_saving_none(self, my_legacy_dataset): my_legacy_dataset.save(None) def test_saving_invalid_data(self, my_legacy_dataset, dummy_data): - pattern = r"Failed while saving data to data set" + pattern = r"Failed while saving data to dataset" with pytest.raises(DatasetError, match=pattern): my_legacy_dataset.save(pd.DataFrame()) diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index 54cbdf340d..bbaf6e8c6b 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -168,14 +168,14 @@ def data_catalog_from_config(correct_config): class TestDataCatalog: def test_save_and_load(self, data_catalog, dummy_dataframe): - """Test saving and reloading the data set""" + """Test saving and reloading the dataset""" data_catalog.save("test", dummy_dataframe) reloaded_df = data_catalog.load("test") assert_frame_equal(reloaded_df, dummy_dataframe) def test_add_save_and_load(self, dataset, dummy_dataframe): - """Test adding and then saving and reloading the data set""" + """Test adding and then saving and reloading the dataset""" catalog = DataCatalog(datasets={}) catalog.add("test", dataset) catalog.save("test", dummy_dataframe) @@ -185,7 +185,7 @@ def test_add_save_and_load(self, dataset, dummy_dataframe): def test_add_all_save_and_load(self, dataset, dummy_dataframe): """Test adding all to the data catalog and then saving and reloading - the data set""" + the dataset""" catalog = DataCatalog(datasets={}) catalog.add_all({"test": dataset}) catalog.save("test", dummy_dataframe) @@ -194,34 +194,34 @@ def test_add_all_save_and_load(self, dataset, dummy_dataframe): assert_frame_equal(reloaded_df, dummy_dataframe) def test_load_error(self, data_catalog): - """Check the error when attempting to load a data set + """Check the error when attempting to load a dataset from nonexistent source""" - pattern = r"Failed while loading data from data set CSVDataset" + pattern = r"Failed while loading data from dataset CSVDataset" with pytest.raises(DatasetError, match=pattern): data_catalog.load("test") def test_add_dataset_twice(self, data_catalog, dataset): - """Check the error when attempting to add the data set twice""" + """Check the error when attempting to add 
the dataset twice""" pattern = r"Dataset 'test' has already been registered" with pytest.raises(DatasetAlreadyExistsError, match=pattern): data_catalog.add("test", dataset) def test_load_from_unregistered(self): - """Check the error when attempting to load unregistered data set""" + """Check the error when attempting to load unregistered dataset""" catalog = DataCatalog(datasets={}) pattern = r"Dataset 'test' not found in the catalog" with pytest.raises(DatasetNotFoundError, match=pattern): catalog.load("test") def test_save_to_unregistered(self, dummy_dataframe): - """Check the error when attempting to save to unregistered data set""" + """Check the error when attempting to save to unregistered dataset""" catalog = DataCatalog(datasets={}) pattern = r"Dataset 'test' not found in the catalog" with pytest.raises(DatasetNotFoundError, match=pattern): catalog.save("test", dummy_dataframe) def test_feed_dict(self, memory_catalog, conflicting_feed_dict): - """Test feed dict overriding some of the data sets""" + """Test feed dict overriding some of the datasets""" memory_catalog.add_feed_dict(conflicting_feed_dict, replace=True) assert "data" in memory_catalog.load("ds1") assert memory_catalog.load("ds1")["data"] == 0 @@ -235,7 +235,7 @@ def test_exists(self, data_catalog, dummy_dataframe): assert data_catalog.exists("test") def test_exists_not_implemented(self, caplog): - """Test calling `exists` on the data set, which didn't implement it""" + """Test calling `exists` on the dataset, which didn't implement it""" catalog = DataCatalog(datasets={"test": LambdaDataset(None, None)}) result = catalog.exists("test") @@ -248,18 +248,18 @@ def test_exists_not_implemented(self, caplog): assert result is False def test_exists_invalid(self, data_catalog): - """Check the error when calling `exists` on invalid data set""" + """Check the error when calling `exists` on invalid dataset""" assert not data_catalog.exists("wrong_key") def test_release_unregistered(self, data_catalog): - """Check the error when calling `release` on unregistered data set""" + """Check the error when calling `release` on unregistered dataset""" pattern = r"Dataset \'wrong_key\' not found in the catalog" with pytest.raises(DatasetNotFoundError, match=pattern) as e: data_catalog.release("wrong_key") assert "did you mean" not in str(e.value) def test_release_unregistered_typo(self, data_catalog): - """Check the error when calling `release` on mistyped data set""" + """Check the error when calling `release` on mistyped dataset""" pattern = ( "Dataset 'text' not found in the catalog" " - did you mean one of these instead: test" @@ -268,7 +268,7 @@ def test_release_unregistered_typo(self, data_catalog): data_catalog.release("text") def test_multi_catalog_list(self, multi_catalog): - """Test data catalog which contains multiple data sets""" + """Test data catalog which contains multiple datasets""" entries = multi_catalog.list() assert "abc" in entries assert "xyz" in entries @@ -284,7 +284,7 @@ def test_multi_catalog_list(self, multi_catalog): ], ) def test_multi_catalog_list_regex(self, multi_catalog, pattern, expected): - """Test that regex patterns filter data sets accordingly""" + """Test that regex patterns filter datasets accordingly""" assert multi_catalog.list(regex_search=pattern) == expected def test_multi_catalog_list_bad_regex(self, multi_catalog): @@ -404,7 +404,7 @@ def test_from_correct_config(self, data_catalog_from_config, dummy_dataframe): assert_frame_equal(reloaded_df, dummy_dataframe) def 
test_config_missing_type(self, correct_config): - """Check the error if type attribute is missing for some data set(s) + """Check the error if type attribute is missing for some dataset(s) in the config""" del correct_config["catalog"]["boats"]["type"] pattern = ( @@ -468,13 +468,13 @@ def test_config_invalid_dataset(self, correct_config): pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" "Dataset type 'kedro.io.data_catalog.DataCatalog' is invalid: " - "all data set types must extend 'AbstractDataset'" + "all dataset types must extend 'AbstractDataset'" ) with pytest.raises(DatasetError, match=re.escape(pattern)): DataCatalog.from_config(**correct_config) def test_config_invalid_arguments(self, correct_config): - """Check the error if the data set config contains invalid arguments""" + """Check the error if the dataset config contains invalid arguments""" correct_config["catalog"]["boats"]["save_and_load_args"] = False pattern = ( r"Dataset 'boats' must only contain arguments valid for " @@ -504,7 +504,7 @@ def test_missing_credentials(self, correct_config): DataCatalog.from_config(**correct_config) def test_link_credentials(self, correct_config, mocker): - """Test credentials being linked to the relevant data set""" + """Test credentials being linked to the relevant dataset""" mock_client = mocker.patch("kedro_datasets.pandas.csv_dataset.fsspec") config = deepcopy(correct_config) del config["catalog"]["boats"] @@ -560,7 +560,7 @@ def test_idempotent_catalog(self, correct_config): assert catalog def test_error_dataset_init(self, bad_config): - """Check the error when trying to instantiate erroneous data set""" + """Check the error when trying to instantiate erroneous dataset""" pattern = r"Failed to instantiate dataset \'bad\' of type '.*BadDataset'" with pytest.raises(DatasetError, match=pattern): DataCatalog.from_config(bad_config, None) @@ -606,7 +606,7 @@ def test_bad_confirm(self, correct_config, dataset_name, pattern): class TestDataCatalogVersioned: def test_from_correct_config_versioned(self, correct_config, dummy_dataframe): - """Test load and save of versioned data sets from config""" + """Test load and save of versioned datasets from config""" correct_config["catalog"]["boats"]["versioned"] = True # Decompose `generate_timestamp` to keep `current_ts` reference. 
@@ -649,13 +649,13 @@ def test_from_correct_config_versioned_warn( self, caplog, correct_config, versioned ): """Check the warning if `version` attribute was added - to the data set config""" + to the dataset config""" correct_config["catalog"]["boats"]["versioned"] = versioned correct_config["catalog"]["boats"]["version"] = True DataCatalog.from_config(**correct_config) log_record = caplog.records[0] expected_log_message = ( - "'version' attribute removed from data set configuration since it " + "'version' attribute removed from dataset configuration since it " "is a reserved word and cannot be directly specified" ) assert log_record.levelname == "WARNING" @@ -672,7 +672,7 @@ def test_from_correct_config_load_versions_warn(self, correct_config): def test_compare_tracking_and_other_dataset_versioned( self, correct_config_with_tracking_ds, dummy_dataframe ): - """Test saving of tracking data sets from config results in the same + """Test saving of tracking datasets from config results in the same save version as other versioned datasets.""" catalog = DataCatalog.from_config(**correct_config_with_tracking_ds) @@ -694,7 +694,7 @@ def test_compare_tracking_and_other_dataset_versioned( assert tracking_timestamp == csv_timestamp def test_load_version(self, correct_config, dummy_dataframe, mocker): - """Test load versioned data sets from config""" + """Test load versioned datasets from config""" new_dataframe = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]}) correct_config["catalog"]["boats"]["versioned"] = True mocker.patch( @@ -938,7 +938,7 @@ def test_unmatched_key_error_when_parsing_config( def test_factory_config_versioned( self, config_with_dataset_factories, filepath, dummy_dataframe ): - """Test load and save of versioned data sets from config""" + """Test load and save of versioned datasets from config""" config_with_dataset_factories["catalog"]["{brand}_cars"]["versioned"] = True config_with_dataset_factories["catalog"]["{brand}_cars"]["filepath"] = filepath diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 5e0c463e7d..efa993bb0e 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -74,7 +74,7 @@ def test_add_save_and_load(self, dataset, dummy_dataframe): def test_load_error(self, data_catalog): """Check the error when attempting to load a dataset from nonexistent source""" - pattern = r"Failed while loading data from data set CSVDataset" + pattern = r"Failed while loading data from dataset CSVDataset" with pytest.raises(DatasetError, match=pattern): data_catalog.load("test") @@ -352,7 +352,7 @@ def test_config_invalid_dataset(self, correct_config): pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" "Dataset type 'kedro.io.kedro_data_catalog.KedroDataCatalog' is invalid: " - "all data set types must extend 'AbstractDataset'" + "all dataset types must extend 'AbstractDataset'" ) with pytest.raises(DatasetError, match=re.escape(pattern)): KedroDataCatalog.from_config(**correct_config) @@ -553,7 +553,7 @@ def test_from_correct_config_versioned_warn( KedroDataCatalog.from_config(**correct_config) log_record = caplog.records[0] expected_log_message = ( - "'version' attribute removed from data set configuration since it " + "'version' attribute removed from dataset configuration since it " "is a reserved word and cannot be directly specified" ) assert log_record.levelname == "WARNING" diff --git a/tests/io/test_lambda_dataset.py b/tests/io/test_lambda_dataset.py index 
a3072af451..eac9709d04 100644 --- a/tests/io/test_lambda_dataset.py +++ b/tests/io/test_lambda_dataset.py @@ -104,7 +104,7 @@ def internal_load(): def test_load_undefined(self): """Check the error if `LambdaDataset.__load` is None""" - with pytest.raises(DatasetError, match="Cannot load data set"): + with pytest.raises(DatasetError, match="Cannot load dataset"): LambdaDataset(None, None).load() def test_load_not_callable(self): @@ -128,7 +128,7 @@ def test_save_raises_error(self, mocked_save, mocked_dataset): mocked_save.side_effect = FileExistsError(error_message) pattern = ( - r"Failed while saving data to data set LambdaDataset\(.+\)\.\n" + r"Failed while saving data to dataset LambdaDataset\(.+\)\.\n" + error_message ) with pytest.raises(DatasetError, match=pattern): @@ -137,7 +137,7 @@ def test_save_undefined(self): """Check the error if `LambdaDataset.__save` is None""" - with pytest.raises(DatasetError, match="Cannot save to data set"): + with pytest.raises(DatasetError, match="Cannot save to dataset"): LambdaDataset(None, None).save(42) def test_save_none(self, mocked_save, mocked_dataset): diff --git a/tests/pipeline/test_pipeline_from_missing.py b/tests/pipeline/test_pipeline_from_missing.py index f399e70c06..4e40638d83 100644 --- a/tests/pipeline/test_pipeline_from_missing.py +++ b/tests/pipeline/test_pipeline_from_missing.py @@ -210,7 +210,7 @@ def test_partial_propagation(self, branched_pipeline, hook_manager): assert _pipeline_contains(new_pipeline, ["split", "right_out"]) def test_partial_non_existent_propagation(self, branched_pipeline, hook_manager): - """A non existent data set whose node has one unregistered input + """A non-existent dataset whose node has one unregistered input and one existent input should be recalculated correctly. """ catalog = _make_catalog(existent=["A", "C", "E", "F"], non_existent=["D"])