diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/404.html b/404.html new file mode 100644 index 0000000..ebc5ece --- /dev/null +++ b/404.html @@ -0,0 +1,665 @@ + + + +
+ + + + + + + + + + + +Higher-order data modules to run unsupervised domain adaption experiments.
+ + + +AdaptionDataset
+
+
+
+ Bases: Dataset
A torch dataset for unsupervised domain adaption. The +dataset takes a labeled source and one or multiple unlabeled target datasets and combines them.
+For each label/features pair from the source dataset, a random sample of features +is drawn from each target dataset. The datasets are supposed to provide a sample +as a tuple of tensors. The target datasets' labels are assumed to be the last +element of the tuple and are omitted. The dataset's length is determined by the +source dataset. This setup can be used to train with common unsupervised domain +adaption methods like DAN, DANN or JAN.
+ +Examples:
+>>> import torch
+>>> import rul_datasets
+>>> source = torch.utils.data.TensorDataset(torch.randn(10), torch.randn(10))
+>>> target = torch.utils.data.TensorDataset(torch.randn(10), torch.randn(10))
+>>> dataset = rul_datasets.adaption.AdaptionDataset(source, target)
+>>> source_features, source_label, target_features = dataset[0]
+
__init__(labeled, *unlabeled, deterministic=False)
+
+Create a new adaption data set from a labeled source and one or multiple +unlabeled target datasets.
+By default, a random sample is drawn from each target dataset when a source
+sample is accessed. This is the recommended setting for training. To
+deactivate this behavior and fix the pairing of source and target samples,
+set deterministic
to True
. This is the recommended setting for evaluation.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
labeled |
+
+ Dataset
+ |
+ The dataset from the labeled domain. |
+ + required + | +
*unlabeled |
+
+ Dataset
+ |
+ The dataset(s) from the unlabeled domain(s). |
+
+ ()
+ |
+
deterministic |
+
+ bool
+ |
+ Return the same target sample for each source sample. |
+
+ False
+ |
+
DomainAdaptionDataModule
+
+
+
+ Bases: pl.LightningDataModule
A higher-order data module used for +unsupervised domain adaption of a labeled source to an unlabeled target domain. +The training data of both domains is wrapped in an AdaptionDataset which provides a random sample of the +target domain with each sample of the source domain. It provides the validation and +test splits of both domains, and optionally a paired dataset for both.
+ +Examples:
+>>> import rul_datasets
+>>> fd1 = rul_datasets.CmapssReader(fd=1, window_size=20)
+>>> fd2 = rul_datasets.CmapssReader(fd=2, percent_broken=0.8)
+>>> source = rul_datasets.RulDataModule(fd1, 32)
+>>> target = rul_datasets.RulDataModule(fd2, 32)
+>>> dm = rul_datasets.DomainAdaptionDataModule(source, target)
+>>> dm.prepare_data()
+>>> dm.setup()
+>>> train_1_2 = dm.train_dataloader()
+>>> val_1, val_2 = dm.val_dataloader()
+>>> test_1, test_2 = dm.test_dataloader()
+
__init__(source, target, paired_val=False, inductive=False)
+
+Create a new domain adaption data module from a source and target +RulDataModule. The source domain is considered +labeled and the target domain unlabeled.
+The source and target data modules are checked for compatibility (see
+RulDataModule). These
+checks include that the fd
differs between them, as they come from the same
+domain otherwise.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
source |
+
+ RulDataModule
+ |
+ The data module of the labeled source domain. |
+ + required + | +
target |
+
+ RulDataModule
+ |
+ The data module of the unlabeled target domain. |
+ + required + | +
paired_val |
+
+ bool
+ |
+ Whether to include paired data in validation. |
+
+ False
+ |
+
inductive |
+
+ bool
+ |
+ Whether to use the target test set for training. |
+
+ False
+ |
+
prepare_data(*args, **kwargs)
+
+Download and pre-process the underlying data.
+This calls the prepare_data
function for source and target domain. All
+previously completed preparation steps are skipped. It is called
+automatically by pytorch_lightning
and executed on the first GPU in
+distributed mode.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Passed down to each data module's |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Passed down to each data module's |
+
+ {}
+ |
+
setup(stage=None)
+
+Load source and target domain into memory.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
stage |
+
+ Optional[str]
+ |
+ Passed down to each data module's |
+
+ None
+ |
+
test_dataloader(*args, **kwargs)
+
+Create a data loader of the source and target test data.
+The data loaders are the return values of source.test_dataloader
+and target.test_dataloader
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ List[DataLoader]
+ |
+ The source and target test data loader. |
+
train_dataloader(*args, **kwargs)
+
+Create a data loader of an AdaptionDataset using source and target domain.
+The data loader is configured to shuffle the data. The pin_memory
option is
+activated to achieve maximum transfer speed to the GPU.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataLoader
+ |
+ The training data loader |
+
val_dataloader(*args, **kwargs)
+
+Create a data loader of the source, target and paired validation data.
+By default, two data loaders are returned, which correspond to the source
+and the target validation data loader. An optional third data loader of a
+PairedRulDataset using both source and
+target is returned if paired_val
was set to True
in the constructor.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ List[DataLoader]
+ |
+ The source, target and an optional paired validation data loader. |
+
LatentAlignDataModule
+
+
+
+ Bases: DomainAdaptionDataModule
A higher-order data module based on +DomainAdaptionDataModule.
+It is specifically made to work with the latent space alignment approach by Zhang +et al. The training data of both domains is wrapped in an AdaptionDataset which splits the data into healthy and +degrading. For each sample of degrading source data, a random sample of degrading +target data and a healthy sample of either source or target data are drawn. The +number of steps in degradation is supplied for each degrading sample, as well. +The data module also provides the validation and test splits of both domains, and +optionally a paired dataset for both.
+ +Examples:
+>>> import rul_datasets
+>>> fd1 = rul_datasets.CmapssReader(fd=1, window_size=20)
+>>> fd2 = rul_datasets.CmapssReader(fd=2, percent_broken=0.8)
+>>> src = rul_datasets.RulDataModule(fd1, 32)
+>>> trg = rul_datasets.RulDataModule(fd2, 32)
+>>> dm = rul_datasets.LatentAlignDataModule(src, trg, split_by_max_rul=125)
+>>> dm.prepare_data()
+>>> dm.setup()
+>>> train_1_2 = dm.train_dataloader()
+>>> val_1, val_2 = dm.val_dataloader()
+>>> test_1, test_2 = dm.test_dataloader()
+
__init__(source, target, paired_val=False, inductive=False, split_by_max_rul=False, split_by_steps=None)
+
+Create a new latent align data module from a source and target +RulDataModule. The source domain is considered +labeled and the target domain unlabeled.
+The source and target data modules are checked for compatibility (see
+RulDataModule). These
+checks include that the fd
differs between them, as they come from the same
+domain otherwise.
The healthy and degrading data can be split by either maximum RUL value or +the number of time steps. See split_healthy for more information.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
source |
+
+ RulDataModule
+ |
+ The data module of the labeled source domain. |
+ + required + | +
target |
+
+ RulDataModule
+ |
+ The data module of the unlabeled target domain. |
+ + required + | +
paired_val |
+
+ bool
+ |
+ Whether to include paired data in validation. |
+
+ False
+ |
+
split_by_max_rul |
+
+ bool
+ |
+ Whether to split healthy and degrading by max RUL value. |
+
+ False
+ |
+
split_by_steps |
+
+ Optional[int]
+ |
+ Split the healthy and degrading data after this number of + time steps. |
+
+ None
+ |
+
split_healthy(features, targets, by_max_rul=False, by_steps=None)
+
+Split the feature and target time series into healthy and degrading parts and +return a dataset of each.
+If by_max_rul
is set to True
the time steps with the maximum RUL value in
+each time series are considered healthy. This option is intended for labeled data
+with piece-wise linear RUL functions. If by_steps
is set to an integer,
+the first by_steps
time steps of each series are considered healthy. This
+option is intended for unlabeled data or data with a linear RUL function.
One option has to be set and both are mutually exclusive.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
features |
+
+ Union[List[np.ndarray], List[torch.Tensor]]
+ |
+ List of feature time series. |
+ + required + | +
targets |
+
+ Union[List[np.ndarray], List[torch.Tensor]]
+ |
+ List of target time series. |
+ + required + | +
by_max_rul |
+
+ bool
+ |
+ Whether to split healthy and degrading data by max RUL value. |
+
+ False
+ |
+
by_steps |
+
+ Optional[int]
+ |
+ Split healthy and degrading data after this number of time steps. |
+
+ None
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
healthy |
+ TensorDataset
+ |
+ Dataset of healthy data. |
+
degrading |
+ TensorDataset
+ |
+ Dataset of degrading data. |
+
Higher-order data modules to establish a baseline for transfer learning and domain +adaption experiments.
+ + + +BaselineDataModule
+
+
+
+ Bases: pl.LightningDataModule
A higher-order data module that +takes a RulDataModule. It provides the +training and validation splits of the sub-dataset selected in the underlying data +module but provides the test splits of all available subsets of the dataset. This +makes it easy to evaluate the generalization of a supervised model on all +sub-datasets.
+ +Examples:
+>>> import rul_datasets
+>>> cmapss = rul_datasets.reader.CmapssReader(fd=1)
+>>> dm = rul_datasets.RulDataModule(cmapss, batch_size=32)
+>>> baseline_dm = rul_datasets.BaselineDataModule(dm)
+>>> baseline_dm.prepare_data()
+>>> baseline_dm.setup()
+>>> train_fd1 = baseline_dm.train_dataloader()
+>>> val_fd1 = baseline_dm.val_dataloader()
+>>> test_fd1, test_fd2, test_fd3, test_fd4 = baseline_dm.test_dataloader()
+
__init__(data_module)
+
+Create a new baseline data module from a RulDataModule.
+It will provide a data loader of the underlying data module's training and +validation splits. Additionally, it provides a data loader of the test split +of all sub-datasets.
+The data module keeps the configuration made in the underlying data module.
+The same configuration is then passed on to create RulDataModules for all
+sub-datasets, beside percent_fail_runs
and percent_broken
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
data_module |
+
+ RulDataModule
+ |
+ the underlying RulDataModule |
+ + required + | +
prepare_data(*args, **kwargs)
+
+Download and pre-process the underlying data.
+This calls the prepare_data
function for all sub-datasets. All
+previously completed preparation steps are skipped. It is called
+automatically by pytorch_lightning
and executed on the first GPU in
+distributed mode.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Passed down to each data module's |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Passed down to each data module's |
+
+ {}
+ |
+
setup(stage=None)
+
+Load all splits as tensors into memory.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
stage |
+
+ Optional[str]
+ |
+ Passed down to each data module's |
+
+ None
+ |
+
test_dataloader(*args, **kwargs)
+
+Return data loaders for all sub-datasets.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Passed down to each data module. |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Passed down to each data module. |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ List[DataLoader]
+ |
+ The test dataloaders of all sub-datasets. |
+
train_dataloader(*args, **kwargs)
+
+val_dataloader(*args, **kwargs)
+
+Basic data modules for experiments involving only a single subset of any RUL +dataset.
+ + + +PairedRulDataset
+
+
+
+ Bases: IterableDataset
A dataset of sample pairs drawn from the same time series.
+ + + + + +RulDataModule
+
+
+
+ Bases: pl.LightningDataModule
A data module to provide windowed +time series features with RUL targets. It exposes the splits of the underlying +dataset for easy usage with PyTorch and PyTorch Lightning.
+The data module implements the hparams
property used by PyTorch Lightning to
+save hyperparameters to checkpoints. It retrieves the hyperparameters of its
+underlying reader and adds the batch size to them.
If you want to extract features from the windows, you can pass the
+feature_extractor
and window_size
arguments to the constructor. The
+feature_extractor
is a callable that takes a windowed time series as a numpy
+array with the shape [num_windows, window_size, num_features]
and returns
+another numpy array. Depending on window_size
, the expected output shapes for
+the feature_extractor
are:
window_size is None
: [num_new_windows, new_window_size, features]
window_size is not None
: [num_windows, features]
If window_size
is set, the extracted features are re-windowed.
Examples:
+Default
+>>> import rul_datasets
+>>> cmapss = rul_datasets.reader.CmapssReader(fd=1)
+>>> dm = rul_datasets.RulDataModule(cmapss, batch_size=32)
+
With Feature Extractor
+>>> import rul_datasets
+>>> import numpy as np
+>>> cmapss = rul_datasets.reader.CmapssReader(fd=1)
+>>> dm = rul_datasets.RulDataModule(
+... cmapss,
+... batch_size=32,
+... feature_extractor=lambda x: np.mean(x, axis=1),
+... window_size=10
+... )
+
Only Degraded Validation and Test Samples
+>>> import rul_datasets
+>>> cmapss = rul_datasets.reader.CmapssReader(fd=1)
+>>> dm = rul_datasets.RulDataModule(cmapss, 32, degraded_only=["val", "test"])
+
__init__(reader, batch_size, feature_extractor=None, window_size=None, degraded_only=None)
+
+Create a new RUL data module from a reader.
+This data module exposes a training, validation and test data loader for the
+underlying dataset. First, prepare_data
is called to download and
+pre-process the dataset. Afterward, setup
is called to load all
+splits into memory.
If a feature_extractor
is supplied, the data module extracts new features
+from each window of the time series. If window_size
is None
,
+it is assumed that the extracted features form new windows themselves. If
+window_size
is an int, it is assumed that the extracted features are
+single feature vectors and should be re-windowed. The expected output shapes
+for the feature_extractor
are:
window_size is None
: [num_new_windows, new_window_size, features]
window_size is not None
: [num_windows, features]
The expected input shape for the feature_extractor
is always
+[num_windows, window_size, features]
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
reader |
+
+ AbstractReader
+ |
+ The dataset reader for the desired dataset, e.g., CmapssReader. |
+ + required + | +
batch_size |
+
+ int
+ |
+ The size of the batches built by the data loaders. |
+ + required + | +
feature_extractor |
+
+ Optional[Callable]
+ |
+ A feature extractor that extracts feature vectors from + windows. |
+
+ None
+ |
+
window_size |
+
+ Optional[int]
+ |
+ The new window size to apply after the feature extractor. |
+
+ None
+ |
+
degraded_only |
+
+ Optional[List[Literal['dev', 'val', 'test']]]
+ |
+ Whether to load only degraded samples for the |
+
+ None
+ |
+
check_compatibility(other)
+
+Check if another RulDataModule is compatible to be used together with this one.
+RulDataModules can be used together in higher-order data modules,
+e.g. DomainAdaptionDataModule. This function checks if other
is compatible to
+this data module to do so. It checks the underlying dataset readers, matching
+batch size, feature extractor and window size. If anything is incompatible,
+this function will raise a ValueError.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
other |
+
+ RulDataModule
+ |
+ The RulDataModule to check compatibility with. |
+ + required + | +
data()
+
+
+ property
+
+
+A dictionary of the training, validation and test splits.
+Each split is a tuple of feature and target tensors.
+The keys are dev
(training split), val
(validation split) and test
+(test split).
fds()
+
+
+ property
+
+
+Index list of the available subsets of the underlying dataset, i.e.
+[1, 2, 3, 4]
for CMAPSS
.
is_mutually_exclusive(other)
+
+Check if the other data module is mutually exclusive to this one. See +AbstractReader.is_mutually_exclusive.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
other |
+
+ RulDataModule
+ |
+ Data module to check exclusivity against. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+ Whether both data modules are mutually exclusive. |
+
load_split(split, alias=None, degraded_only=None)
+
+Load a split from the underlying reader and apply the feature extractor.
+By setting alias, it is possible to load a split aliased as another split, +e.g., load the test split and treat it as the dev split. The data of the split +is loaded, but all pre-processing steps of alias are carried out.
+If degraded_only
is set, only degraded samples are loaded. This is only
+possible if the underlying reader has a max_rul
set or norm_rul
is set to
+True
. The degraded_only
argument takes precedence over the degraded_only
+of the data module.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
split |
+
+ str
+ |
+ The desired split to load. |
+ + required + | +
alias |
+
+ Optional[str]
+ |
+ The split as which the loaded data should be treated. |
+
+ None
+ |
+
degraded_only |
+
+ Optional[bool]
+ |
+ Whether to only load degraded samples. |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ Tuple[List[torch.Tensor], List[torch.Tensor]]
+ |
+ The feature and target tensors of the split's runs. |
+
prepare_data(*args, **kwargs)
+
+Download and pre-process the underlying data.
+This calls the prepare_data
function of the underlying reader. All
+previously completed preparation steps are skipped. It is called
+automatically by pytorch_lightning
and executed on the first GPU in
+distributed mode.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ {}
+ |
+
reader()
+
+
+ property
+
+
+The underlying dataset reader.
+ +setup(stage=None)
+
+Load all splits as tensors into memory and optionally apply feature extractor.
+The splits are placed inside the data +property. If a split is empty, a tuple of empty tensors with the correct +number of dimensions is created as a placeholder. This ensures compatibility +with higher-order data modules.
+If the data module was constructed with a feature_extractor
argument,
+the feature windows are passed to the feature extractor. The resulting,
+new features may be re-windowed.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
stage |
+
+ Optional[str]
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ None
+ |
+
test_dataloader(*args, **kwargs)
+
+Create a data loader for the test split.
+The data loader is configured to leave the data unshuffled. The pin_memory
+option is activated to achieve maximum transfer speed to the GPU.
The whole split is held in memory. Therefore, the num_workers
are set to
+zero which uses the main process for creating batches.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataLoader
+ |
+ The test data loader |
+
to_dataset(split, alias=None)
+
+Create a dataset of a split.
+This convenience function creates a plain tensor dataset to use outside the rul_datasets
library.
The data placed inside the dataset will be from the specified split
. If
+alias
is set, the loaded data will be treated as if from the alias
split.
+For example, one could load the test data and treat them as if it was the
+training data. This may be useful for inductive domain adaption.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
split |
+
+ str
+ |
+ The split to place inside the dataset. |
+ + required + | +
alias |
+
+ Optional[str]
+ |
+ The split the loaded data should be treated as. |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ TensorDataset
+ |
+ A dataset containing the requested split. |
+
train_dataloader(*args, **kwargs)
+
+Create a data loader for the training split.
+The data loader is configured to shuffle the data. The pin_memory
option is
+activated to achieve maximum transfer speed to the GPU. The data loader is also
+configured to drop the last batch of the data if it would only contain one
+sample.
The whole split is held in memory. Therefore, the num_workers
are set to
+zero which uses the main process for creating batches.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataLoader
+ |
+ The training data loader |
+
val_dataloader(*args, **kwargs)
+
+Create a data loader for the validation split.
+The data loader is configured to leave the data unshuffled. The pin_memory
+option is activated to achieve maximum transfer speed to the GPU.
The whole split is held in memory. Therefore, the num_workers
are set to
+zero which uses the main process for creating batches.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataLoader
+ |
+ The validation data loader |
+
This module contains the base class for all readers. It is only relevant to people +that want to extend this package with their own dataset.
+ + + +AbstractReader
+
+
+This reader is the abstract base class of all readers.
+In case you want to extend this library with a dataset of your own, you should
+create a subclass of AbstractReader
. It defines the public interface that all
+data modules in this library use. Just inherit from this class, implement the
+abstract functions, and you should be good to go.
Please consider contributing your work afterward to help the community.
+ +Examples:
+>>> import rul_datasets
+>>> class MyReader(rul_datasets.reader.AbstractReader):
+... @property
+... def dataset_name(self):
+... return "my_dataset"
+...
+... @property
+... def fds(self):
+... return [1]
+...
+... def prepare_data(self):
+... pass
+...
+... def default_window_size(self, fd):
+... return 30
+...
+... def load_complete_split(self, split, alias):
+... features = [np.random.randn(100, 2, 30) for _ in range(10)]
+... targets = [np.arange(100, 0, -1) for _ in range(10)]
+...
+... return features, targets
+...
+>>> my_reader = MyReader(fd=1)
+>>> features, targets = my_reader.load_split("dev")
+>>> features[0].shape
+(100, 2, 30)
+
__init__(fd, window_size=None, max_rul=None, percent_broken=None, percent_fail_runs=None, truncate_val=False, truncate_degraded_only=False)
+
+Create a new reader. If your reader needs additional input arguments,
+create your own __init__
function and call this one from within as super(
+).__init__(...)
.
For more information about using readers refer to the reader module page.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
fd |
+
+ int
+ |
+ Index of the selected sub-dataset |
+ + required + | +
window_size |
+
+ Optional[int]
+ |
+ Size of the sliding window. Defaults to 2560. |
+
+ None
+ |
+
max_rul |
+
+ Optional[int]
+ |
+ Maximum RUL value of targets. |
+
+ None
+ |
+
percent_broken |
+
+ Optional[float]
+ |
+ The maximum relative degradation per time series. |
+
+ None
+ |
+
percent_fail_runs |
+
+ Optional[Union[float, List[int]]]
+ |
+ The percentage or index list of available time series. |
+
+ None
+ |
+
truncate_val |
+
+ bool
+ |
+ Truncate the validation data with |
+
+ False
+ |
+
truncate_degraded_only |
+
+ bool
+ |
+ Only truncate the degraded part of the data + (< max RUL). |
+
+ False
+ |
+
check_compatibility(other)
+
+Check if the other reader is compatible with this one.
+Compatibility of two readers ensures that training with both will probably +succeed and produce valid results. Two readers are considered compatible, if +they:
+are both children of AbstractReader
+have the same window size
have the same max_rul
If any of these conditions is not met, the readers are considered
+misconfigured and a ValueError
is thrown.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
other |
+
+ AbstractReader
+ |
+ Another reader object. |
+ + required + | +
dataset_name()
+
+
+ abstractmethod
+ property
+
+
+Name of the dataset.
+ +default_window_size(fd)
+
+
+ abstractmethod
+
+
+The default window size of the data set. This may vary from sub-dataset to +sub-dataset.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
fd |
+
+ int
+ |
+ The index of a sub-dataset. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ int
+ |
+ The default window size for the sub-dataset. |
+
fds()
+
+
+ abstractmethod
+ property
+
+
+The indices of available sub-datasets.
+ +get_compatible(fd=None, percent_broken=None, percent_fail_runs=None, truncate_val=None, consolidate_window_size='override')
+
+Create a new reader of the desired sub-dataset that is compatible to this one +(see check_compatibility). Useful for +domain adaption.
+The values for percent_broken
, percent_fail_runs
and truncate_val
of
+the new reader can be overridden.
When constructing a compatible reader for another sub-dataset, the window
+size of this reader will be used to override the default window size of the
+new reader. This behavior can be changed by setting consolidate_window_size
+to "min"
. The window size of this reader and the new one will be set to the
+minimum of this reader's window size and the default window size of the new
+reader. Please be aware that this will change the window size of this
+reader, too. If the new reader should use its default window size,
+set consolidate_window_size
to "none"
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
fd |
+
+ Optional[int]
+ |
+ The index of the sub-dataset for the new reader. |
+
+ None
+ |
+
percent_broken |
+
+ Optional[float]
+ |
+ Override this value in the new reader. |
+
+ None
+ |
+
percent_fail_runs |
+
+ Union[float, List[int], None]
+ |
+ Override this value in the new reader. |
+
+ None
+ |
+
truncate_val |
+
+ Optional[bool]
+ |
+ Override this value in the new reader. |
+
+ None
+ |
+
consolidate_window_size |
+
+ Literal['override', 'min', 'none']
+ |
+ How to consolidate the window size of the readers. |
+
+ 'override'
+ |
+
Returns:
+Type | +Description | +
---|---|
+ AbstractReader
+ |
+ A compatible reader with optional overrides. |
+
get_complement(percent_broken=None, truncate_val=None)
+
+Get a compatible reader that contains all development runs that are not in +this reader (see check_compatibility). Useful for +semi-supervised learning.
+The new reader will contain the development runs that were discarded in this
+reader due to truncation through percent_fail_runs
. If percent_fail_runs
+was not set or this reader contains all development runs, it returns a reader
+with an empty development set.
The values for percent_broken
, and truncate_val
of the new reader can be
+overridden.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
percent_broken |
+
+ Optional[float]
+ |
+ Override this value in the new reader. |
+
+ None
+ |
+
truncate_val |
+
+ Optional[bool]
+ |
+ Override this value in the new reader. |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ AbstractReader
+ |
+ A compatible reader with all development runs missing in this one. |
+
hparams()
+
+
+ property
+
+
+All information logged by the data modules as hyperparameters in PyTorch +Lightning.
+ +is_mutually_exclusive(other)
+
+Check if this reader is mutually exclusive to another reader.
+Two readers are mutually exclusive if:
+percent_fail_runs
arguments do not overlap (float arguments overlap
+ if they are greater than zero)Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
other |
+
+ AbstractReader
+ |
+ The reader to check exclusivity against. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+ Whether the readers are mutually exclusive. |
+
load_complete_split(split, alias)
+
+
+ abstractmethod
+
+
+Load a complete split without truncation.
+This function should return the features and targets of the desired split.
+Both should be contained in a list of numpy arrays. Each of the arrays
+contains one time series. The features should have a shape of [num_windows,
+window_size, num_channels]
and the targets a shape of [num_windows]
. The
+features should be scaled as desired. The targets should be capped by
+max_rul
.
By setting alias
, it should be possible to load a split aliased as another
+split, e.g. load the test split and treat it as the dev split. The data of
+split
should be loaded but all pre-processing steps of alias
should be
+carried out.
This function is used internally in load_split which takes care of +truncation.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
split |
+
+ str
+ |
+ The name of the split to load. |
+ + required + | +
alias |
+
+ str
+ |
+ The split as which the loaded data should be treated. |
+ + required + | +
Returns:
+Name | Type | +Description | +
---|---|---|
features |
+ List[np.ndarray]
+ |
+ The complete, scaled features of the desired split. |
+
targets |
+ List[np.ndarray]
+ |
+ The capped target values corresponding to the features. |
+
load_split(split, alias=None)
+
+Load a split as tensors and apply truncation to it.
+This function loads the scaled features and the targets of a split into
+memory. Afterwards, truncation is applied if the split
is set to dev
. The
+validation set is also truncated with percent_broken
if truncate_val
is
+set to True
.
By setting alias
, it is possible to load a split aliased as another split,
+e.g. load the test split and treat it as the dev split. The data of split
+is loaded but all pre-processing steps of alias
are carried out.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
split |
+
+ str
+ |
+ The desired split to load. |
+ + required + | +
alias |
+
+ Optional[str]
+ |
+ The split as which the loaded data should be treated. |
+
+ None
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
features |
+ List[np.ndarray]
+ |
+ The scaled, truncated features of the desired split. |
+
targets |
+ List[np.ndarray]
+ |
+ The truncated targets of the desired split. |
+
prepare_data()
+
+
+ abstractmethod
+
+
+Prepare the data. This function should take care of things that need to be +done once, before the data can be used. This may include downloading, +extracting or transforming the data, as well as fitting scalers. It is best +practice to check if a preparation step was completed before to avoid +repeating it unnecessarily.
+ +The NASA CMAPSS Turbofan Degradation dataset is a collection of simulated +degradation experiments on jet engines. It contains four sub-datasets named FD1, FD2, +FD3 and FD4 which differ in operation conditions and possible failure types.
+ + + +CmapssReader
+
+
+
+ Bases: AbstractReader
This reader represents the NASA CMAPSS Turbofan Degradation dataset. Each of its +four sub-datasets contains a training and a test split. Upon first usage, +the training split will be further divided into a development and a validation +split. 20% of the original training split is reserved for validation.
+The features are provided as sliding windows over each time series in the +dataset. The label of a window is the label of its last time step. The RUL labels +are capped by a maximum value. The original data contains 24 channels per time +step. Following the literature, we omit the constant channels and operation +condition channels by default. Therefore, the default channel indices are 4, 5, +6, 9, 10, 11, 13, 14, 15, 16, 17, 19, 22 and 23.
+The features are min-max scaled between -1 and 1. The scaler is fitted on the +development data only.
+ +Examples:
+Default channels
+>>> import rul_datasets
+>>> fd1 = rul_datasets.reader.CmapssReader(fd=1, window_size=30)
+>>> fd1.prepare_data()
+>>> features, labels = fd1.load_split("dev")
+>>> features[0].shape
+(163, 30, 14)
+
Custom channels
+>>> import rul_datasets
+>>> fd1 = rul_datasets.reader.CmapssReader(fd=1, feature_select=[1, 2, 3])
+>>> fd1.prepare_data()
+>>> features, labels = fd1.load_split("dev")
+>>> features[0].shape
+(163, 30, 3)
+
__init__(fd, window_size=None, max_rul=125, percent_broken=None, percent_fail_runs=None, feature_select=None, truncate_val=False, operation_condition_aware_scaling=False, truncate_degraded_only=False)
+
+Create a new CMAPSS reader for one of the sub-datasets. The maximum RUL value
+is set to 125 by default. The 14 feature channels selected by default can be
+overridden by passing a list of channel indices to feature_select
. The
+default window size is defined per sub-dataset as the minimum time series
+length in the test set.
The data can be scaled separately for each operation condition, as done by +Ragab et al. This only affects FD002 and FD004 due to them having multiple +operation conditions.
+For more information about using readers refer to the reader module page.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
fd |
+
+ int
+ |
+ Index of the selected sub-dataset |
+ + required + | +
window_size |
+
+ Optional[int]
+ |
+ Size of the sliding window. Default defined per sub-dataset. |
+
+ None
+ |
+
max_rul |
+
+ Optional[int]
+ |
+ Maximum RUL value of targets. |
+
+ 125
+ |
+
percent_broken |
+
+ Optional[float]
+ |
+ The maximum relative degradation per time series. |
+
+ None
+ |
+
percent_fail_runs |
+
+ Optional[Union[float, List[int]]]
+ |
+ The percentage or index list of available time series. |
+
+ None
+ |
+
feature_select |
+
+ Optional[List[int]]
+ |
+ The index list of selected feature channels. |
+
+ None
+ |
+
truncate_val |
+
+ bool
+ |
+ Truncate the validation data with percent_broken, too. |
+
+ False
+ |
+
operation_condition_aware_scaling |
+
+ bool
+ |
+ Scale data separately for each + operation condition. |
+
+ False
+ |
+
truncate_degraded_only |
+
+ bool
+ |
+ Only truncate the degraded part of the data + (< max RUL). |
+
+ False
+ |
+
fds()
+
+
+ property
+
+
+Indices of available sub-datasets.
+ +prepare_data()
+
+Prepare the CMAPSS dataset. This function needs to be called before using the +dataset for the first time.
+The dataset is downloaded from a custom mirror and extracted into the data +root directory. The training data is then split into development and +validation set. Afterwards, a scaler is fit on the development features. +Previously completed steps are skipped.
+ +A module for working with the data root directory.
+ + + +get_data_root()
+
+Return the path to the data root directory.
+The default data root is located at ~/.rul-datasets
. You can customize the
+location by setting the environment variable RUL_DATASETS_DATA_ROOT
or by
+calling set_data_root.
A manually set data root must be an already existing directory.
+ +Returns:
+Type | +Description | +
---|---|
+ str
+ |
+ The data root path. |
+
set_data_root(data_root)
+
+Set the data root to the specified location.
+A manually set data root must be an already existing directory.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
data_root |
+
+ str
+ |
+ The new data root location. |
+ + required + | +
This dummy dataset is intended for testing and debugging purposes, not for +benchmarking. If your approach can fit this dataset it means that it is able to learn +how to estimate RUL. It does not mean it is good at it.
+ + + +DummyReader
+
+
+
+ Bases: AbstractReader
This reader represents a simple, small dummy dataset that can be used to test or +debug RUL estimation approaches. It contains ten runs for each split with a +single feature which makes it easy to hold in memory even on low-end computers. +The dataset is so simple that it can be sufficiently fit by a three-layer +perceptron in less than 50 epochs.
+Each run is randomly generated by sampling a run length between 90 and 110 time
+steps and creating a piece-wise linear RUL function y(t)
with a maximum value of
+max_rul
. The feature x(t)
is then calculated as:
where N(loc, scale)
is a function drawing a sample from a normal distribution
+with a mean of loc
and a standard deviation of scale
. The dev
, val
and
+test
splits are all generated the same way with a different fixed random seed.
+This makes generating the dataset deterministic.
The dummy dataset contains two sub-datasets. The first uses an offset
of
+0.5 and a noise_factor
of 0.01. The second uses an offset
of 0.75 and a
+noise_factor
of 0.02. Both use a default window size of 10 and are min-max
+scaled between -1 and 1 with a scaler fitted on the dev
split.
Examples:
+>>> import rul_datasets
+>>> fd1 = rul_datasets.reader.DummyReader(fd=1)
+>>> features, labels = fd1.load_split("dev")
+>>> features[0].shape
+(81, 10, 1)
+
__init__(fd, window_size=None, max_rul=50, percent_broken=None, percent_fail_runs=None, truncate_val=False, truncate_degraded_only=False)
+
+Create a new dummy reader for one of the two sub-datasets. The maximum RUL +value is set to 50 by default. Please be aware that changing this value will +lead to different features, too, as they are calculated based on the RUL +values.
+For more information about using readers, refer to the reader module page.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
fd |
+
+ int
+ |
+ Index of the selected sub-dataset |
+ + required + | +
window_size |
+
+ Optional[int]
+ |
+ Size of the sliding window. Default defined per sub-dataset. |
+
+ None
+ |
+
max_rul |
+
+ Optional[int]
+ |
+ Maximum RUL value of targets. |
+
+ 50
+ |
+
percent_broken |
+
+ Optional[float]
+ |
+ The maximum relative degradation per time series. |
+
+ None
+ |
+
percent_fail_runs |
+
+ Optional[Union[float, List[int]]]
+ |
+ The percentage or index list of available time series. |
+
+ None
+ |
+
truncate_val |
+
+ bool
+ |
+ Truncate the validation data with percent_broken, too. |
+
+ False
+ |
+
truncate_degraded_only |
+
+ bool
+ |
+ Only truncate the degraded part of the data + (< max RUL). |
+
+ False
+ |
+
fds()
+
+
+ property
+
+
+Indices of available sub-datasets.
+ +prepare_data()
+
+This function has no effect as there is nothing to prepare.
+ +The FEMTO (PRONOSTIA) Bearing dataset is a collection of run-to-failure +experiments on bearings. Three different operation conditions were used, resulting in +three sub-datasets. Sub-dataset 1 and 2 contain two training runs and five test runs, +while sub-dataset 3 contains only one test run. It was part of the 2012 IEEE +Prognostics Challenge.
+ + + +FemtoReader
+
+
+
+ Bases: AbstractReader
This reader represents the FEMTO (PRONOSTIA) Bearing dataset. Each of its three
+sub-datasets contains a training and a test split. By default, the reader
+constructs a validation split for sub-datasets 1 and 2 each by taking the first
+run of the test split. For sub-dataset 3, the second training run is used for
+validation because only one test run is available. The remaining training data is
+denoted as the development split. This run to split assignment can be overridden
+by setting run_split_dist
.
The features contain windows with three channels. Only the two acceleration +channels are used because the test runs are missing the temperature channel. +These features are standardized to zero mean and one standard deviation. The +scaler is fitted on the development data.
+ +Examples:
+Default splits:
+>>> import rul_datasets
+>>> fd1 = rul_datasets.reader.FemtoReader(fd=1)
+>>> fd1.prepare_data()
+>>> features, labels = fd1.load_split("dev")
+>>> features[0].shape
+(2803, 2560, 2)
+
Custom splits:
+>>> import rul_datasets
+>>> splits = {"dev": [5], "val": [4], "test": [3]}
+>>> fd1 = rul_datasets.reader.FemtoReader(fd=1, run_split_dist=splits)
+>>> fd1.prepare_data()
+>>> features, labels = fd1.load_split("dev")
+>>> features[0].shape
+(2463, 2560, 2)
+
Set first-time-to-predict:
+>>> import rul_datasets
+>>> fttp = [10, 20, 30, 40, 50]
+>>> fd1 = rul_datasets.reader.FemtoReader(fd=1, first_time_to_predict=fttp)
+>>> fd1.prepare_data()
+>>> features, labels = fd1.load_split("dev")
+>>> labels[0][:15]
+array([2793., 2793., 2793., 2793., 2793., 2793., 2793., 2793., 2793.,
+ 2793., 2793., 2792., 2791., 2790., 2789.])
+
__init__(fd, window_size=None, max_rul=None, percent_broken=None, percent_fail_runs=None, truncate_val=False, run_split_dist=None, first_time_to_predict=None, norm_rul=False, truncate_degraded_only=False)
+
+Create a new FEMTO reader for one of the sub-datasets. By default, the RUL +values are not capped. The default window size is 2560.
+Use first_time_to_predict
to set an individual RUL inflection point for
+each run. It should be a list with an integer index for each run. The index
+is the time step after which RUL declines. Before the time step it stays
+constant. The norm_rul
argument can then be used to scale the RUL of each
+run between zero and one.
For more information about using readers refer to the reader module page.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
fd |
+
+ int
+ |
+ Index of the selected sub-dataset |
+ + required + | +
window_size |
+
+ Optional[int]
+ |
+ Size of the sliding window. Defaults to 2560. |
+
+ None
+ |
+
max_rul |
+
+ Optional[int]
+ |
+ Maximum RUL value of targets. |
+
+ None
+ |
+
percent_broken |
+
+ Optional[float]
+ |
+ The maximum relative degradation per time series. |
+
+ None
+ |
+
percent_fail_runs |
+
+ Optional[Union[float, List[int]]]
+ |
+ The percentage or index list of available time series. |
+
+ None
+ |
+
truncate_val |
+
+ bool
+ |
+ Truncate the validation data with percent_broken, too. |
+
+ False
+ |
+
run_split_dist |
+
+ Optional[Dict[str, List[int]]]
+ |
+ Dictionary that assigns each run idx to each split. |
+
+ None
+ |
+
first_time_to_predict |
+
+ Optional[List[int]]
+ |
+ The time step for each time series before which RUL + is constant. |
+
+ None
+ |
+
norm_rul |
+
+ bool
+ |
+ Normalize RUL between zero and one. |
+
+ False
+ |
+
truncate_degraded_only |
+
+ bool
+ |
+ Only truncate the degraded part of the data + (< max RUL). |
+
+ False
+ |
+
fds()
+
+
+ property
+
+
+Indices of available sub-datasets.
+ +prepare_data()
+
+Prepare the FEMTO dataset. This function needs to be called before using the +dataset and each custom split for the first time.
+The dataset is downloaded from a custom mirror and extracted into the data +root directory. The whole dataset is converted from CSV files to NPY files to +speed up loading it from disk. Afterwards, a scaler is fit on the development +features. Previously completed steps are skipped.
+ +A module for dataset readers. Currently supported datasets are:
+Readers are the foundation of the RUL Datasets library. They provide access to the +data on disk and convert them into a common format so that other parts of the library +can interact with it. The common format is as follows:
+Each dataset consists of multiple sub-datasets. The indices of these sub-datasets
+are called FD
, following CMAPSS convention.
Each sub-dataset contains a development (dev
), a validation (val
) and test split
+(test
).
Each split contains one or multiple time series of features and RUL targets that +represent run-to-failure experiments.
+At each time step of a time series we have a window of features and a target RUL +value. The target is the RUL value of the last time step of the window.
+A reader class, e.g. the CmapssReader +represents a dataset and can manipulate it to your liking. A reader object has access +to one sub-dataset of the dataset:
+ +The reader object can load the features and targets of each split into memory:
+>>> dev_features, dev_targets = reader.load_split("dev")
+>>> val_features, val_targets = reader.load_split("val")
+>>> test_features, test_targets = reader.load_split("test")
+
The features are a list of numpy arrays where each array has a shape of
+[num_windows, window_size, num_channels]
:
The targets are a list of numpy arrays, too, where each array has a
+shape of [num_windows]
:
Each reader defines a default window size for its data. This can be overridden by the
+window_size
argument:
>>> fd1 = CmapssReader(fd=1, window_size=15)
+>>> features, _ = fd1.load_split("dev")
+>>> features[0].shape
+(163, 15, 14)
+
Some datasets, i.e. CMAPSS, use a piece-wise linear RUL function, where a maximum RUL
+value is defined. The maximum RUL value for a reader can be set via the max_rul
+argument:
>>> fd1 = CmapssReader(fd=1, max_rul=100)
+>>> _, targets = fd1.load_split("dev")
+>>> max(np.max(t) for t in targets)
+100.0
+
If you want to use a sub-dataset as unlabeled data, e.g. for unsupervised domain
+adaption, it should not contain features from the point of failure. If the data
+contains these features, there would be no reason for it to be unlabeled. The
+percent_broken
argument controls how much data near failure is available. A
+percent_broken
of 0.8
for example means that only the first 80% of each time
+series are available:
>>> fd1 = CmapssReader(fd=1, percent_broken=0.8)
+>>> features, targets = fd1.load_split("dev")
+>>> features[0].shape
+(130, 30, 14)
+>>> np.min(targets[0])
+34.0
+
If you have set a max_rul
you may only want to truncate data that is considered
+degraded, i.e. with a RUL value smaller than max_rul
. You can use the
+truncate_degraded_only
option to do that. This way, the data where the RUL value is
+smaller or equal to (1 - percent_broken) * max_rul
is cut off.
>>> fd1 = CmapssReader(fd=1, percent_broken=0.8, truncate_degraded_only=True)
+>>> features, targets = fd1.load_split("dev")
+>>> features[0].shape
+(138, 30, 14)
+>>> np.min(targets[0])
+26.0
+
You may want to apply the same percent_broken
from your training data to your
+validation data. This is sensible if you do not expect that your algorithm has access
+to labeled validation data in real life. You can achieve this by setting
+truncate_val
to True
:
>>> fd1 = CmapssReader(fd=1, percent_broken=0.8, truncate_val=True)
+>>> features, targets = fd1.load_split("val")
+>>> np.min(targets[0])
+44.0
+
Data-driven RUL estimation algorithms are often sensitive to the overall amount of
+training data. The more data is available, the more of its variance is covered. If
+you want to investigate how an algorithm performs in a low-data setting, you can use
+percent_fail_runs
. This argument controls how many runs are used for training. A
+percent_fail_runs
of 0.8
means that 80% of the available training runs are used.
+If you need more control over which runs are used, you can pass a list of indices to
+use only these runs. This is useful for conducting semi-supervised learning where you
+consider one part of a sub-dataset labeled and the other part unlabeled:
>>> fd1 = CmapssReader(fd=1, percent_fail_runs=0.8)
+>>> features, targets = fd1.load_split("dev")
+>>> len(features)
+64
+>>> fd1 = CmapssReader(fd=1, percent_fail_runs=[0, 5, 40])
+>>> features, targets = fd1.load_split("dev")
+>>> len(features)
+3
+
If you have constructed a reader with a certain percent_fail_runs
, you can get a
+reader containing all other runs by using the get_complement
function:
>>> fd1 = CmapssReader(fd=1, percent_fail_runs=0.8)
+>>> fd1_complement = fd1.get_complement()
+>>> features, targets = fd1_complement.load_split("dev")
+>>> len(features)
+16
+
The effects of percent_broken
and percent_fail_runs
are summarized under the term
+truncation as they effectively truncate the dataset in two dimensions.
The readers for the FEMTO and XJTU-SY datasets have two additional constructor
+arguments. The first_time_to_predict
lets you set an individual maximum RUL value
+per run in the dataset. As both are bearing datasets, the first-time-to-predict is
+defined as the time step where the degradation of the bearing is first noticeable.
+The RUL value before this time step is assumed to be constant. Setting norm_rul
+scales the RUL between [0, 1] per run, as it is best practice when using
+first-time-to-predict.
>>> fttp = [10, 20, 30, 40, 50]
+>>> fd1 = rul_datasets.reader.XjtuSyReader(
+... fd=1, first_time_to_predict=fttp, norm_rul=True
+... )
+>>> fd1.prepare_data()
+>>> features, labels = fd1.load_split("dev")
+>>> labels[0][:15]
+array([1. , 1. , 1. , 1. , 1. ,
+ 1. , 1. , 1. , 1. , 1. ,
+ 1. , 0.99115044, 0.98230088, 0.97345133, 0.96460177])
+
Readers can be used as is if you just want access to the dataset. If you plan to use +them with PyTorch or PyTorch Lightning, it is recommended to combine them with a +RulDataModule:
+ +For more information, see core module page or the +Libraries page.
+ + + +A module with functions for efficient saving and loading of RUL features and +targets.
+ + + +exists(save_path)
+
+Return if the files resulting from a save
call with save_path
exist.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
save_path |
+
+ str
+ |
+ the |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ bool
+ |
+ Whether the files exist |
+
load(save_path, memmap=False)
+
+Load features and targets of a run from .npy files.
+This method is used to restore runs that were saved with the save function. If the runs are too large for the RAM,
+memmap
can be set to True to avoid reading them completely to memory. This
+results in slower processing, though.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
save_path |
+
+ str
+ |
+ Path that was supplied to the + save function. |
+ + required + | +
memmap |
+
+ bool
+ |
+ whether to use memmap to avoid loading the whole run into memory |
+
+ False
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
features |
+ np.ndarray
+ |
+ The feature array saved in |
+
targets |
+ np.ndarray
+ |
+ The target array saved in |
+
load_multiple(save_paths, memmap=False)
+
+Load multiple runs with the load function.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
save_paths |
+
+ List[str]
+ |
+ The list of run files to load. |
+ + required + | +
memmap |
+
+ bool
+ |
+ See load |
+
+ False
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
features |
+ List[np.ndarray]
+ |
+ The feature arrays saved in |
+
targets |
+ List[np.ndarray]
+ |
+ The target arrays saved in |
+
save(save_path, features, targets)
+
+Save features and targets of a run to .npy files.
+The arrays are saved to separate .npy files to enable memmap mode in case RAM is
+short. The files are saved as save_path
to the
+load function. If the save_path
does not have
+the .npy file extension, .npy will be appended.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
save_path |
+
+ str
+ |
+ The path including file name to save the arrays to. |
+ + required + | +
features |
+
+ np.ndarray
+ |
+ The feature array to save. |
+ + required + | +
targets |
+
+ np.ndarray
+ |
+ The targets array to save. |
+ + required + | +
A module with functions for scaling RUL features.
+ + + +Scaler = Union[scalers.StandardScaler, scalers.MinMaxScaler, scalers.MaxAbsScaler, scalers.RobustScaler]
+
+
+ module-attribute
+
+
+Supported scalers:
+ +OperationConditionAwareScaler
+
+
+
+ Bases: BaseEstimator
, TransformerMixin
This scaler is an ensemble of multiple base scalers, e.g. [ +sklearn.preprocessing.MinMaxScaler][]. It takes an additional operation condition +array while fitting and transforming that controls which base scaler is used. The +operation condition corresponding to a sample is compared against the boundaries +defined during construction of the scaler. If the condition lies between the +first set of boundaries, the first base scaler is used, and so forth. +If any condition does not fall between any boundaries, an exception will be +raised and the boundaries should be adjusted.
+ + + + + +__init__(base_scaler, boundaries)
+
+Create a new scaler aware of operation conditions.
+Each pair in boundaries
represents the lower and upper value of an
+inclusive interval. For each interval a copy of the base_scaler
is
+maintained. If an operation condition value falls inside an interval,
+the corresponding scaler is used. The boundaries have to be mutually exclusive.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
base_scaler |
+
+ Scaler
+ |
+ The scaler that should be used for each condition. |
+ + required + | +
boundaries |
+
+ List[Tuple[float, float]]
+ |
+ The pairs that form the inclusive boundaries of each condition. |
+ + required + | +
n_features_in_()
+
+
+ property
+
+
+Number of expected input features.
+ +partial_fit(features, operation_conditions)
+
+Fit the base scalers partially.
+This function calls partial_fit
on each of the base scalers with the
+samples that fall into the corresponding condition boundaries. If any sample
+does not fall into one of the boundaries, an exception is raised.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
features |
+
+ np.ndarray
+ |
+ The feature array to be scaled. |
+ + required + | +
operation_conditions |
+
+ np.ndarray
+ |
+ The condition values compared against the boundaries. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ OperationConditionAwareScaler
+ |
+ The partially fitted scaler. |
+
transform(features, operation_conditions)
+
+Scale the features with the appropriate condition aware scaler.
+This function calls transform
on each of the base scalers for the
+samples that fall into the corresponding condition boundaries. If any sample
+does not fall into one of the boundaries, an exception is raised.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
features |
+
+ np.ndarray
+ |
+ The features to be scaled. |
+ + required + | +
operation_conditions |
+
+ np.ndarray
+ |
+ The condition values compared against the boundaries. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ np.ndarray
+ |
+ The scaled features. |
+
fit_scaler(features, scaler=None, operation_conditions=None)
+
+Fit a given scaler to the RUL features. If the scaler is omitted, +a StandardScaler will be created.
+If the scaler is an [OperationConditionAwareScaler][
+rul_datasets.reader.scaling.OperationConditionAwareScaler] and
+operation_conditions
are passed, the scaler will be fit aware of operation
+conditions.
The scaler assumes that the last axis of the features are the channels. Only +scalers unaware of operation conditions can be fit with windowed data.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
features |
+
+ List[np.ndarray]
+ |
+ The RUL features. |
+ + required + | +
scaler |
+
+ Optional[Union[Scaler, OperationConditionAwareScaler]]
+ |
+ The scaler to be fit. Defaults to a StandardScaler. |
+
+ None
+ |
+
operation_conditions |
+
+ Optional[List[np.ndarray]]
+ |
+ The operation conditions for condition aware scaling. |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ Union[Scaler, OperationConditionAwareScaler]
+ |
+ The fitted scaler. |
+
load_scaler(save_path)
+
+Load a scaler from disk.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
save_path |
+
+ str
+ |
+ The path the scaler was saved to. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Scaler
+ |
+ The loaded scaler. |
+
save_scaler(scaler, save_path)
+
+Save a scaler to disk.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
scaler |
+
+ Scaler
+ |
+ The scaler to be saved. |
+ + required + | +
save_path |
+
+ str
+ |
+ The path to save the scaler to. |
+ + required + | +
scale_features(features, scaler, operation_conditions=None)
+
+Scale the RUL features with a given scaler.
+The features can have a shape of [num_time_steps, channels]
or [num_windows,
+window_size, channels]
. The scaler needs to work on the channel dimension. If it
+was not fit with the right number of channels, a ValueError
is thrown.
If the scaler is operation condition aware, the operation_conditions
argument
+needs to be passed. Windowed data cannot be scaled this way.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
features |
+
+ List[np.ndarray]
+ |
+ The RUL features to be scaled. |
+ + required + | +
scaler |
+
+ Union[Scaler, OperationConditionAwareScaler]
+ |
+ The already fitted scaler. |
+ + required + | +
operation_conditions |
+
+ Optional[List[np.ndarray]]
+ |
+ The operation conditions for condition aware scaling. |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ List[np.ndarray]
+ |
+ The scaled features. |
+
A module with functions for truncating RUL data.
+ + + +truncate_runs(features, targets, percent_broken=None, included_runs=None, degraded_only=False)
+
+Truncate RUL data according to percent_broken
and included_runs
.
RUL data has two dimensions in which it can be truncated: the number of runs and +the length of the runs. Truncating the number of runs limits the inter-run +variety of the data. Truncating the length of the run limits the amount of +available data near failure.
+For more information about truncation, see the reader +module page.
+ +Examples:
+Truncating via percent_broken
>>> import numpy as np
+>>> from rul_datasets.reader.truncating import truncate_runs
+>>> features = [np.random.randn(i*100, 5) for i in range(1, 6)]
+>>> targets = [np.arange(i*100)[::-1] for i in range(1, 6)]
+>>> (features[0].shape, targets[0].shape)
+((100, 5), (100,))
+>>> features, targets = truncate_runs(features, targets, percent_broken=0.8)
+>>> (features[0].shape, targets[0].shape) # runs are shorter
+((80, 5), (80,))
+>>> np.min(targets[0]) # runs contain no failures
+20
+
The XJTU-SY Bearing dataset is a collection of run-to-failure experiments on +bearings. Three different operation conditions were used, resulting in three +sub-datasets. Each sub-dataset contains five runs without an official training/test +split.
+ + + +XjtuSyReader
+
+
+
+ Bases: AbstractReader
This reader represents the XJTU-SY Bearing dataset. Each of its three
+sub-datasets contains five runs. By default, the reader assigns the first two to
+the development, the third to the validation and the remaining two to the test
+split. This run to split assignment can be overridden by setting run_split_dist
.
The features contain windows with two channels of acceleration data which are +standardized to zero mean and one standard deviation. The scaler is fitted on the +development data.
+ +Examples:
+Default splits:
+>>> import rul_datasets
+>>> fd1 = rul_datasets.reader.XjtuSyReader(fd=1)
+>>> fd1.prepare_data()
+>>> features, labels = fd1.load_split("dev")
+>>> features[0].shape
+(123, 32768, 2)
+
Custom splits:
+>>> import rul_datasets
+>>> splits = {"dev": [5], "val": [4], "test": [3]}
+>>> fd1 = rul_datasets.reader.XjtuSyReader(fd=1, run_split_dist=splits)
+>>> fd1.prepare_data()
+>>> features, labels = fd1.load_split("dev")
+>>> features[0].shape
+(52, 32768, 2)
+
Set first-time-to-predict:
+>>> import rul_datasets
+>>> fttp = [10, 20, 30, 40, 50]
+>>> fd1 = rul_datasets.reader.XjtuSyReader(fd=1, first_time_to_predict=fttp)
+>>> fd1.prepare_data()
+>>> features, labels = fd1.load_split("dev")
+>>> labels[0][:15]
+array([113., 113., 113., 113., 113., 113., 113., 113., 113., 113., 113.,
+ 112., 111., 110., 109.])
+
__init__(fd, window_size=None, max_rul=None, percent_broken=None, percent_fail_runs=None, truncate_val=False, run_split_dist=None, first_time_to_predict=None, norm_rul=False, truncate_degraded_only=False)
+
+Create a new XJTU-SY reader for one of the sub-datasets. By default, the RUL +values are not capped. The default window size is 32768.
+Use first_time_to_predict
to set an individual RUL inflection point for
+each run. It should be a list with an integer index for each run. The index
+is the time step after which RUL declines. Before the time step it stays
+constant. The norm_rul
argument can then be used to scale the RUL of each
+run between zero and one.
For more information about using readers, refer to the reader module page.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
fd |
+
+ int
+ |
+ Index of the selected sub-dataset |
+ + required + | +
window_size |
+
+ Optional[int]
+ |
+ Size of the sliding window. Defaults to 32768. |
+
+ None
+ |
+
max_rul |
+
+ Optional[int]
+ |
+ Maximum RUL value of targets. |
+
+ None
+ |
+
percent_broken |
+
+ Optional[float]
+ |
+ The maximum relative degradation per time series. |
+
+ None
+ |
+
percent_fail_runs |
+
+ Optional[Union[float, List[int]]]
+ |
+ The percentage or index list of available time series. |
+
+ None
+ |
+
truncate_val |
+
+ bool
+ |
+ Truncate the validation data with percent_broken, too. |
+
+ False
+ |
+
run_split_dist |
+
+ Optional[Dict[str, List[int]]]
+ |
+ Dictionary that assigns each run idx to each split. |
+
+ None
+ |
+
first_time_to_predict |
+
+ Optional[List[int]]
+ |
+ The time step for each time series before which RUL + is constant. |
+
+ None
+ |
+
norm_rul |
+
+ bool
+ |
+ Normalize RUL between zero and one. |
+
+ False
+ |
+
truncate_degraded_only |
+
+ bool
+ |
+ Only truncate the degraded part of the data + (< max RUL). |
+
+ False
+ |
+
fds()
+
+
+ property
+
+
+Indices of available sub-datasets.
+ +prepare_data()
+
+Prepare the XJTU-SY dataset. This function needs to be called before using the +dataset and each custom split for the first time.
+The dataset is downloaded from a custom mirror and extracted into the data +root directory. The whole dataset is converted from CSV files to NPY files to +speed up loading it from disk. Afterwards, a scaler is fit on the development +features. Previously completed steps are skipped.
+ +A module with higher-order data modules for semi-supervised learning.
+ + + +SemiSupervisedDataModule
+
+
+
+ Bases: pl.LightningDataModule
A higher-order data module used for +semi-supervised learning with a labeled data module and an unlabeled one. It +makes sure that both data modules come from the same sub-dataset.
+ +Examples:
+>>> import rul_datasets
+>>> fd1 = rul_datasets.CmapssReader(fd=1, window_size=20, percent_fail_runs=0.5)
+>>> fd1_complement = fd1.get_complement(percent_broken=0.8)
+>>> labeled = rul_datasets.RulDataModule(fd1, 32)
+>>> unlabeled = rul_datasets.RulDataModule(fd1_complement, 32)
+>>> dm = rul_datasets.SemiSupervisedDataModule(labeled, unlabeled)
+>>> dm.prepare_data()
+>>> dm.setup()
+>>> train_ssl = dm.train_dataloader()
+>>> val = dm.val_dataloader()
+>>> test = dm.test_dataloader()
+
__init__(labeled, unlabeled)
+
+Create a new semi-supervised data module from a labeled and unlabeled +RulDataModule.
+Both data modules are checked for compatibility (see RulDataModule). These
+checks include that the fd
match between them.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
labeled |
+
+ RulDataModule
+ |
+ The data module of the labeled dataset. |
+ + required + | +
unlabeled |
+
+ RulDataModule
+ |
+ The data module of the unlabeled dataset. |
+ + required + | +
prepare_data(*args, **kwargs)
+
+Download and pre-process the underlying data.
+This calls the prepare_data
function for the labeled and the unlabeled data module. All
+previously completed preparation steps are skipped. It is called
+automatically by pytorch_lightning
and executed on the first GPU in
+distributed mode.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Passed down to each data module's prepare_data function. |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Passed down to each data module's prepare_data function. |
+
+ {}
+ |
+
setup(stage=None)
+
+Load labeled and unlabeled data into memory.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
stage |
+
+ Optional[str]
+ |
+ Passed down to each data module's setup function. |
+
+ None
+ |
+
test_dataloader(*args, **kwargs)
+
+Create a data loader of the labeled test data.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataLoader
+ |
+ The labeled test data loader. |
+
train_dataloader(*args, **kwargs)
+
+Create a data loader of an AdaptionDataset using labeled and unlabeled.
+The data loader is configured to shuffle the data. The pin_memory
option is
+activated to achieve maximum transfer speed to the GPU.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataLoader
+ |
+ The training data loader |
+
val_dataloader(*args, **kwargs)
+
+Create a data loader of the labeled validation data.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*args |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ ()
+ |
+
**kwargs |
+
+ Any
+ |
+ Ignored. Only for adhering to parent class interface. |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataLoader
+ |
+ The labeled validation data loader. |
+
extract_windows(seq, window_size)
+
+Extract sliding windows from a sequence.
+The step size is considered to be one, which results in len(seq) - window_size +
+1
extracted windows. The resulting array has the shape [num_windows, window_size,
+num_channels].
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
seq |
+
+ np.ndarray
+ |
+ sequence to extract windows from |
+ + required + | +
window_size |
+
+ int
+ |
+ length of the sliding window |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ np.ndarray
+ |
+ array of sliding windows |
+
get_files_in_path(path, condition=None)
+
+Return the paths of all files in a path that satisfy a condition in alphabetical +order.
+If the condition is None
all files are returned.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
path |
+
+ str
+ |
+ the path to look into |
+ + required + | +
condition |
+
+ Optional[Callable]
+ |
+ the include-condition for files |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ List[str]
+ |
+ all files that satisfy the condition in alphabetical order |
+
get_targets_from_file_paths(file_paths, timestep_from_file_path)
+
+Create the RUL targets based on the file paths of the feature files.
+The function extracts the feature file path from each path. The supplied +conversion function extracts the time step from it. Afterwards the RUL is +calculated by subtracting each time step from the maximum time step plus 1.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
file_paths |
+
+ Dict[int, List[str]]
+ |
+ runs represented as dict of feature file paths |
+ + required + | +
timestep_from_file_path |
+
+ Callable
+ |
+ Function to convert a feature file path to a time step |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Dict[int, np.ndarray]
+ |
+ A dict of RUL target arrays, one for each run |
+
\n {translation(\"search.result.term.missing\")}: {...missing}\n
\n }\n