feat: make readers output numpy arrays (#7)
* feat: make readers output numpy arrays

* docs: update
tilman151 authored Nov 17, 2022
1 parent a6e3c9f commit 0b2cdd9
Showing 14 changed files with 113 additions and 117 deletions.
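At a glance: this commit changes the return type of `load_split` from channel-first torch tensors to channel-last numpy arrays. A minimal before/after sketch, using only the CMAPSS FD1 shapes that appear in the diffs below:

```python
import rul_datasets

fd1 = rul_datasets.CmapssReader(fd=1)
fd1.prepare_data()  # download and pre-process, if not done already
features, targets = fd1.load_split("dev")

# Before this commit: features is a list of channel-first torch.Tensor,
# e.g. features[0].shape == torch.Size([163, 14, 30])  # [num_windows, num_channels, window_size]

# After this commit: features is a list of channel-last numpy arrays,
# e.g. features[0].shape == (163, 30, 14)  # [num_windows, window_size, num_channels]
```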
6 changes: 3 additions & 3 deletions docs/use_cases/libraries.md
@@ -110,12 +110,12 @@ import rul_datasets
cmapss_fd1 = rul_datasets.CmapssReader(fd=1)
cmapss_fd1.prepare_data() # (1)!
dev_features, _ = cmapss_fd1.load_split("dev") # (2)!
-dev_data = np.concatenate([np.transpose(f.numpy(), (0, 2, 1)) for f in dev_features]) # (3)!
+dev_data = np.concatenate(dev_features) # (3)!

km = tslearn.clustering.TimeSeriesKMeans(n_clusters=5, metric="dtw")
km.fit(dev_data)
```

1. You need to call `prepare_data` before using the reader. This downloads and pre-processes the dataset if not done already.
-2. This yields a list of tensors with the shape `[len_time_series, num_features, window_size]`.
-3. Convert the list of tensors to a single numpy array with the shape `[num_series, window_size, num_features]`.
+2. This yields a list of numpy arrays with the shape `[len_time_series, window_size, num_features]`.
+3. Concatenate to a single numpy array with the shape `[num_series, window_size, num_features]`.
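Assembled, the updated docs snippet reads roughly as follows; the `numpy` and `tslearn` imports are implied by the surrounding docs page rather than shown in this hunk:

```python
import numpy as np
import tslearn.clustering

import rul_datasets

cmapss_fd1 = rul_datasets.CmapssReader(fd=1)
cmapss_fd1.prepare_data()  # (1) downloads and pre-processes the dataset if needed
dev_features, _ = cmapss_fd1.load_split("dev")  # (2) arrays of [len_time_series, window_size, num_features]
dev_data = np.concatenate(dev_features)  # (3) one [num_series, window_size, num_features] array

# tslearn expects channel-last input, so no transpose is needed anymore
km = tslearn.clustering.TimeSeriesKMeans(n_clusters=5, metric="dtw")
km.fit(dev_data)
```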
30 changes: 14 additions & 16 deletions rul_datasets/core.py
@@ -209,17 +209,18 @@ def _setup_split(self, split: str) -> Tuple[torch.Tensor, torch.Tensor]:
features, targets = self.reader.load_split(split)
if features:
features, targets = self._apply_feature_extractor_per_run(features, targets)
-cat_features = torch.cat(features)
-cat_targets = torch.cat(targets)
+tensor_features, tensor_targets = utils.to_tensor(features, targets)
+cat_features = torch.cat(tensor_features)
+cat_targets = torch.cat(tensor_targets)
else:
cat_features = torch.empty(0, 0, 0)
cat_targets = torch.empty(0)

return cat_features, cat_targets

def _apply_feature_extractor_per_run(
-self, features: List[torch.Tensor], targets: List[torch.Tensor]
-) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+self, features: List[np.ndarray], targets: List[np.ndarray]
+) -> Tuple[List[np.ndarray], List[np.ndarray]]:
if self.feature_extractor is not None and self.window_size is not None:
cutoff = self.window_size - 1
features = [self._apply_feature_extractor(f) for f in features]
@@ -228,12 +229,9 @@ def _apply_feature_extractor_per_run

return features, targets

-def _apply_feature_extractor(self, features: torch.Tensor) -> torch.Tensor:
-    dtype = features.dtype
-    numpy_features = torch.permute(features, (0, 2, 1)).numpy()
-    extracted = self.feature_extractor(numpy_features)  # type: ignore
-    extracted = utils.extract_windows(extracted, self.window_size)  # type: ignore
-    features = utils.feature_to_tensor(extracted, dtype)
+def _apply_feature_extractor(self, features: np.ndarray) -> np.ndarray:
+    features = self.feature_extractor(features)  # type: ignore
+    features = utils.extract_windows(features, self.window_size)  # type: ignore

return features

@@ -323,7 +321,7 @@ def to_dataset(self, split: str) -> TensorDataset:


class PairedRulDataset(IterableDataset):
"""TODO."""
"""A dataset of sample pairs drawn from the same time series."""

def __init__(
self,
@@ -347,8 +345,8 @@ def __init__(
reader.check_compatibility(self.readers[0])

self._run_domain_idx: np.ndarray
-self._features: List[torch.Tensor]
-self._labels: List[torch.Tensor]
+self._features: List[np.ndarray]
+self._labels: List[np.ndarray]
self._prepare_datasets()

self._max_rul = self._get_max_rul()
@@ -480,14 +478,14 @@ def _get_labeled_pair_idx(self) -> Tuple[int, int, int, int, int]:

def _build_pair(
self,
-run: torch.Tensor,
+run: np.ndarray,
anchor_idx: int,
query_idx: int,
distance: int,
domain_label: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-anchors = run[anchor_idx]
-queries = run[query_idx]
+anchors = utils.feature_to_tensor(run[anchor_idx], torch.float)
+queries = utils.feature_to_tensor(run[query_idx], torch.float)
domain_tensor = torch.tensor(domain_label, dtype=torch.float)
distances = torch.tensor(distance, dtype=torch.float) / self._max_rul
distances = torch.clamp_max(distances, max=1) # max distance is max_rul
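`_apply_feature_extractor` no longer round-trips through tensors: it applies the extractor to the numpy windows and re-windows the result with `utils.extract_windows`. The implementation of that helper is not part of this diff; a sketch under the assumption that it slides a window over the first axis:

```python
import numpy as np

def extract_windows_sketch(features: np.ndarray, window_size: int) -> np.ndarray:
    """Assumed semantics of utils.extract_windows: turn [num_steps, num_channels]
    into [num_steps - window_size + 1, window_size, num_channels]."""
    windows = np.lib.stride_tricks.sliding_window_view(features, window_size, axis=0)
    # sliding_window_view appends the window axis last; move it before the channels
    return windows.transpose(0, 2, 1)

extracted = np.random.randn(163, 8)  # hypothetical extractor output: one feature vector per step
rewindowed = extract_windows_sketch(extracted, window_size=30)
assert rewindowed.shape == (134, 30, 8)
```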
30 changes: 15 additions & 15 deletions rul_datasets/reader/__init__.py
@@ -36,24 +36,24 @@
>>> test_features, test_targets = reader.load_split("test")
```
-The features are a list of [tensors][torch.Tensor] where each tensor has a shape of
-`[num_windows, num_channels, window_size]`:
+The features are a list of [numpy arrays][numpy.ndarray] where each array has a shape of
+`[num_windows, window_size, num_channels]`:
```pycon
>>> type(dev_features)
<class 'list'>
>>> dev_features[0].shape
-torch.Size([163, 14, 30])
+(163, 30, 14)
```
-The targets are a list of [tensors][torch.Tensor], too, where each tensor has a shape
-of `[num_windows]`:
+The targets are a list of [numpy arrays][numpy.ndarray], too, where each array has a
+shape of `[num_windows]`:
```pycon
>>> type(dev_targets)
<class 'list'>
>>> dev_targets[0].shape
-torch.Size([163])
+(163,)
```
Each reader defines a default window size for its data. This can be overridden by the
@@ -63,7 +63,7 @@
>>> fd1 = CmapssReader(fd=1, window_size=15)
>>> features, _ = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([163, 14, 15])
+(163, 15, 14)
```
Some datasets, i.e. CMAPSS, use a piece-wise linear RUL function, where a maximum RUL
@@ -73,8 +73,8 @@
```pycon
>>> fd1 = CmapssReader(fd=1, max_rul=100)
>>> targets = fd1.load_split("dev")
->>> max(torch.max(t) for t in targets)
-tensor(100.)
+>>> max(np.max(t) for t in targets)
+100.0
```
If you want to use a sub-dataset as unlabeled data, e.g. for unsupervised domain
@@ -88,9 +88,9 @@
>>> fd1 = CmapssReader(fd=1, percent_broken=0.8)
>>> features, targets = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([130, 14, 30])
->>> torch.min(targets[0])
-tensor(34.)
+(130, 30, 14)
+>>> np.min(targets[0])
+34.0
```
You may want to apply the same `percent_broken` from your training data to your
@@ -101,8 +101,8 @@
```pycon
>>> fd1 = CmapssReader(fd=1, percent_broken=0.8, truncate_val=True)
>>> features, targets = fd1.load_split("val")
->>> torch.min(targets[0])
-tensor(44.)
+>>> np.min(targets[0])
+44.0
```
Data-driven RUL estimation algorithms are often sensitive to the overall amount of
@@ -149,7 +149,7 @@
```
For more information, see [core][rul_datasets.core] module page or the
-[Use Cases](/rul-datasets/) page.
+[Libraries](/rul-datasets/use_cases/libraries) page.
"""

12 changes: 3 additions & 9 deletions rul_datasets/reader/abstract.py
@@ -5,9 +5,7 @@
from typing import Optional, Union, List, Dict, Any, Iterable, Tuple

import numpy as np
-import torch

-from rul_datasets import utils
from rul_datasets.reader import truncating


@@ -155,17 +153,14 @@ def load_complete_split(
"""
raise NotImplementedError

-def load_split(self, split: str) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+def load_split(self, split: str) -> Tuple[List[np.ndarray], List[np.ndarray]]:
"""
Load a split as tensors and apply truncation to it.
This function loads the scaled features and the targets of a split into
memory. Afterwards, truncation is applied if the `split` is set to `dev`. The
validation set is also truncated with `percent_broken` if `truncate_val` is
-set to `True`. At last, the data is transformed into [tensors][torch.Tensor].
-While converting to them tensors, the axis of the features are transposed to
-`[num_windows, num_channels, window_size]` to fit PyTorch's channel first
-format.
+set to `True`.
Args:
split: The desired split to load.
@@ -182,9 +177,8 @@ def load_split(self, split: str) -> Tuple[List[torch.Tensor], List[torch.Tensor]
features, targets = truncating.truncate_runs(
features, targets, self.percent_broken
)
-tensor_feats, tensor_targets = utils.to_tensor(features, targets)

-return tensor_feats, tensor_targets
+return features, targets

def get_compatible(
self,
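With the conversion removed, `load_split` now returns the scaled and truncated data as-is; the torch conversion happens downstream in `RulDataModule._setup_split` via `utils.to_tensor` (see the `core.py` diff above). A short sketch of the new contract:

```python
import numpy as np
import rul_datasets

reader = rul_datasets.CmapssReader(fd=1)
reader.prepare_data()
features, targets = reader.load_split("dev")

assert isinstance(features[0], np.ndarray)  # no torch.Tensor anywhere
assert features[0].shape[1:] == (30, 14)  # [num_windows, window_size, num_channels]
assert targets[0].shape == (features[0].shape[0],)  # one RUL value per window
```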
4 changes: 2 additions & 2 deletions rul_datasets/reader/cmapss.py
@@ -44,15 +44,15 @@ class CmapssReader(AbstractReader):
>>> fd1.prepare_data()
>>> features, labels = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([163, 14, 30])
+(163, 30, 14)
Custom channels
>>> import rul_datasets
>>> fd1 = rul_datasets.reader.CmapssReader(fd=1, feature_select=[1, 2, 3])
>>> fd1.prepare_data()
>>> features, labels = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([163, 3, 30])
+(163, 30, 3)
"""

_FMT: str = (
4 changes: 2 additions & 2 deletions rul_datasets/reader/femto.py
@@ -44,7 +44,7 @@ class FemtoReader(AbstractReader):
>>> fd1.prepare_data()
>>> features, labels = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([2803, 2, 2560])
+(2803, 2560, 2)
Custom splits:
>>> import rul_datasets
@@ -53,7 +53,7 @@ class FemtoReader(AbstractReader):
>>> fd1.prepare_data()
>>> features, labels = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([2463, 2, 2560])
+(2463, 2560, 2)
"""

_FEMTO_ROOT: str = os.path.join(get_data_root(), "FEMTOBearingDataSet")
4 changes: 2 additions & 2 deletions rul_datasets/reader/xjtu_sy.py
@@ -37,7 +37,7 @@ class XjtuSyReader(AbstractReader):
>>> fd1.prepare_data()
>>> features, labels = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([123, 2, 32768])
+(123, 32768, 2)
Custom splits:
>>> import rul_datasets
@@ -46,7 +46,7 @@ class XjtuSyReader(AbstractReader):
>>> fd1.prepare_data()
>>> features, labels = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([52, 2, 32768])
+(52, 32768, 2)
"""

_XJTU_SY_ROOT: str = os.path.join(get_data_root(), "XJTU-SY")
5 changes: 4 additions & 1 deletion rul_datasets/utils.py
@@ -105,4 +105,7 @@ def to_tensor(


def feature_to_tensor(features: np.ndarray, dtype: torch.dtype) -> torch.Tensor:
-return torch.tensor(features, dtype=dtype).permute(0, 2, 1)
+if len(features.shape) == 2:
+    return torch.tensor(features, dtype=dtype).permute(1, 0)
+else:
+    return torch.tensor(features, dtype=dtype).permute(0, 2, 1)
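The new two-dimensional branch lets `feature_to_tensor` handle a single window as well, which is what `_build_pair` in `core.py` now passes as `run[anchor_idx]`. A small demonstration of both branches, with shapes matching the rest of this diff:

```python
import numpy as np
import torch

from rul_datasets.utils import feature_to_tensor

# A single window, e.g. run[anchor_idx] in PairedRulDataset._build_pair:
window = np.zeros((30, 14))  # [window_size, num_channels]
assert feature_to_tensor(window, torch.float).shape == (14, 30)

# A whole run, as the readers now return it:
run = np.zeros((163, 30, 14))  # [num_windows, window_size, num_channels]
assert feature_to_tensor(run, torch.float).shape == (163, 14, 30)
```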
23 changes: 12 additions & 11 deletions tests/reader/test_cmapss.py
@@ -1,3 +1,4 @@
+import numpy as np
import numpy.testing as npt
import pytest
import torch
@@ -29,11 +30,11 @@ def _check_split(self, rul_loader, split, window_size):
self._assert_run_correct(run, run_target, window_size)

def _assert_run_correct(self, run, run_target, win):
-assert win == run.shape[2]
-assert self.NUM_CHANNELS == run.shape[1]
+assert win == run.shape[1]
+assert self.NUM_CHANNELS == run.shape[2]
assert len(run) == len(run_target)
-assert torch.float32 == run.dtype
-assert torch.float32 == run_target.dtype
+assert np.float == run.dtype
+assert np.float == run_target.dtype

@pytest.mark.parametrize(
("fd", "window_size"), [(1, 30), (2, 20), (3, 30), (4, 15)]
@@ -52,7 +53,7 @@ def test_feature_select(self):
for split in ["dev", "val", "test"]:
features, _ = dataset.load_split(split)
for run in features:
-assert 7 == run.shape[1]
+assert 7 == run.shape[2]

def test_prepare_data_not_called_for_feature_select(self):
dataset = reader.CmapssReader(1, feature_select=[4])
@@ -64,15 +65,15 @@ def test_normalization_min_max(self, fd):
full_dataset = reader.CmapssReader(fd)
full_dev, full_dev_targets = full_dataset.load_split("dev")

-npt.assert_almost_equal(max(torch.max(r).item() for r in full_dev), 1.0)
-npt.assert_almost_equal(min(torch.min(r).item() for r in full_dev), -1.0)
+npt.assert_almost_equal(max(np.max(r) for r in full_dev), 1.0)
+npt.assert_almost_equal(min(np.min(r) for r in full_dev), -1.0)

trunc_dataset = reader.CmapssReader(fd, percent_fail_runs=0.8)
trunc_dev, _ = trunc_dataset.load_split("dev")
-assert max(torch.max(r).item() for r in trunc_dev) <= 1.0
-assert min(torch.min(r).item() for r in trunc_dev) >= -1.0
+assert np.round(max(np.max(r).item() for r in trunc_dev), decimals=7) <= 1.0
+assert np.round(min(np.min(r).item() for r in trunc_dev), decimals=7) >= -1.0

trunc_dataset = reader.CmapssReader(fd, percent_broken=0.2)
trunc_dev, _ = trunc_dataset.load_split("dev")
-assert max(torch.max(r).item() for r in trunc_dev) <= 1.0
-assert min(torch.min(r).item() for r in trunc_dev) >= -1.0
+assert np.round(max(np.max(r).item() for r in trunc_dev), decimals=7) <= 1.0
+assert np.round(min(np.min(r).item() for r in trunc_dev), decimals=7) >= -1.0
19 changes: 9 additions & 10 deletions tests/reader/test_femto.py
@@ -26,46 +26,45 @@ def test_run_shape_and_dtype(self, fd, window_size, split):
self._assert_run_correct(run, run_target, window_size)

def _assert_run_correct(self, run, run_target, win):
-assert win == run.shape[2]
-assert self.NUM_CHANNELS == run.shape[1]
+assert win == run.shape[1]
+assert self.NUM_CHANNELS == run.shape[2]
assert len(run) == len(run_target)
-assert torch.float32 == run.dtype
-assert torch.float32 == run_target.dtype
+assert np.float64 == run.dtype
+assert np.float64 == run_target.dtype

def test_standardization(self):
for i in range(1, 3):
full_dataset = reader.FemtoReader(fd=i)
full_train, full_train_targets = full_dataset.load_split("dev")

npt.assert_almost_equal(
-0.0, torch.mean(torch.cat(full_train)).item(), decimal=3
+0.0, np.mean(np.concatenate(full_train)).item(), decimal=3
)
npt.assert_almost_equal(
-1.0, torch.std(torch.cat(full_train)).item(), decimal=3
+1.0, np.std(np.concatenate(full_train)).item(), decimal=3
)

truncated_dataset = reader.FemtoReader(fd=i, percent_fail_runs=0.8)
trunc_train, trunc_train_targets = truncated_dataset.load_split("dev")
npt.assert_almost_equal(
-0.0, torch.mean(torch.cat(trunc_train)).item(), decimal=2
+0.0, np.mean(np.concatenate(trunc_train)).item(), decimal=2
)
npt.assert_almost_equal(
-1.0, torch.std(torch.cat(trunc_train)).item(), decimal=1
+1.0, np.std(np.concatenate(trunc_train)).item(), decimal=1
)

# percent_broken is supposed to change the std but not the mean
truncated_dataset = reader.FemtoReader(fd=i, percent_broken=0.2)
trunc_train, trunc_train_targets = truncated_dataset.load_split("dev")
npt.assert_almost_equal(
-0.0, torch.mean(torch.cat(trunc_train)).item(), decimal=1
+0.0, np.mean(np.concatenate(trunc_train)).item(), decimal=1
)

@pytest.mark.parametrize("max_rul", [125, None])
def test_max_rul(self, max_rul):
dataset = reader.FemtoReader(fd=1, max_rul=max_rul)
_, targets = dataset.load_split("dev")
for t in targets:
-t = t.numpy()
if max_rul is None:
npt.assert_equal(t, np.arange(len(t), 0, -1)) # is linear
else: