feat: make readers output numpy arrays (#7)
* feat: make readers output numpy arrays

* docs: update
tilman151 authored Nov 17, 2022
1 parent a6e3c9f commit 0b2cdd9
Showing 14 changed files with 113 additions and 117 deletions.
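At a glance: this commit changes the return type of `load_split` from channel-first torch tensors to channel-last numpy arrays. A minimal before/after sketch, using only the CMAPSS FD1 shapes that appear in the diffs below:

```python
import rul_datasets

fd1 = rul_datasets.CmapssReader(fd=1)
fd1.prepare_data()  # download and pre-process, if not done already
features, targets = fd1.load_split("dev")

# Before this commit: features is a list of channel-first torch.Tensor,
# e.g. features[0].shape == torch.Size([163, 14, 30])  # [num_windows, num_channels, window_size]

# After this commit: features is a list of channel-last numpy arrays,
# e.g. features[0].shape == (163, 30, 14)  # [num_windows, window_size, num_channels]
```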
6 changes: 3 additions & 3 deletions docs/use_cases/libraries.md
@@ -110,12 +110,12 @@ import rul_datasets
cmapss_fd1 = rul_datasets.CmapssReader(fd=1)
cmapss_fd1.prepare_data() # (1)!
dev_features, _ = cmapss_fd1.load_split("dev") # (2)!
-dev_data = np.concatenate([np.transpose(f.numpy(), (0, 2, 1)) for f in dev_features]) # (3)!
+dev_data = np.concatenate(dev_features) # (3)!

km = tslearn.clustering.TimeSeriesKMeans(n_clusters=5, metric="dtw")
km.fit(dev_data)
```

1. You need to call `prepare_data` before using the reader. This downloads and pre-processes the dataset if not done already.
-2. This yields a list of tensors with the shape `[len_time_series, num_features, window_size]`.
-3. Convert the list of tensors to a single numpy array with the shape `[num_series, window_size, num_features]`.
+2. This yields a list of numpy arrays with the shape `[len_time_series, window_size, num_features]`.
+3. Concatenate to a single numpy array with the shape `[num_series, window_size, num_features]`.
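Assembled, the updated docs snippet reads roughly as follows; the `numpy` and `tslearn` imports are implied by the surrounding docs page rather than shown in this hunk:

```python
import numpy as np
import tslearn.clustering

import rul_datasets

cmapss_fd1 = rul_datasets.CmapssReader(fd=1)
cmapss_fd1.prepare_data()  # (1) downloads and pre-processes the dataset if needed
dev_features, _ = cmapss_fd1.load_split("dev")  # (2) arrays of [len_time_series, window_size, num_features]
dev_data = np.concatenate(dev_features)  # (3) one [num_series, window_size, num_features] array

# tslearn expects channel-last input, so no transpose is needed anymore
km = tslearn.clustering.TimeSeriesKMeans(n_clusters=5, metric="dtw")
km.fit(dev_data)
```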
30 changes: 14 additions & 16 deletions rul_datasets/core.py
@@ -209,17 +209,18 @@ def _setup_split(self, split: str) -> Tuple[torch.Tensor, torch.Tensor]:
features, targets = self.reader.load_split(split)
if features:
features, targets = self._apply_feature_extractor_per_run(features, targets)
-cat_features = torch.cat(features)
-cat_targets = torch.cat(targets)
+tensor_features, tensor_targets = utils.to_tensor(features, targets)
+cat_features = torch.cat(tensor_features)
+cat_targets = torch.cat(tensor_targets)
else:
cat_features = torch.empty(0, 0, 0)
cat_targets = torch.empty(0)

return cat_features, cat_targets

def _apply_feature_extractor_per_run(
-self, features: List[torch.Tensor], targets: List[torch.Tensor]
-) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+self, features: List[np.ndarray], targets: List[np.ndarray]
+) -> Tuple[List[np.ndarray], List[np.ndarray]]:
if self.feature_extractor is not None and self.window_size is not None:
cutoff = self.window_size - 1
features = [self._apply_feature_extractor(f) for f in features]
@@ -228,12 +229,9 @@ def _apply_feature_extractor_per_run

return features, targets

-def _apply_feature_extractor(self, features: torch.Tensor) -> torch.Tensor:
-    dtype = features.dtype
-    numpy_features = torch.permute(features, (0, 2, 1)).numpy()
-    extracted = self.feature_extractor(numpy_features)  # type: ignore
-    extracted = utils.extract_windows(extracted, self.window_size)  # type: ignore
-    features = utils.feature_to_tensor(extracted, dtype)
+def _apply_feature_extractor(self, features: np.ndarray) -> np.ndarray:
+    features = self.feature_extractor(features)  # type: ignore
+    features = utils.extract_windows(features, self.window_size)  # type: ignore

return features

@@ -323,7 +321,7 @@ def to_dataset(self, split: str) -> TensorDataset:


class PairedRulDataset(IterableDataset):
"""TODO."""
"""A dataset of sample pairs drawn from the same time series."""

def __init__(
self,
@@ -347,8 +345,8 @@ def __init__(
reader.check_compatibility(self.readers[0])

self._run_domain_idx: np.ndarray
-self._features: List[torch.Tensor]
-self._labels: List[torch.Tensor]
+self._features: List[np.ndarray]
+self._labels: List[np.ndarray]
self._prepare_datasets()

self._max_rul = self._get_max_rul()
@@ -480,14 +478,14 @@ def _get_labeled_pair_idx(self) -> Tuple[int, int, int, int, int]:

def _build_pair(
self,
-run: torch.Tensor,
+run: np.ndarray,
anchor_idx: int,
query_idx: int,
distance: int,
domain_label: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-anchors = run[anchor_idx]
-queries = run[query_idx]
+anchors = utils.feature_to_tensor(run[anchor_idx], torch.float)
+queries = utils.feature_to_tensor(run[query_idx], torch.float)
domain_tensor = torch.tensor(domain_label, dtype=torch.float)
distances = torch.tensor(distance, dtype=torch.float) / self._max_rul
distances = torch.clamp_max(distances, max=1) # max distance is max_rul
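`_apply_feature_extractor` no longer round-trips through tensors: it applies the extractor to the numpy windows and re-windows the result with `utils.extract_windows`. The implementation of that helper is not part of this diff; a sketch under the assumption that it slides a window over the first axis:

```python
import numpy as np

def extract_windows_sketch(features: np.ndarray, window_size: int) -> np.ndarray:
    """Assumed semantics of utils.extract_windows: turn [num_steps, num_channels]
    into [num_steps - window_size + 1, window_size, num_channels]."""
    windows = np.lib.stride_tricks.sliding_window_view(features, window_size, axis=0)
    # sliding_window_view appends the window axis last; move it before the channels
    return windows.transpose(0, 2, 1)

extracted = np.random.randn(163, 8)  # hypothetical extractor output: one feature vector per step
rewindowed = extract_windows_sketch(extracted, window_size=30)
assert rewindowed.shape == (134, 30, 8)
```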
30 changes: 15 additions & 15 deletions rul_datasets/reader/__init__.py
@@ -36,24 +36,24 @@
>>> test_features, test_targets = reader.load_split("test")
```
-The features are a list of [tensors][torch.Tensor] where each tensor has a shape of
-`[num_windows, num_channels, window_size]`:
+The features are a list of [numpy arrays][numpy.ndarray] where each array has a shape of
+`[num_windows, window_size, num_channels]`:
```pycon
>>> type(dev_features)
<class 'list'>
>>> dev_features[0].shape
-torch.Size([163, 14, 30])
+(163, 30, 14)
```
-The targets are a list of [tensors][torch.Tensor], too, where each tensor has a shape
-of `[num_windows]`:
+The targets are a list of [numpy arrays][numpy.ndarray], too, where each array has a
+shape of `[num_windows]`:
```pycon
>>> type(dev_targets)
<class 'list'>
>>> dev_targets[0].shape
-torch.Size([163])
+(163,)
```
Each reader defines a default window size for its data. This can be overridden by the
@@ -63,7 +63,7 @@
>>> fd1 = CmapssReader(fd=1, window_size=15)
>>> features, _ = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([163, 14, 15])
+(163, 15, 14)
```
Some datasets, i.e. CMAPSS, use a piece-wise linear RUL function, where a maximum RUL
@@ -73,8 +73,8 @@
```pycon
>>> fd1 = CmapssReader(fd=1, max_rul=100)
>>> targets = fd1.load_split("dev")
->>> max(torch.max(t) for t in targets)
-tensor(100.)
+>>> max(np.max(t) for t in targets)
+100.0
```
If you want to use a sub-dataset as unlabeled data, e.g. for unsupervised domain
@@ -88,9 +88,9 @@
>>> fd1 = CmapssReader(fd=1, percent_broken=0.8)
>>> features, targets = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([130, 14, 30])
->>> torch.min(targets[0])
-tensor(34.)
+(130, 30, 14)
+>>> np.min(targets[0])
+34.0
```
You may want to apply the same `percent_broken` from your training data to your
@@ -101,8 +101,8 @@
```pycon
>>> fd1 = CmapssReader(fd=1, percent_broken=0.8, truncate_val=True)
>>> features, targets = fd1.load_split("val")
->>> torch.min(targets[0])
-tensor(44.)
+>>> np.min(targets[0])
+44.0
```
Data-driven RUL estimation algorithms are often sensitive to the overall amount of
@@ -149,7 +149,7 @@
```
For more information, see [core][rul_datasets.core] module page or the
-[Use Cases](/rul-datasets/) page.
+[Libraries](/rul-datasets/use_cases/libraries) page.
"""

12 changes: 3 additions & 9 deletions rul_datasets/reader/abstract.py
@@ -5,9 +5,7 @@
from typing import Optional, Union, List, Dict, Any, Iterable, Tuple

import numpy as np
-import torch

-from rul_datasets import utils
from rul_datasets.reader import truncating


@@ -155,17 +153,14 @@ def load_complete_split(
"""
raise NotImplementedError

-def load_split(self, split: str) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+def load_split(self, split: str) -> Tuple[List[np.ndarray], List[np.ndarray]]:
"""
Load a split as tensors and apply truncation to it.
This function loads the scaled features and the targets of a split into
memory. Afterwards, truncation is applied if the `split` is set to `dev`. The
validation set is also truncated with `percent_broken` if `truncate_val` is
-set to `True`. At last, the data is transformed into [tensors][torch.Tensor].
-While converting to them tensors, the axis of the features are transposed to
-`[num_windows, num_channels, window_size]` to fit PyTorch's channel first
-format.
+set to `True`.
Args:
split: The desired split to load.
@@ -182,9 +177,8 @@ def load_split(self, split: str) -> Tuple[List[torch.Tensor], List[torch.Tensor]
features, targets = truncating.truncate_runs(
features, targets, self.percent_broken
)
-tensor_feats, tensor_targets = utils.to_tensor(features, targets)

-return tensor_feats, tensor_targets
+return features, targets

def get_compatible(
self,
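With the conversion removed, `load_split` now returns the scaled and truncated data as-is; the torch conversion happens downstream in `RulDataModule._setup_split` via `utils.to_tensor` (see the `core.py` diff above). A short sketch of the new contract:

```python
import numpy as np
import rul_datasets

reader = rul_datasets.CmapssReader(fd=1)
reader.prepare_data()
features, targets = reader.load_split("dev")

assert isinstance(features[0], np.ndarray)  # no torch.Tensor anywhere
assert features[0].shape[1:] == (30, 14)  # [num_windows, window_size, num_channels]
assert targets[0].shape == (features[0].shape[0],)  # one RUL value per window
```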
4 changes: 2 additions & 2 deletions rul_datasets/reader/cmapss.py
@@ -44,15 +44,15 @@ class CmapssReader(AbstractReader):
>>> fd1.prepare_data()
>>> features, labels = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([163, 14, 30])
+(163, 30, 14)
Custom channels
>>> import rul_datasets
>>> fd1 = rul_datasets.reader.CmapssReader(fd=1, feature_select=[1, 2, 3])
>>> fd1.prepare_data()
>>> features, labels = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([163, 3, 30])
+(163, 30, 3)
"""

_FMT: str = (
4 changes: 2 additions & 2 deletions rul_datasets/reader/femto.py
@@ -44,7 +44,7 @@ class FemtoReader(AbstractReader):
>>> fd1.prepare_data()
>>> features, labels = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([2803, 2, 2560])
+(2803, 2560, 2)
Custom splits:
>>> import rul_datasets
@@ -53,7 +53,7 @@ class FemtoReader(AbstractReader):
>>> fd1.prepare_data()
>>> features, labels = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([2463, 2, 2560])
+(2463, 2560, 2)
"""

_FEMTO_ROOT: str = os.path.join(get_data_root(), "FEMTOBearingDataSet")
4 changes: 2 additions & 2 deletions rul_datasets/reader/xjtu_sy.py
@@ -37,7 +37,7 @@ class XjtuSyReader(AbstractReader):
>>> fd1.prepare_data()
>>> features, labels = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([123, 2, 32768])
+(123, 32768, 2)
Custom splits:
>>> import rul_datasets
@@ -46,7 +46,7 @@ class XjtuSyReader(AbstractReader):
>>> fd1.prepare_data()
>>> features, labels = fd1.load_split("dev")
>>> features[0].shape
-torch.Size([52, 2, 32768])
+(52, 32768, 2)
"""

_XJTU_SY_ROOT: str = os.path.join(get_data_root(), "XJTU-SY")
5 changes: 4 additions & 1 deletion rul_datasets/utils.py
@@ -105,4 +105,7 @@ def to_tensor(


def feature_to_tensor(features: np.ndarray, dtype: torch.dtype) -> torch.Tensor:
-return torch.tensor(features, dtype=dtype).permute(0, 2, 1)
+if len(features.shape) == 2:
+    return torch.tensor(features, dtype=dtype).permute(1, 0)
+else:
+    return torch.tensor(features, dtype=dtype).permute(0, 2, 1)
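The new two-dimensional branch lets `feature_to_tensor` handle a single window as well, which is what `_build_pair` in `core.py` now passes as `run[anchor_idx]`. A small demonstration of both branches, with shapes matching the rest of this diff:

```python
import numpy as np
import torch

from rul_datasets.utils import feature_to_tensor

# A single window, e.g. run[anchor_idx] in PairedRulDataset._build_pair:
window = np.zeros((30, 14))  # [window_size, num_channels]
assert feature_to_tensor(window, torch.float).shape == (14, 30)

# A whole run, as the readers now return it:
run = np.zeros((163, 30, 14))  # [num_windows, window_size, num_channels]
assert feature_to_tensor(run, torch.float).shape == (163, 14, 30)
```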
23 changes: 12 additions & 11 deletions tests/reader/test_cmapss.py
@@ -1,3 +1,4 @@
+import numpy as np
import numpy.testing as npt
import pytest
import torch
@@ -29,11 +30,11 @@ def _check_split(self, rul_loader, split, window_size):
self._assert_run_correct(run, run_target, window_size)

def _assert_run_correct(self, run, run_target, win):
-assert win == run.shape[2]
-assert self.NUM_CHANNELS == run.shape[1]
+assert win == run.shape[1]
+assert self.NUM_CHANNELS == run.shape[2]
assert len(run) == len(run_target)
-assert torch.float32 == run.dtype
-assert torch.float32 == run_target.dtype
+assert np.float == run.dtype
+assert np.float == run_target.dtype

@pytest.mark.parametrize(
("fd", "window_size"), [(1, 30), (2, 20), (3, 30), (4, 15)]
@@ -52,7 +53,7 @@ def test_feature_select(self):
for split in ["dev", "val", "test"]:
features, _ = dataset.load_split(split)
for run in features:
-assert 7 == run.shape[1]
+assert 7 == run.shape[2]

def test_prepare_data_not_called_for_feature_select(self):
dataset = reader.CmapssReader(1, feature_select=[4])
@@ -64,15 +65,15 @@ def test_normalization_min_max(self, fd):
full_dataset = reader.CmapssReader(fd)
full_dev, full_dev_targets = full_dataset.load_split("dev")

-npt.assert_almost_equal(max(torch.max(r).item() for r in full_dev), 1.0)
-npt.assert_almost_equal(min(torch.min(r).item() for r in full_dev), -1.0)
+npt.assert_almost_equal(max(np.max(r) for r in full_dev), 1.0)
+npt.assert_almost_equal(min(np.min(r) for r in full_dev), -1.0)

trunc_dataset = reader.CmapssReader(fd, percent_fail_runs=0.8)
trunc_dev, _ = trunc_dataset.load_split("dev")
-assert max(torch.max(r).item() for r in trunc_dev) <= 1.0
-assert min(torch.min(r).item() for r in trunc_dev) >= -1.0
+assert np.round(max(np.max(r).item() for r in trunc_dev), decimals=7) <= 1.0
+assert np.round(min(np.min(r).item() for r in trunc_dev), decimals=7) >= -1.0

trunc_dataset = reader.CmapssReader(fd, percent_broken=0.2)
trunc_dev, _ = trunc_dataset.load_split("dev")
-assert max(torch.max(r).item() for r in trunc_dev) <= 1.0
-assert min(torch.min(r).item() for r in trunc_dev) >= -1.0
+assert np.round(max(np.max(r).item() for r in trunc_dev), decimals=7) <= 1.0
+assert np.round(min(np.min(r).item() for r in trunc_dev), decimals=7) >= -1.0
19 changes: 9 additions & 10 deletions tests/reader/test_femto.py
@@ -26,46 +26,45 @@ def test_run_shape_and_dtype(self, fd, window_size, split):
self._assert_run_correct(run, run_target, window_size)

def _assert_run_correct(self, run, run_target, win):
-assert win == run.shape[2]
-assert self.NUM_CHANNELS == run.shape[1]
+assert win == run.shape[1]
+assert self.NUM_CHANNELS == run.shape[2]
assert len(run) == len(run_target)
-assert torch.float32 == run.dtype
-assert torch.float32 == run_target.dtype
+assert np.float64 == run.dtype
+assert np.float64 == run_target.dtype

def test_standardization(self):
for i in range(1, 3):
full_dataset = reader.FemtoReader(fd=i)
full_train, full_train_targets = full_dataset.load_split("dev")

npt.assert_almost_equal(
-0.0, torch.mean(torch.cat(full_train)).item(), decimal=3
+0.0, np.mean(np.concatenate(full_train)).item(), decimal=3
)
npt.assert_almost_equal(
-1.0, torch.std(torch.cat(full_train)).item(), decimal=3
+1.0, np.std(np.concatenate(full_train)).item(), decimal=3
)

truncated_dataset = reader.FemtoReader(fd=i, percent_fail_runs=0.8)
trunc_train, trunc_train_targets = truncated_dataset.load_split("dev")
npt.assert_almost_equal(
-0.0, torch.mean(torch.cat(trunc_train)).item(), decimal=2
+0.0, np.mean(np.concatenate(trunc_train)).item(), decimal=2
)
npt.assert_almost_equal(
-1.0, torch.std(torch.cat(trunc_train)).item(), decimal=1
+1.0, np.std(np.concatenate(trunc_train)).item(), decimal=1
)

# percent_broken is supposed to change the std but not the mean
truncated_dataset = reader.FemtoReader(fd=i, percent_broken=0.2)
trunc_train, trunc_train_targets = truncated_dataset.load_split("dev")
npt.assert_almost_equal(
-0.0, torch.mean(torch.cat(trunc_train)).item(), decimal=1
+0.0, np.mean(np.concatenate(trunc_train)).item(), decimal=1
)

@pytest.mark.parametrize("max_rul", [125, None])
def test_max_rul(self, max_rul):
dataset = reader.FemtoReader(fd=1, max_rul=max_rul)
_, targets = dataset.load_split("dev")
for t in targets:
-t = t.numpy()
if max_rul is None:
npt.assert_equal(t, np.arange(len(t), 0, -1)) # is linear
else: