From d51e56b101c0fbb9d4659dcbf80f0640d97de3c7 Mon Sep 17 00:00:00 2001 From: Tilman Krokotsch Date: Thu, 26 Jan 2023 10:31:34 +0100 Subject: [PATCH] feat: make re-windowing after feature extraction optional (#18) * refactor: move tests to pytest * feat: make re-windowing after feature extraction optional --- docs/use_cases/feature_extraction.md | 15 +- rul_datasets/core.py | 44 +++-- tests/test_core.py | 234 ++++++++++++++------------- 3 files changed, 159 insertions(+), 134 deletions(-) diff --git a/docs/use_cases/feature_extraction.md b/docs/use_cases/feature_extraction.md index 869968f..21a6ee6 100644 --- a/docs/use_cases/feature_extraction.md +++ b/docs/use_cases/feature_extraction.md @@ -3,9 +3,14 @@ It may be useful to extract hand-crafted features, i.e. RMS or P2P, from this vi The [RulDataModule][rul_datasets.core.RulDataModule] provides the option to use a custom feature extractor on each window of data. The feature extractor can be anything that can be called as a function. -It should take a numpy array with the shape `[num_windows, window_size, num_features]` and return an array with the shape `[num_windows, num_new_features]`. +It should take a numpy array with the shape `[num_windows, window_size, num_features]` and return another array. +Depending on whether a `window_size` is supplied to the data module, the expected output shape of the feature extractor is: + +* `window_size is None`: `[num_new_windows, new_window_size, features]` +* `window_size is not None`: `[num_windows, features]` + An example would be taking the mean of the window with `lambda x: np.mean(x, axis=1)`. 
-After applying the feature extractor, the data module extracts new windows of extracted features: +Because this function reduces the windows to a single feature vector, we set `window_size` to 10 to get new windows of that size: ```pycon >>> import rul_datasets @@ -43,4 +48,8 @@ The number of samples will reduce by `num_runs * (window_size - 1)` due to the r 3674 >>> dm_extracted.to_dataset("dev") 3656 -``` \ No newline at end of file +``` + +If your feature extractor produces windows itself, you can set `window_size` to `None`. +This way, no new windows are extracted. +An example would be extracting multiple sub-windows from the existing windows. \ No newline at end of file diff --git a/rul_datasets/core.py b/rul_datasets/core.py index 102dad0..c928768 100644 --- a/rul_datasets/core.py +++ b/rul_datasets/core.py @@ -27,8 +27,13 @@ class RulDataModule(pl.LightningDataModule): `feature_extractor` and `window_size` arguments to the constructor. The `feature_extractor` is a callable that takes a windowed time series as a numpy array with the shape `[num_windows, window_size, num_features]` and returns - another numpy array with the shape `[num_windows, num_new_features]`. The time - series of extracted features is then re-windowed with `window_size`. + another numpy array. Depending on `window_size`, the expected output shapes for + the `feature_extractor` are: + + * `window_size is None`: `[num_new_windows, new_window_size, features]` + * `window_size is not None`: `[num_windows, features]` + + If `window_size` is set, the extracted features are re-windowed. Examples: Default @@ -67,8 +72,18 @@ def __init__( pre-process the dataset. Afterwards, `setup_data` is called to load all splits into memory. - If `feature_extractor` and `window_size` are supplied, the data module extracts - new features from each window of the time series and re-windows it afterwards. 
+    If a `feature_extractor` is supplied, the data module extracts new features + from each window of the time series. If `window_size` is `None`, + it is assumed that the extracted features form new windows themselves. If + `window_size` is an int, it is assumed that the extracted features are a + single feature vector and should be re-windowed. The expected output shapes + for the `feature_extractor` are: + + * `window_size is None`: `[num_new_windows, new_window_size, features]` + * `window_size is not None`: `[num_windows, features]` + + The expected input shape for the `feature_extractor` is always + `[num_windows, window_size, features]`. Args: reader: The dataset reader for the desired dataset, e.g. CmapssLoader. batch_size: The size of the batches built by the data loaders. feature_extractor: A feature extractor that extracts feature vectors from windows. window_size: The new window size to apply after the feature extractor. """ super().__init__() self.reader = reader self.batch_size = batch_size self.feature_extractor = feature_extractor self.window_size = window_size - if (self.feature_extractor is not None) != (self.window_size is not None): + if (self.feature_extractor is None) and (self.window_size is not None): raise ValueError( - "feature_extractor and window_size cannot be set without " - "the other. Please supply values for both." + "A feature extractor has to be supplied " + "to set a window size for re-windowing." ) hparams = deepcopy(self.reader.hparams) @@ -194,7 +209,7 @@ def setup(self, stage: Optional[str] = None) -> None: If the data module was constructed with a `feature_extractor` argument, the feature windows are passed to the feature extractor. The resulting, - new features are re-windowed. + new features may be re-windowed. Args: stage: Ignored. Only for adhering to parent class interface.
@@ -221,20 +236,15 @@ def _setup_split(self, split: str) -> Tuple[torch.Tensor, torch.Tensor]: def _apply_feature_extractor_per_run( self, features: List[np.ndarray], targets: List[np.ndarray] ) -> Tuple[List[np.ndarray], List[np.ndarray]]: - if self.feature_extractor is not None and self.window_size is not None: + if self.feature_extractor is not None: + features = [self.feature_extractor(f) for f in features] + if self.window_size is not None: cutoff = self.window_size - 1 - features = [self._apply_feature_extractor(f) for f in features] - # cut off because feats are re-windowed + features = [utils.extract_windows(f, self.window_size) for f in features] targets = [t[cutoff:] for t in targets] return features, targets - def _apply_feature_extractor(self, features: np.ndarray) -> np.ndarray: - features = self.feature_extractor(features) # type: ignore - features = utils.extract_windows(features, self.window_size) # type: ignore - - return features - def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader: """ Create a [data loader][torch.utils.data.DataLoader] for the training split. 
diff --git a/tests/test_core.py b/tests/test_core.py index 570b41d..616ac84 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -11,151 +11,155 @@ from rul_datasets import core, reader -class TestRulDataModule(unittest.TestCase): - def setUp(self): - self.mock_loader = mock.MagicMock(name="AbstractLoader") - self.mock_loader.hparams = { - "test": 0, - "window_size": 30, - } - self.mock_runs = [np.zeros((1, 1, 1))], [np.zeros(1)] - self.mock_loader.load_split.return_value = self.mock_runs +@pytest.fixture() +def mock_runs(): + return [np.zeros((1, 1, 1))], [np.zeros(1)] - def test_created_correctly(self): - dataset = core.RulDataModule(self.mock_loader, batch_size=16) - self.assertIs(self.mock_loader, dataset.reader) - self.assertEqual(16, dataset.batch_size) - self.assertDictEqual( - {"test": 0, "batch_size": 16, "window_size": 30, "feature_extractor": None}, - dataset.hparams, - ) +@pytest.fixture() +def mock_loader(mock_runs): + mock_reader = mock.MagicMock(reader.AbstractReader) + mock_reader.hparams = {"test": 0, "window_size": 30} + mock_reader.load_split.return_value = mock_runs + + return mock_reader + + +class TestRulDataModule: + def test_created_correctly(self, mock_loader): + dataset = core.RulDataModule(mock_loader, batch_size=16) + + assert mock_loader is dataset.reader + assert 16 == dataset.batch_size + assert dataset.hparams == { + "test": 0, + "batch_size": 16, + "window_size": mock_loader.hparams["window_size"], + "feature_extractor": None, + } - def test_created_correctly_with_feature_extractor(self): + @pytest.mark.parametrize("window_size", [2, None]) + def test_created_correctly_with_feature_extractor(self, mock_loader, window_size): fe = lambda x: np.mean(x, axis=1) dataset = core.RulDataModule( - self.mock_loader, batch_size=16, feature_extractor=fe, window_size=2 + mock_loader, batch_size=16, feature_extractor=fe, window_size=window_size ) - self.assertIs(self.mock_loader, dataset.reader) - self.assertEqual(16, dataset.batch_size) - 
self.assertDictEqual( - { - "test": 0, - "batch_size": 16, - "window_size": 2, - "feature_extractor": str(fe), - }, - dataset.hparams, - ) + assert mock_loader is dataset.reader + assert 16 == dataset.batch_size + assert dataset.hparams == { + "test": 0, + "batch_size": 16, + "window_size": window_size or mock_loader.hparams["window_size"], + "feature_extractor": str(fe), + } - def test_prepare_data(self): - dataset = core.RulDataModule(self.mock_loader, batch_size=16) + def test_prepare_data(self, mock_loader): + dataset = core.RulDataModule(mock_loader, batch_size=16) dataset.prepare_data() - self.mock_loader.prepare_data.assert_called_once() + mock_loader.prepare_data.assert_called_once() - def test_setup(self): - dataset = core.RulDataModule(self.mock_loader, batch_size=16) + def test_setup(self, mock_loader, mock_runs): + dataset = core.RulDataModule(mock_loader, batch_size=16) dataset.setup() - self.mock_loader.load_split.assert_has_calls( + mock_loader.load_split.assert_has_calls( [mock.call("dev"), mock.call("val"), mock.call("test")] ) - mock_runs = tuple(torch.tensor(np.concatenate(r)) for r in self.mock_runs) - self.assertDictEqual( - {"dev": mock_runs, "val": mock_runs, "test": mock_runs}, dataset._data - ) + mock_runs = tuple(torch.tensor(np.concatenate(r)) for r in mock_runs) + assert dataset._data == {"dev": mock_runs, "val": mock_runs, "test": mock_runs} - def test_empty_dataset(self): - self.mock_loader.load_split.return_value = [], [] - dataset = core.RulDataModule(self.mock_loader, batch_size=4) + def test_empty_dataset(self, mock_loader): + """Should not crash on empty dataset.""" + mock_loader.load_split.return_value = [], [] + dataset = core.RulDataModule(mock_loader, batch_size=4) dataset.setup() @mock.patch( "rul_datasets.core.RulDataModule.to_dataset", return_value=TensorDataset(torch.zeros(1)), ) - def test_train_dataloader(self, mock_to_dataset): - dataset = core.RulDataModule(self.mock_loader, batch_size=16) + def 
test_train_dataloader(self, mock_to_dataset, mock_loader): + dataset = core.RulDataModule(mock_loader, batch_size=16) dataset.setup() dataloader = dataset.train_dataloader() mock_to_dataset.assert_called_once_with("dev") - self.assertIs(mock_to_dataset.return_value, dataloader.dataset) - self.assertEqual(16, dataloader.batch_size) - self.assertIsInstance(dataloader.sampler, RandomSampler) - self.assertTrue(dataloader.pin_memory) + assert mock_to_dataset.return_value == dataloader.dataset + assert 16 == dataloader.batch_size + assert isinstance(dataloader.sampler, RandomSampler) + assert dataloader.pin_memory @mock.patch( "rul_datasets.core.RulDataModule.to_dataset", return_value=TensorDataset(torch.zeros(1)), ) - def test_val_dataloader(self, mock_to_dataset): - dataset = core.RulDataModule(self.mock_loader, batch_size=16) + def test_val_dataloader(self, mock_to_dataset, mock_loader): + dataset = core.RulDataModule(mock_loader, batch_size=16) dataset.setup() dataloader = dataset.val_dataloader() mock_to_dataset.assert_called_once_with("val") - self.assertIs(mock_to_dataset.return_value, dataloader.dataset) - self.assertEqual(16, dataloader.batch_size) - self.assertIsInstance(dataloader.sampler, SequentialSampler) - self.assertTrue(dataloader.pin_memory) + assert mock_to_dataset.return_value is dataloader.dataset + assert 16 == dataloader.batch_size + assert isinstance(dataloader.sampler, SequentialSampler) + assert dataloader.pin_memory @mock.patch( "rul_datasets.core.RulDataModule.to_dataset", return_value=TensorDataset(torch.zeros(1)), ) - def test_test_dataloader(self, mock_to_dataset): - dataset = core.RulDataModule(self.mock_loader, batch_size=16) + def test_test_dataloader(self, mock_to_dataset, mock_loader): + dataset = core.RulDataModule(mock_loader, batch_size=16) dataset.setup() dataloader = dataset.test_dataloader() mock_to_dataset.assert_called_once_with("test") - self.assertIs(mock_to_dataset.return_value, dataloader.dataset) - self.assertEqual(16, 
dataloader.batch_size) - self.assertIsInstance(dataloader.sampler, SequentialSampler) - self.assertTrue(dataloader.pin_memory) + assert mock_to_dataset.return_value is dataloader.dataset + assert 16 == dataloader.batch_size + assert isinstance(dataloader.sampler, SequentialSampler) + assert dataloader.pin_memory - def test_train_batch_structure(self): - self.mock_loader.load_split.return_value = ( + def test_train_batch_structure(self, mock_loader): + mock_loader.load_split.return_value = ( [np.zeros((8, 30, 14))] * 4, [np.zeros(8)] * 4, ) - dataset = core.RulDataModule(self.mock_loader, batch_size=16) + dataset = core.RulDataModule(mock_loader, batch_size=16) dataset.setup() train_loader = dataset.train_dataloader() self._assert_batch_structure(train_loader) - def test_val_batch_structure(self): - self.mock_loader.load_split.return_value = ( + def test_val_batch_structure(self, mock_loader): + mock_loader.load_split.return_value = ( [np.zeros((8, 30, 14))] * 4, [np.zeros(8)] * 4, ) - dataset = core.RulDataModule(self.mock_loader, batch_size=16) + dataset = core.RulDataModule(mock_loader, batch_size=16) dataset.setup() val_loader = dataset.val_dataloader() self._assert_batch_structure(val_loader) - def test_test_batch_structure(self): - self.mock_loader.load_split.return_value = ( + def test_test_batch_structure(self, mock_loader): + mock_loader.load_split.return_value = ( [np.zeros((8, 30, 14))] * 4, [np.zeros(8)] * 4, ) - dataset = core.RulDataModule(self.mock_loader, batch_size=16) + dataset = core.RulDataModule(mock_loader, batch_size=16) dataset.setup() test_loader = dataset.test_dataloader() self._assert_batch_structure(test_loader) def _assert_batch_structure(self, loader): batch = next(iter(loader)) - self.assertEqual(2, len(batch)) + assert 2 == len(batch) features, labels = batch - self.assertEqual(torch.Size((16, 14, 30)), features.shape) - self.assertEqual(torch.Size((16,)), labels.shape) + assert torch.Size((16, 14, 30)) == features.shape + assert 
torch.Size((16,)) == labels.shape - def test_to_dataset(self): - dataset = core.RulDataModule(self.mock_loader, batch_size=16) + def test_to_dataset(self, mock_loader): + dataset = core.RulDataModule(mock_loader, batch_size=16) mock_data = { "dev": [torch.zeros(0)] * 2, "val": [torch.zeros(1)] * 2, @@ -165,62 +169,64 @@ def test_to_dataset(self): for i, split in enumerate(["dev", "val", "test"]): tensor_dataset = dataset.to_dataset(split) - self.assertIsInstance(tensor_dataset, TensorDataset) - self.assertEqual(i, len(tensor_dataset.tensors[0])) + assert isinstance(tensor_dataset, TensorDataset) + assert i == len(tensor_dataset.tensors[0]) - def test_check_compatability(self): + def test_check_compatability(self, mock_loader): fe = lambda x: np.mean(x, axis=2) - dataset = core.RulDataModule(self.mock_loader, batch_size=16) + dataset = core.RulDataModule(mock_loader, batch_size=16) other = core.RulDataModule( - self.mock_loader, batch_size=16, feature_extractor=fe, window_size=2 + mock_loader, batch_size=16, feature_extractor=fe, window_size=2 ) dataset.check_compatibility(dataset) - self.mock_loader.check_compatibility.assert_called_once_with(self.mock_loader) - self.assertRaises( - ValueError, - dataset.check_compatibility, - core.RulDataModule(self.mock_loader, batch_size=8), - ) - self.assertRaises( - ValueError, - dataset.check_compatibility, - other, - ) - self.assertRaises( - ValueError, - other.check_compatibility, - core.RulDataModule( - self.mock_loader, batch_size=16, feature_extractor=fe, window_size=3 - ), - ) + mock_loader.check_compatibility.assert_called_once_with(mock_loader) + with pytest.raises(ValueError): + dataset.check_compatibility(core.RulDataModule(mock_loader, batch_size=8)) + with pytest.raises(ValueError): + dataset.check_compatibility(other) + with pytest.raises(ValueError): + other.check_compatibility( + core.RulDataModule( + mock_loader, batch_size=16, feature_extractor=fe, window_size=3 + ) + ) - def test_is_mutually_exclusive(self): 
- dataset = core.RulDataModule(self.mock_loader, batch_size=16) + def test_is_mutually_exclusive(self, mock_loader): + dataset = core.RulDataModule(mock_loader, batch_size=16) dataset.is_mutually_exclusive(dataset) - self.mock_loader.is_mutually_exclusive.assert_called_once_with(dataset.reader) + mock_loader.is_mutually_exclusive.assert_called_once_with(dataset.reader) - def test_feature_extractor(self): - self.mock_loader.load_split.return_value = ( + def test_feature_extractor(self, mock_loader): + mock_loader.load_split.return_value = ( [np.zeros((8, 30, 14)) + np.arange(8)[:, None, None]], [torch.arange(8)], ) fe = lambda x: np.mean(x, axis=1) - dataset = core.RulDataModule( - self.mock_loader, - batch_size=16, - feature_extractor=fe, - window_size=2, + dataset = core.RulDataModule(mock_loader, 16, fe, window_size=2) + dataset.setup() + + dev_data = dataset.to_dataset("dev") + assert len(dev_data) == 7 + for i, (feat, targ) in enumerate(dev_data): + assert feat.shape == torch.Size([14, 2]) + assert torch.dist(torch.arange(i, i + 2)[None, :].repeat(14, 1), feat) == 0 + assert targ == i + 1 # targets start window_size + 1 steps later + + def test_feature_extractor_no_rewindowing(self, mock_loader): + mock_loader.load_split.return_value = ( + [np.zeros((8, 30, 14)) + np.arange(8)[:, None, None]], + [torch.arange(8)], ) + fe = lambda x: np.tile(x, (1, 2, 1)) # repeats window two times + dataset = core.RulDataModule(mock_loader, 16, fe, window_size=None) dataset.setup() dev_data = dataset.to_dataset("dev") - self.assertEqual(len(dev_data), 7) + assert len(dev_data) == 8 for i, (feat, targ) in enumerate(dev_data): - self.assertEqual(feat.shape, torch.Size([14, 2])) - self.assertTrue( - torch.dist(torch.arange(i, i + 2)[None, :].repeat(14, 1), feat) == 0 - ) - self.assertEqual(targ, i + 1) # targets start window_size + 1 steps later + assert feat.shape == torch.Size([14, 60]) + assert torch.dist(feat[:, :30], feat[:, 30:]) == 0.0 # fe applied correctly + assert targ 
== i class DummyRul(reader.AbstractReader):