diff --git a/tests/test_utils.py b/tests/test_utils.py index 555f8c6..fca83cf 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -49,13 +49,6 @@ def test_sine_generator(): assert np.max(ts) <= 2 and np.min(ts) >= -2 -def test_reconstruction_loss(): - original = np.array([[[0, 2], [1, 0], [1, 2]]]) - reconstructed = np.array([[[0.1, 1.5], [1.1, 0.1], [1, 2]]]) - - # TODO finalize - - def test_switch_generator(): Xs, ys = tsgm.utils.gen_sine_const_switch_dataset(10, 100, 20) @@ -193,9 +186,34 @@ def test_mmd_3_test(): assert pvalue < 1e-10 # the null hypothesis is rejected +@pytest.mark.parametrize("dataset_name", [ + "beef", + "coffee", + "ecg200", + "electric", + "freezer", + "gunpoint", + "insect", + "mixed_shapes", + "starlight", + "wafer" +]) +def test_ucr_loadable(dataset_name): + ucr_data_manager = tsgm.utils.UCRDataManager(ds=dataset_name) + X_train, y_train, X_test, y_test = ucr_data_manager.get() + assert X_train.shape[0] == y_train.shape[0] + assert X_test.shape[0] == y_test.shape[0] + + +def test_ucr_raises(): + with pytest.raises(ValueError) as excinfo: + ucr_data_manager = tsgm.utils.UCRDataManager(ds="does not exist") + assert "ds should be in" in str(excinfo.value) + + def test_get_wafer(): - DATASET = "wafer" - ucr_data_manager = tsgm.utils.UCRDataManager(ds=DATASET) + dataset = "wafer" + ucr_data_manager = tsgm.utils.UCRDataManager(ds=dataset) assert ucr_data_manager.summary() is None X_train, y_train, X_test, y_test = ucr_data_manager.get() assert X_train.shape == (1000, 152) @@ -215,3 +233,28 @@ def test_fix_random_seeds(): assert random.random() == 0.6394267984578837 assert np.random.random() == 0.3745401188473625 assert float(tf.random.uniform([1])[0]) == 0.6645621061325073 + + +def test_reconstruction_loss_by_axis(): + eps = 1e-8 + original = tf.constant([[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]]) + reconstructed = tf.constant([[[1.1, 2.2, 2.9], [3.9, 4.8, 6.1]]]) + loss = tsgm.utils.reconstruction_loss_by_axis(original, reconstructed) + assert abs(loss.numpy() - 0.119999886) < eps + loss = tsgm.utils.reconstruction_loss_by_axis(original, reconstructed, axis=1) + assert abs(loss.numpy()) < eps + loss = tsgm.utils.reconstruction_loss_by_axis(original, reconstructed, axis=2) + assert abs(loss.numpy() - 0.00444442) < eps + + +def test_get_physionet2012(): + train_X, train_y, test_X, test_y, val_X, val_y = tsgm.utils.get_physionet2012() + + assert train_X.shape == (1757980, 4) + assert train_y.shape == (4000, 6) + + assert test_X.shape == (1762535, 4) + assert test_y.shape == (4000, 6) + + assert val_X.shape == (1765303, 4) + assert val_y.shape == (4000, 6) diff --git a/tests/test_vae.py b/tests/test_vae.py index 3e38f02..78482a4 100644 --- a/tests/test_vae.py +++ b/tests/test_vae.py @@ -6,6 +6,34 @@ from tensorflow import keras +def test_vae(): + seq_len = 256 + feat_dim = 1 + latent_dim = 4 + + model_type = tsgm.models.architectures.zoo["vae_conv5"] + architecture = model_type(seq_len=seq_len, feat_dim=feat_dim, latent_dim=latent_dim) + + encoder, decoder = architecture.encoder, architecture.decoder + + X = tsgm.utils.gen_sine_dataset(100, seq_len, feat_dim, max_value=20) + + scaler = tsgm.utils.TSFeatureWiseScaler((0, 1)) + X = scaler.fit_transform(X).astype(np.float64) + + vae = tsgm.models.cvae.BetaVAE(encoder, decoder) + vae.compile(optimizer=keras.optimizers.Adam(0.0003)) + vae.fit(X, epochs=1, batch_size=128) + x_decoded = vae.predict([X]) + assert x_decoded.shape == X.shape + + x_samples = vae.generate(7) + assert x_samples.shape == (7, seq_len, feat_dim) + + x_decoded = vae([X]) + assert x_decoded.shape == X.shape + + def test_cvae(): seq_len = 256 feat_dim = 1 @@ -15,7 +43,7 @@ def test_cvae(): model_type = tsgm.models.architectures.zoo["cvae_conv5"] architecture = model_type(seq_len=seq_len, feat_dim=feat_dim, latent_dim=latent_dim, output_dim=2) - encoder, decoder = architecture._encoder, architecture._decoder + encoder, decoder = architecture.encoder, architecture.decoder X, y_i = tsgm.utils.gen_sine_vs_const_dataset(100, seq_len, feat_dim, max_value=20, const=10) @@ -23,14 +51,22 @@ def test_cvae(): X = scaler.fit_transform(X).astype(np.float64) y = keras.utils.to_categorical(y_i, output_dim).astype(np.float64) + cbk = tsgm.models.monitors.VAEMonitor( + num_samples=1, latent_dim=latent_dim, output_dim=2) + vae = tsgm.models.cvae.cBetaVAE(encoder, decoder, latent_dim=latent_dim, temporal=False) vae.compile(optimizer=keras.optimizers.Adam(0.0003)) - vae.fit(X, y, epochs=1, batch_size=128) - + vae.fit(X, y, epochs=1, batch_size=128, callbacks=[cbk]) x_decoded = vae.predict([X, y]) assert x_decoded.shape == X.shape + x_samples, y_samples = vae.generate(y[:7]) + assert x_samples.shape == (7, seq_len, feat_dim) + + x_decoded = vae([X, y]) + assert x_decoded.shape == X.shape + def test_temp_cvae(): seq_len = 256 @@ -53,7 +89,7 @@ def test_temp_cvae(): dataset = tf.data.Dataset.from_tensor_slices((X_train, y)) dataset = dataset.shuffle(buffer_size=1024).batch(batch_size) - encoder, decoder = architecture._encoder, architecture._decoder + encoder, decoder = architecture.encoder, architecture.decoder vae = tsgm.models.cvae.cBetaVAE(encoder, decoder, latent_dim=latent_dim, temporal=True) vae.compile(optimizer=keras.optimizers.Adam(0.0003)) diff --git a/tests/test_visualizations.py b/tests/test_visualizations.py index 5cb288f..36033c2 100644 --- a/tests/test_visualizations.py +++ b/tests/test_visualizations.py @@ -9,7 +9,10 @@ def test_visualize_dataset(): tsgm.utils.visualize_dataset(Xs) -def test_visualize_tsne_unlabeled(): +@pytest.mark.parametrize("feature_averaging", [ + True, False +]) +def test_visualize_tsne_unlabeled(feature_averaging): Xs = np.array([ [[1, 2, 3], [3, 4, 5]], [[1, 2, 3], [3, 4, 5]], @@ -19,7 +22,7 @@ def test_visualize_tsne_unlabeled(): ]) Xgen = Xs ys = np.ones((Xs.shape[0], 1)) - tsgm.utils.visualize_tsne_unlabeled(Xs, Xgen, perplexity=2) + tsgm.utils.visualize_tsne_unlabeled(Xs, Xgen, perplexity=2, feature_averaging=feature_averaging) def test_visualize_tsne(): @@ -41,11 +44,30 @@ def test_visualize_ts(): tsgm.utils.visualize_ts(Xs, num=1) -def test_visualize_ts_lineplot(): +@pytest.mark.parametrize("unite_features", [ + True, False +]) +def test_visualize_ts_lineplot(unite_features): Xs = np.array([[[1, 2, 3], [3, 4, 5]]]) - tsgm.utils.visualize_ts_lineplot(Xs, num=1) + tsgm.utils.visualize_ts_lineplot(Xs, num=1, unite_features=unite_features) + ys = np.array([1, 2]) + tsgm.utils.visualize_ts_lineplot(Xs, ys, num=1, unite_features=unite_features) -def visualize_training_loss(): - loss = np.array([10, 9, 8, 7]) + +def test_visualize_training_loss(): + loss = np.array([[10.0], [9.0], [8.0], [7.0]]) tsgm.utils.visualize_training_loss(loss) + + +def test_visualize_original_and_reconst_ts(): + original = np.array([ + [[1, 2, 3], [3, 4, 5]], + [[1, 2, 3], [3, 4, 5]], + [[1, 2, 3], [3, 4, 5]], + [[1, 2, 3], [3, 4, 5]], + [[1, 2, 3], [3, 4, 5]] + ]) + reconstructed = original + tsgm.utils.visualize_original_and_reconst_ts(original, reconstructed) + diff --git a/tests/test_zoo.py b/tests/test_zoo.py index ca6d74e..cc57ffe 100644 --- a/tests/test_zoo.py +++ b/tests/test_zoo.py @@ -1,29 +1,79 @@ import pytest + +import functools +import numpy as np +import random import tensorflow as tf +from tensorflow.keras import layers + +import sklearn.metrics.pairwise import tsgm -def test_zoo(): - assert isinstance(tsgm.models.zoo, tsgm.models.architectures.Zoo) - assert len(tsgm.models.zoo.keys()) == len(tsgm.models.zoo.values()) +@pytest.mark.parametrize("model_type", [ + tsgm.models.architectures.zoo["cvae_conv5"], +]) +def test_zoo_cvae(model_type): + seq_len = 10 + feat_dim = 2 + latent_dim = 1 + output_dim = 1 + + arch = model_type(seq_len=seq_len, feat_dim=feat_dim, latent_dim=latent_dim, output_dim=output_dim) + arch_dict = arch.get() + + assert arch.encoder == arch_dict["encoder"] and arch.decoder == arch_dict["decoder"] + - assert tsgm.models.zoo.summary() is None +@pytest.mark.parametrize("model_type", [ + tsgm.models.architectures.zoo["cgan_base_c4_l1"], + tsgm.models.architectures.zoo["cgan_lstm_n"] +]) +def test_zoo_cgan(model_type): + seq_len = 10 + feat_dim = 2 + latent_dim = 1 + output_dim = 1 + + arch = model_type( + seq_len=seq_len, feat_dim=feat_dim, + latent_dim=latent_dim, output_dim=output_dim) + arch_dict = arch.get() - assert isinstance(tsgm.models.zoo, dict) + assert arch.generator == arch_dict["generator"] and arch.discriminator == arch_dict["discriminator"] - with pytest.raises(TypeError): - result = tsgm.models.architectures.BaseGANArchitecture() - with pytest.raises(TypeError): - result = tsgm.models.architectures.BaseVAEArchitecture() +@pytest.mark.parametrize("model_type_name", [ + "clf_cn", + "clf_cl_n", + "clf_block"], +) +def test_zoo_clf(model_type_name): + seq_len = 10 + feat_dim = 2 + output_dim = 1 + model_type = tsgm.models.architectures.zoo[model_type_name] + if model_type_name == "clf_block": + arch = model_type( + seq_len=seq_len, feat_dim=feat_dim, output_dim=output_dim, blocks=[layers.Conv1D(filters=64, kernel_size=3, activation="relu")]) + else: + arch = model_type( + seq_len=seq_len, feat_dim=feat_dim, output_dim=output_dim) + arch_dict = arch.get() -def test_sampling(): - input_sampling = [0.0, 1.0] - result = tsgm.models.architectures.Sampling()(input_sampling) - assert isinstance(result, tf.Tensor) + assert arch.model == arch_dict["model"] -def test_dict_types(): - for k, v in tsgm.models.zoo.items(): - assert issubclass(v, tsgm.models.architectures.Architecture) +def test_basic_rec(): + seq_len = 10 + feat_dim = 2 + output_dim = 1 + + arch = tsgm.models.zoo["recurrent"]( + hidden_dim=2, + output_dim=output_dim, + n_layers=1, + network_type="gru") + model = arch.build() + assert model is not None diff --git a/tsgm/models/cvae.py b/tsgm/models/cvae.py index 29dbd81..5612a6a 100644 --- a/tsgm/models/cvae.py +++ b/tsgm/models/cvae.py @@ -100,8 +100,7 @@ def generate(self, n: int) -> tsgm.types.Tensor: class cBetaVAE(keras.Model): - # TODO: allow using architecture or encoder & decoder - def __init__(self, encoder, decoder, latent_dim, temporal, beta=1.0, **kwargs): + def __init__(self, encoder, decoder, latent_dim, temporal: bool, beta=1.0, **kwargs): super(cBetaVAE, self).__init__(**kwargs) self.beta = beta self.encoder = encoder @@ -137,7 +136,7 @@ def generate(self, labels: tsgm.types.Tensor) -> tuple: :returns: a tuple of synthetically generated data and labels. """ batch_size = tf.shape(labels)[0] - z = tf.random.normal((batch_size, self._seq_len, self.latent_dim)) + z = tf.random.normal((batch_size, self._seq_len, self.latent_dim), dtype=labels.dtype) decoder_input = self._get_decoder_input(z, labels) return (self.decoder(decoder_input), labels) diff --git a/tsgm/models/monitors.py b/tsgm/models/monitors.py index c094450..d160bfa 100644 --- a/tsgm/models/monitors.py +++ b/tsgm/models/monitors.py @@ -17,7 +17,7 @@ class GANMonitor(keras.callbacks.Callback): def __init__(self, num_samples: int, latent_dim: int, labels: tsgm.types.Tensor, - save: bool = True, save_path: typing.Optional[str] = None, mode: str = "clf"): + save: bool = True, save_path: typing.Optional[str] = None, mode: str = "clf") -> None: self._num_samples = num_samples self._latent_dim = latent_dim self._save = save @@ -37,7 +37,7 @@ def __init__(self, num_samples: int, latent_dim: int, labels: tsgm.types.Tensor, logger.warning("save_path is specified, but save is False.") os.makedirs(self._save_path, exist_ok=True) - def on_epoch_end(self, epoch, logs=None): + def on_epoch_end(self, epoch, logs=None) -> None: if self._mode in ["clf", "reg"]: random_latent_vectors = tf.random.normal(shape=(self._num_samples, self._latent_dim)) elif self._mode == "temporal": @@ -64,14 +64,24 @@ def on_epoch_end(self, epoch, logs=None): class VAEMonitor(keras.callbacks.Callback): - def __init__(self, num_samples=6, latent_dim=128, output_dim=2): + def __init__(self, num_samples: int = 6, latent_dim: int = 128, output_dim: int = 2, + save: bool = True, save_path: typing.Optional[str] = None) -> None: self._num_samples = num_samples self._latent_dim = latent_dim self._output_dim = output_dim + self._save = save + self._save_path = save_path - def on_epoch_end(self, epoch, logs=None): - random_latent_vectors = tf.random.normal(shape=(self._output_dim * self._num_samples, self._latent_dim)) + if self._save and self._save_path is None: + self._save_path = "/tmp/" + logger.warning("save_path is not specified. Using `/tmp` as the default save_path") + + if self._save_path is not None: + if self._save is False: + logger.warning("save_path is specified, but save is False.") + os.makedirs(self._save_path, exist_ok=True) + def on_epoch_end(self, epoch, logs=None) -> None: labels = [] for i in range(self._output_dim): if not len(labels): @@ -80,8 +90,11 @@ def on_epoch_end(self, epoch, logs=None): labels = tf.concat((labels, keras.utils.to_categorical([i], self._output_dim)), 0) labels = tf.repeat(labels, self._num_samples, axis=0) - generated_images = self.model.decoder(tf.concat([random_latent_vectors, labels], 1)) + generated_images, _ = self.model.generate(labels) for i in range(self._output_dim * self._num_samples): sns.lineplot(x=range(0, generated_images[i].shape[0]), y=tf.squeeze(generated_images[i])) - plt.show() + if self._save: + plt.savefig(os.path.join(self._save_path, "epoch_{}_sample_{}".format(epoch, i))) + else: + plt.show() diff --git a/tsgm/utils/datasets.py b/tsgm/utils/datasets.py index 8aa425f..893b402 100644 --- a/tsgm/utils/datasets.py +++ b/tsgm/utils/datasets.py @@ -298,8 +298,8 @@ def download_physionet2012(): Downloads the Physionet 2012 dataset files from the Physionet website and extracts them in local folder 'physionet2012' """ - _base_url = "https://physionet.org/files/challenge-2012/1.0.0/" - _destination_folder = "physionet2012" + base_url = "https://physionet.org/files/challenge-2012/1.0.0/" + destination_folder = "physionet2012" X_a = "set-a.tar.gz" y_a = "Outcomes-a.txt" @@ -309,18 +309,14 @@ def download_physionet2012(): X_c = "set-c.tar.gz" y_c = "Outcomes-c.txt" - file_utils.download(_base_url + X_a, _destination_folder) - file_utils.download(_base_url + y_a, _destination_folder) - file_utils.download(_base_url + X_b, _destination_folder) - file_utils.download(_base_url + y_b, _destination_folder) - file_utils.download(_base_url + X_c, _destination_folder) - file_utils.download(_base_url + y_c, _destination_folder) + all_files = [(X_a, y_a), (X_b, y_b), (X_c, y_c)] - file_utils.extract_archive(_destination_folder + X_a, _destination_folder) - file_utils.extract_archive(_destination_folder + X_b, _destination_folder) - file_utils.extract_archive(_destination_folder + X_c, _destination_folder) + for X, y in all_files: + file_utils.download(base_url + X, destination_folder) + file_utils.download(base_url + y, destination_folder) - return + for X, y in all_files: + file_utils.extract_archive(os.path.join(destination_folder, X), destination_folder) def _get_physionet_X_dataframe(dataset_path: str) -> pd.DataFrame: diff --git a/tsgm/utils/utils.py b/tsgm/utils/utils.py index f45274f..e0b8f0c 100644 --- a/tsgm/utils/utils.py +++ b/tsgm/utils/utils.py @@ -1,6 +1,5 @@ import random import numpy as np -import numpy.typing as npt import tensorflow as tf @@ -60,34 +59,6 @@ def reconstruction_loss_by_axis(original: tf.Tensor, reconstructed: tf.Tensor, a return tf.losses.mean_squared_error(tf.reduce_mean(original, axis=axis), tf.reduce_mean(reconstructed, axis=axis)) -def generate_slices(X: npt.NDArray, slice_len: int = 10) -> npt.NDArray: - """ - Generate slices of a time series dataset. - - This function takes a time series dataset X - and generates slices of each sequence with a specified slice length. - - Parameters: - ---------- - X : list or numpy.ndarray - The input list or array of sequences to be sliced. - - slice_len : int, optional (default=10) - The desired slice length for each sequence. - - Returns: - ------- - numpy.ndarray - A numpy array containing the sliced sequences. Each row of the array represents - a slice of a sequence from the input. - """ - new_X = [] - for el in X: - for i in range(0, len(el) - slice_len, slice_len): - new_X.append(el[i : i + slice_len]) - return np.array(new_X) - - def fix_seeds(seed_value: int = 42) -> None: """ Fix random number generator seeds for reproducibility. diff --git a/tsgm/utils/visualization.py b/tsgm/utils/visualization.py index 44e7892..1380fa0 100644 --- a/tsgm/utils/visualization.py +++ b/tsgm/utils/visualization.py @@ -295,7 +295,7 @@ def visualize_training_loss( """ num_of_metrics = loss_vector.shape[0] num_of_epochs = loss_vector[0].shape[0] - _colors = [ + colors = [ {"color": "orange", "linewidth": 1, "alpha": 0.8}, {"color": "darkorchid"}, {"color": "pink"}, @@ -306,24 +306,24 @@ def visualize_training_loss( ] fig, ax = plt.subplots(1, 1, figsize=(12, 5)) for i in range(num_of_metrics): - _label = labels[i] if i < len(labels) else None - _loss = loss_vector[i] + label = labels[i] if i < len(labels) else "" + loss = loss_vector[i] # scale loss to be in range [0, 0.xxx] - _max_magnitude = math.floor(math.log10(np.max(_loss))) - if _max_magnitude >= 0: - _exp = _max_magnitude + 1 - _loss /= 10 ** _exp - _label += f" ($10^{_exp}$)" + max_magnitude = math.floor(math.log10(np.max(loss))) + if max_magnitude >= 0: + exp = max_magnitude + 1 + loss /= 10 ** exp + label += f" ($10^{exp}$)" - if i < len(_colors): + if i < len(colors): # use custom styles until a style is defined - ax.plot(range(num_of_epochs), _loss, label=_label, **_colors[i]) + ax.plot(range(num_of_epochs), loss, label=label, **colors[i]) else: ax.plot( range(num_of_epochs), - _loss, - label=_label, + loss, + label=label, ) plt.legend()