From 3f07b2a9ca84a8e4c37b19ba3b089da0a58e4664 Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Wed, 31 Jul 2024 22:12:37 +0800
Subject: [PATCH 1/5] fix: load datasets in arff from ucr_uea;

---
 tsdb/loading_funcs/ucr_uea_datasets.py | 179 ++++++++++++++----------
 1 file changed, 101 insertions(+), 78 deletions(-)

diff --git a/tsdb/loading_funcs/ucr_uea_datasets.py b/tsdb/loading_funcs/ucr_uea_datasets.py
index 09896c3..611651d 100644
--- a/tsdb/loading_funcs/ucr_uea_datasets.py
+++ b/tsdb/loading_funcs/ucr_uea_datasets.py
@@ -14,7 +14,7 @@
 import os
 import warnings
 
-import numpy
+import numpy as np
 from sklearn.utils.estimator_checks import _NotAnArray as NotAnArray
 
 try:
@@ -30,20 +30,20 @@ def load_ucr_uea_dataset(local_path, dataset_name):
-        # if both TXT and ARFF files are provided, the TXT versions are
-        # used
+        # if both TXT and ARFF files are provided, the ARFF versions are
+        # used
         # both training and test data must be available in the same format
-        if _has_files(local_path, dataset_name, ext="txt"):
-            X_train, y_train = _load_txt_uea(
-                os.path.join(local_path, dataset_name + "_TRAIN.txt")
-            )
-            X_test, y_test = _load_txt_uea(
-                os.path.join(local_path, dataset_name + "_TEST.txt")
-            )
-        elif _has_files(local_path, dataset_name, ext="arff"):
+        if _has_files(local_path, dataset_name, ext="arff"):
             X_train, y_train = _load_arff_uea(
                 os.path.join(local_path, dataset_name + "_TRAIN.arff")
             )
             X_test, y_test = _load_arff_uea(
                 os.path.join(local_path, dataset_name + "_TEST.arff")
             )
+        elif _has_files(local_path, dataset_name, ext="txt"):
+            X_train, y_train = _load_txt_uea(
+                os.path.join(local_path, dataset_name + "_TRAIN.txt")
+            )
+            X_test, y_test = _load_txt_uea(
+                os.path.join(local_path, dataset_name + "_TEST.txt")
+            )
         else:
             warnings.warn(
                 'dataset "%s" is not provided in either TXT '
@@ -55,9 +55,9 @@ def load_ucr_uea_dataset(local_path, dataset_name):
 
         data = {
             "X_train": X_train,
-            "y_train": y_train.astype(float),
+            "y_train": y_train,
             "X_test": X_test,
-            "y_test": y_test.astype(float),
+            "y_test": y_test,
         }
 
         return data
@@ -115,24 +115,24 @@ def ts_size(ts):
     Examples
     --------
-    >>> ts_size([1, 2, 3, numpy.nan])
+    >>> ts_size([1, 2, 3, np.nan])
     3
-    >>> ts_size([1, numpy.nan])
+    >>> ts_size([1, np.nan])
     1
-    >>> ts_size([numpy.nan])
+    >>> ts_size([np.nan])
     0
     >>> ts_size([[1, 2],
     ...          [2, 3],
     ...          [3, 4],
-    ...          [numpy.nan, 2],
-    ...          [numpy.nan, numpy.nan]])
+    ...          [np.nan, 2],
+    ...          [np.nan, np.nan]])
     4
-    >>> ts_size([numpy.nan, 3, numpy.inf, numpy.nan])
+    >>> ts_size([np.nan, 3, np.inf, np.nan])
     3
     """
     ts_ = to_time_series(ts)
     sz = ts_.shape[0]
-    while sz > 0 and numpy.all(numpy.isnan(ts_[sz - 1])):
+    while sz > 0 and np.all(np.isnan(ts_[sz - 1])):
         sz -= 1
     return sz
@@ -151,7 +151,7 @@ def to_time_series(ts, remove_nans=False):
 
     Returns
     -------
-    numpy.ndarray of shape (sz, d)
+    np.ndarray of shape (sz, d)
         The transformed time series. This is always guaranteed to be a new
         time series and never just a view into the old one.
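Review note (not part of the patch): the "always guaranteed to be a new time series and never just a view" contract above is easy to sanity-check. A minimal illustrative snippet, assuming the module path shown in this diff:

```python
import numpy as np

from tsdb.loading_funcs.ucr_uea_datasets import to_time_series

# to_time_series copies its input into a fresh (sz, d) float array,
# so mutating the result must leave the original data untouched.
raw = np.array([1.0, 2.0, 3.0])
ts = to_time_series(raw)
ts[0, 0] = 99.0

print(ts.shape)  # (3, 1): a 1D input is promoted to a 2D column
print(raw[0])    # 1.0: the original array is unchanged, i.e. no view
```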
@@ -160,11 +160,11 @@ def to_time_series(ts, remove_nans=False):
     >>> to_time_series([1, 2])
     array([[1.],
            [2.]])
-    >>> to_time_series([1, 2, numpy.nan])
+    >>> to_time_series([1, 2, np.nan])
     array([[ 1.],
            [ 2.],
            [nan]])
-    >>> to_time_series([1, 2, numpy.nan], remove_nans=True)
+    >>> to_time_series([1, 2, np.nan], remove_nans=True)
     array([[1.],
            [2.]])
 
@@ -172,7 +172,7 @@ def to_time_series(ts, remove_nans=False):
     --------
     to_time_series_dataset : Transforms a dataset of time series
     """
-    ts_out = numpy.array(ts, copy=True)
+    ts_out = np.array(ts, copy=True)
     if ts_out.ndim <= 1:
         ts_out = ts_out.reshape((-1, 1))
     if ts_out.dtype != float:
@@ -196,7 +196,7 @@ def to_time_series_dataset(dataset, dtype=float):
 
     Returns
     -------
-    numpy.ndarray of shape (n_ts, sz, d)
+    np.ndarray of shape (n_ts, sz, d)
         The transformed dataset of time series.
 
     Examples
@@ -226,79 +226,102 @@ def to_time_series_dataset(dataset, dtype=float):
         import pandas as pd
 
         if isinstance(dataset, pd.DataFrame):
-            return to_time_series_dataset(numpy.array(dataset))
+            return to_time_series_dataset(np.array(dataset))
     except ImportError:
         pass
     if isinstance(dataset, NotAnArray):  # Patch to pass sklearn tests
-        return to_time_series_dataset(numpy.array(dataset))
+        return to_time_series_dataset(np.array(dataset))
     if len(dataset) == 0:
-        return numpy.zeros((0, 0, 0))
-    if numpy.array(dataset[0]).ndim == 0:
+        return np.zeros((0, 0, 0))
+    if np.array(dataset[0]).ndim == 0:
         dataset = [dataset]
     n_ts = len(dataset)
     max_sz = max([ts_size(to_time_series(ts, remove_nans=True)) for ts in dataset])
     d = to_time_series(dataset[0]).shape[1]
-    dataset_out = numpy.zeros((n_ts, max_sz, d), dtype=dtype) + numpy.nan
+    dataset_out = np.zeros((n_ts, max_sz, d), dtype=dtype) + np.nan
     for i in range(n_ts):
         ts = to_time_series(dataset[i], remove_nans=True)
         dataset_out[i, : ts.shape[0]] = ts
     return dataset_out.astype(dtype)
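Review note (not part of the patch): the NaN-padding behavior of `to_time_series_dataset` that the loop above implements can be checked directly, mirroring the docstring examples:

```python
from tsdb.loading_funcs.ucr_uea_datasets import to_time_series_dataset

# Ragged input: the shorter series is padded with NaN up to the longest
# length, yielding one dense (n_ts, max_sz, d) array.
X = to_time_series_dataset([[1, 2, 3], [4, 5]])

print(X.shape)     # (2, 3, 1)
print(X[1, :, 0])  # [ 4.  5. nan]
```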
 
 
-def _load_arff_uea(dataset_path):
-    """Load arff file for uni/multi variate dataset
+def _load_arff_uea(
+    full_file_path_and_name,
+    replace_missing_vals_with="NaN",
+):
+    """Load data from a classification/regression WEKA arff file to a 3D np array.
 
     Parameters
     ----------
-    dataset_path: string of dataset_path
-        Path to the ARFF file to be read
+    full_file_path_and_name: str
+        The full pathname of the .arff file to read.
+    replace_missing_vals_with: str
+        The value that missing values in the text file should be replaced
+        with prior to parsing.
 
     Returns
     -------
-    x: numpy array of shape (n_timeseries, n_timestamps, n_features)
-        Time series dataset
-    y: numpy array of shape (n_timeseries, )
-        Vector of targets
-
-    Raises
-    ------
-    ImportError: if the version of *Scipy* is too old (pre 1.3.0)
-    Exception: on any failure, e.g. if the given file does not exist or is
-        corrupted
+    data: np.ndarray
+        time series data, np.ndarray (n_cases, n_channels, n_timepoints)
+    y : np.ndarray of string or int
+        target variable
     """
-    if not HAS_ARFF:
-        raise ImportError(
-            "scipy 1.3.0 or newer is required to load "
-            "time series datasets from arff format."
-        )
-    data, meta = arff.loadarff(dataset_path)
-    names = meta.names()  # ["input", "class"] for multi-variate
-
-    # firstly get y_train
-    y_ = data[names[-1]]  # data["class"]
-    y = numpy.array(y_).astype("str")
-
-    # get x_train
-    if len(names) == 2:  # len=2 => multi-variate
-        x_ = data[names[0]]
-        x_ = numpy.asarray(x_.tolist())
-
-        nb_example = x_.shape[0]
-        nb_channel = x_.shape[1]
-        length_one_channel = len(x_.dtype.descr)
-        x = numpy.empty([nb_example, length_one_channel, nb_channel])
-
-        for i in range(length_one_channel):
-            # x_.dtype.descr: [('t1', '<f8'), ('t2', '<f8')]
-            time_stamp = x_.dtype.descr[i][0]  # ["t1", "t2"]
-            x[:, i, :] = x_[time_stamp]
[...]
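Review note: the diff is truncated at `[...]` above, so the tail of the removed scipy-based loader and the entire `+` body of the rewritten `_load_arff_uea` are not shown. Purely as review context, here is a minimal sketch of a scipy-free WEKA-ARFF parser matching the documented contract (`data` shaped `(n_cases, n_channels, n_timepoints)`, `y` kept as strings, `?` replaced by `replace_missing_vals_with`). It is an assumption-based sketch, not the committed implementation:

```python
import numpy as np

def _load_arff_uea_sketch(full_file_path_and_name, replace_missing_vals_with="NaN"):
    # Sketch only, not the committed code. Assumes the UEA-archive ARFF layout:
    # multivariate cases are quoted relational values whose channels are
    # separated by a literal "\n", followed by ",<label>"; univariate cases
    # are plain "v1,...,vn,<label>" lines. Series are assumed equal-length.
    instance_list, class_val_list = [], []
    data_started = False
    is_multi_variate = False
    with open(full_file_path_and_name, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            low = line.lower()
            if "@attribute" in low and "relational" in low:
                is_multi_variate = True
            if low.startswith("@data"):
                data_started = True
                continue
            if not data_started:
                continue  # still inside the header
            line = line.replace("?", replace_missing_vals_with)
            if is_multi_variate:
                line = line.replace("'", "").replace('"', "")
                channels = line.split("\\n")  # split on the literal backslash-n
                # the class label trails the last channel after a comma
                body, label = channels[-1].rsplit(",", 1)
                channels[-1] = body
                class_val_list.append(label.strip())
                instance_list.append(
                    [[float(v) for v in ch.split(",")] for ch in channels]
                )
            else:
                *values, label = line.split(",")
                class_val_list.append(label.strip())
                instance_list.append([[float(v) for v in values]])
    data = np.asarray(instance_list, dtype=float)  # (n_cases, n_channels, n_timepoints)
    y = np.asarray(class_val_list)
    return data, y
```

The committed body may differ in details such as unequal-length handling, but any implementation must honor this return contract for `load_ucr_uea_dataset` above to keep working unchanged.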
From: Wenjie Du
Date: Wed, 31 Jul 2024 22:26:18 +0800
Subject: [PATCH 2/5] refactor: update workflows;

---
 .github/ISSUE_TEMPLATE/dataset-addition.yml | 2 +-
 .github/workflows/greetings.yml             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/dataset-addition.yml b/.github/ISSUE_TEMPLATE/dataset-addition.yml
index a297100..d5e13b1 100644
--- a/.github/ISSUE_TEMPLATE/dataset-addition.yml
+++ b/.github/ISSUE_TEMPLATE/dataset-addition.yml
@@ -20,7 +20,7 @@ body:
       description: |
         Please note that if the dataset's open-source implementation is not available,
         it'll take much more time to finish the implementation, so we are less likely to implement it in `TSDB`.
       options:
-        - label: "The dataset implementation is publicly available"
+        - label: "The dataset is publicly available"
 
   - type: textarea
     id: additional-info
diff --git a/.github/workflows/greetings.yml b/.github/workflows/greetings.yml
index 62c581d..55f76dc 100644
--- a/.github/workflows/greetings.yml
+++ b/.github/workflows/greetings.yml
@@ -18,7 +18,7 @@ jobs:
     steps:
       - uses: actions/first-interaction@v1
         with:
-          repo-token: ${{ secrets.ACCESS_TOKEN }}
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
           issue-message: |
             Hi there 👋,

From 4cdea39dc8035c99f614b3f1c939f1e80d00604d Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Wed, 31 Jul 2024 22:39:20 +0800
Subject: [PATCH 3/5] feat: release v0.6.2;

---
 tsdb/__init__.py | 3 ++-
 tsdb/version.py  | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tsdb/__init__.py b/tsdb/__init__.py
index 394f58e..9ed55fa 100644
--- a/tsdb/__init__.py
+++ b/tsdb/__init__.py
@@ -1,5 +1,6 @@
 """
-TSDB (Time Series Data Beans): a Python toolbox to ease loading public time-series datasets.
+TSDB (Time Series Data Beans): a Python toolbox that loads hundreds of public time-series datasets for machine/deep learning
+with a single line of code.
 """
 
 # Created by Wenjie Du
diff --git a/tsdb/version.py b/tsdb/version.py
index d0fad9a..4c957b2 100644
--- a/tsdb/version.py
+++ b/tsdb/version.py
@@ -21,4 +21,4 @@
 #
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
-__version__ = "0.6.1"
+__version__ = "0.6.2"

From e38d32e41e0dddb5487660f639377aa3043bdb92 Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Wed, 31 Jul 2024 23:21:12 +0800
Subject: [PATCH 4/5] refactor: fix linting error;

---
 tsdb/loading_funcs/ucr_uea_datasets.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/tsdb/loading_funcs/ucr_uea_datasets.py b/tsdb/loading_funcs/ucr_uea_datasets.py
index 611651d..37bc4b9 100644
--- a/tsdb/loading_funcs/ucr_uea_datasets.py
+++ b/tsdb/loading_funcs/ucr_uea_datasets.py
@@ -17,13 +17,6 @@
 import numpy as np
 from sklearn.utils.estimator_checks import _NotAnArray as NotAnArray
 
-try:
-    from scipy.io import arff
-
-    HAS_ARFF = True
-except Exception:
-    HAS_ARFF = False
-
 
 def load_ucr_uea_dataset(local_path, dataset_name):
     try:

From ad4e1fb2cbb34c87904bbeb576ba683ce9e13f91 Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Sat, 3 Aug 2024 19:50:25 +0800
Subject: [PATCH 5/5] test: test loading ucr_uea_FingerMovements which is in arff;

---
 tests/test_tsdb.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_tsdb.py b/tests/test_tsdb.py
index f09600a..44741cc 100644
--- a/tests/test_tsdb.py
+++ b/tests/test_tsdb.py
@@ -23,6 +23,7 @@
     "pems_traffic",
     "solar_alabama",
     "ucr_uea_Wine",
+    "ucr_uea_FingerMovements",
 ]
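Review note: to reproduce the new test case locally, a minimal check through the public API, assuming the `tsdb.load` entry point, which downloads the dataset on first use and then reads it from the local cache:

```python
import tsdb

# FingerMovements ships only in ARFF, so this exercises the new loader path.
data = tsdb.load("ucr_uea_FingerMovements")

# Class labels stay as strings now that the .astype(float) casts were removed.
print(data["X_train"].shape, data["y_train"].shape)
print(data["X_test"].shape, data["y_test"].shape)
```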