diff --git a/.github/ISSUE_TEMPLATE/dataset-addition.yml b/.github/ISSUE_TEMPLATE/dataset-addition.yml index a297100..d5e13b1 100644 --- a/.github/ISSUE_TEMPLATE/dataset-addition.yml +++ b/.github/ISSUE_TEMPLATE/dataset-addition.yml @@ -20,7 +20,7 @@ body: description: | Please note that if the dataset's open-source implementation is not available, it'll take much more time to finish the implementation, so we are less likely to implement it in `TSDB`. options: - - label: "The dataset implementation is publicly available" + - label: "The dataset is publicly available" - type: textarea id: additional-info diff --git a/.github/workflows/greetings.yml b/.github/workflows/greetings.yml index 62c581d..55f76dc 100644 --- a/.github/workflows/greetings.yml +++ b/.github/workflows/greetings.yml @@ -18,7 +18,7 @@ jobs: steps: - uses: actions/first-interaction@v1 with: - repo-token: ${{ secrets.ACCESS_TOKEN }} + repo-token: ${{ secrets.GITHUB_TOKEN }} issue-message: | Hi there 👋, diff --git a/tests/test_tsdb.py b/tests/test_tsdb.py index f09600a..44741cc 100644 --- a/tests/test_tsdb.py +++ b/tests/test_tsdb.py @@ -23,6 +23,7 @@ "pems_traffic", "solar_alabama", "ucr_uea_Wine", + "ucr_uea_FingerMovements", ] diff --git a/tsdb/__init__.py b/tsdb/__init__.py index 394f58e..9ed55fa 100644 --- a/tsdb/__init__.py +++ b/tsdb/__init__.py @@ -1,5 +1,6 @@ """ -TSDB (Time Series Data Beans): a Python toolbox to ease loading public time-series datasets. +TSDB (Time Series Data Beans): a Python toolbox loads hundreds of public time-series datasets for machine/deep learning +with a single line of code. """ # Created by Wenjie Du diff --git a/tsdb/loading_funcs/ucr_uea_datasets.py b/tsdb/loading_funcs/ucr_uea_datasets.py index 09896c3..37bc4b9 100644 --- a/tsdb/loading_funcs/ucr_uea_datasets.py +++ b/tsdb/loading_funcs/ucr_uea_datasets.py @@ -14,36 +14,29 @@ import os import warnings -import numpy +import numpy as np from sklearn.utils.estimator_checks import _NotAnArray as NotAnArray -try: - from scipy.io import arff - - HAS_ARFF = True -except Exception: - HAS_ARFF = False - def load_ucr_uea_dataset(local_path, dataset_name): try: # if both TXT and ARFF files are provided, the TXT versions are # used # both training and test data must be available in the same format - if _has_files(local_path, dataset_name, ext="txt"): - X_train, y_train = _load_txt_uea( - os.path.join(local_path, dataset_name + "_TRAIN.txt") - ) - X_test, y_test = _load_txt_uea( - os.path.join(local_path, dataset_name + "_TEST.txt") - ) - elif _has_files(local_path, dataset_name, ext="arff"): + if _has_files(local_path, dataset_name, ext="arff"): X_train, y_train = _load_arff_uea( os.path.join(local_path, dataset_name + "_TRAIN.arff") ) X_test, y_test = _load_arff_uea( os.path.join(local_path, dataset_name + "_TEST.arff") ) + elif _has_files(local_path, dataset_name, ext="txt"): + X_train, y_train = _load_txt_uea( + os.path.join(local_path, dataset_name + "_TRAIN.txt") + ) + X_test, y_test = _load_txt_uea( + os.path.join(local_path, dataset_name + "_TEST.txt") + ) else: warnings.warn( 'dataset "%s" is not provided in either TXT ' @@ -55,9 +48,9 @@ def load_ucr_uea_dataset(local_path, dataset_name): data = { "X_train": X_train, - "y_train": y_train.astype(float), + "y_train": y_train, "X_test": X_test, - "y_test": y_test.astype(float), + "y_test": y_test, } return data @@ -115,24 +108,24 @@ def ts_size(ts): Examples -------- - >>> ts_size([1, 2, 3, numpy.nan]) + >>> ts_size([1, 2, 3, np.nan]) 3 - >>> ts_size([1, numpy.nan]) + >>> ts_size([1, np.nan]) 1 - >>> ts_size([numpy.nan]) + >>> ts_size([np.nan]) 0 >>> ts_size([[1, 2], ... [2, 3], ... [3, 4], - ... [numpy.nan, 2], - ... [numpy.nan, numpy.nan]]) + ... [np.nan, 2], + ... [np.nan, np.nan]]) 4 - >>> ts_size([numpy.nan, 3, numpy.inf, numpy.nan]) + >>> ts_size([np.nan, 3, np.inf, np.nan]) 3 """ ts_ = to_time_series(ts) sz = ts_.shape[0] - while sz > 0 and numpy.all(numpy.isnan(ts_[sz - 1])): + while sz > 0 and np.all(np.isnan(ts_[sz - 1])): sz -= 1 return sz @@ -151,7 +144,7 @@ def to_time_series(ts, remove_nans=False): Returns ------- - numpy.ndarray of shape (sz, d) + np.ndarray of shape (sz, d) The transformed time series. This is always guaraneteed to be a new time series and never just a view into the old one. @@ -160,11 +153,11 @@ def to_time_series(ts, remove_nans=False): >>> to_time_series([1, 2]) array([[1.], [2.]]) - >>> to_time_series([1, 2, numpy.nan]) + >>> to_time_series([1, 2, np.nan]) array([[ 1.], [ 2.], [nan]]) - >>> to_time_series([1, 2, numpy.nan], remove_nans=True) + >>> to_time_series([1, 2, np.nan], remove_nans=True) array([[1.], [2.]]) @@ -172,7 +165,7 @@ def to_time_series(ts, remove_nans=False): -------- to_time_series_dataset : Transforms a dataset of time series """ - ts_out = numpy.array(ts, copy=True) + ts_out = np.array(ts, copy=True) if ts_out.ndim <= 1: ts_out = ts_out.reshape((-1, 1)) if ts_out.dtype != float: @@ -196,7 +189,7 @@ def to_time_series_dataset(dataset, dtype=float): Returns ------- - numpy.ndarray of shape (n_ts, sz, d) + np.ndarray of shape (n_ts, sz, d) The transformed dataset of time series. Examples @@ -226,79 +219,102 @@ def to_time_series_dataset(dataset, dtype=float): import pandas as pd if isinstance(dataset, pd.DataFrame): - return to_time_series_dataset(numpy.array(dataset)) + return to_time_series_dataset(np.array(dataset)) except ImportError: pass if isinstance(dataset, NotAnArray): # Patch to pass sklearn tests - return to_time_series_dataset(numpy.array(dataset)) + return to_time_series_dataset(np.array(dataset)) if len(dataset) == 0: - return numpy.zeros((0, 0, 0)) - if numpy.array(dataset[0]).ndim == 0: + return np.zeros((0, 0, 0)) + if np.array(dataset[0]).ndim == 0: dataset = [dataset] n_ts = len(dataset) max_sz = max([ts_size(to_time_series(ts, remove_nans=True)) for ts in dataset]) d = to_time_series(dataset[0]).shape[1] - dataset_out = numpy.zeros((n_ts, max_sz, d), dtype=dtype) + numpy.nan + dataset_out = np.zeros((n_ts, max_sz, d), dtype=dtype) + np.nan for i in range(n_ts): ts = to_time_series(dataset[i], remove_nans=True) dataset_out[i, : ts.shape[0]] = ts return dataset_out.astype(dtype) -def _load_arff_uea(dataset_path): - """Load arff file for uni/multi variate dataset +def _load_arff_uea( + full_file_path_and_name, + replace_missing_vals_with="NaN", +): + """Load data from a classification/regression WEKA arff file to a 3D np array. Parameters ---------- - dataset_path: string of dataset_path - Path to the ARFF file to be read + full_file_path_and_name: str + The full pathname of the .ts file to read. + replace_missing_vals_with: str + The value that missing values in the text file should be replaced + with prior to parsing. Returns ------- - x: numpy array of shape (n_timeseries, n_timestamps, n_features) - Time series dataset - y: numpy array of shape (n_timeseries, ) - Vector of targets - - Raises - ------ - ImportError: if the version of *Scipy* is too old (pre 1.3.0) - Exception: on any failure, e.g. if the given file does not exist or is - corrupted + data: np.ndarray + time series data, np.ndarray (n_cases, n_channels, n_timepoints) + y : np.ndarray of string or int + target variable """ - if not HAS_ARFF: - raise ImportError( - "scipy 1.3.0 or newer is required to load " - "time series datasets from arff format." - ) - data, meta = arff.loadarff(dataset_path) - names = meta.names() # ["input", "class"] for multi-variate - - # firstly get y_train - y_ = data[names[-1]] # data["class"] - y = numpy.array(y_).astype("str") - - # get x_train - if len(names) == 2: # len=2 => multi-variate - x_ = data[names[0]] - x_ = numpy.asarray(x_.tolist()) - - nb_example = x_.shape[0] - nb_channel = x_.shape[1] - length_one_channel = len(x_.dtype.descr) - x = numpy.empty([nb_example, length_one_channel, nb_channel]) - - for i in range(length_one_channel): - # x_.dtype.descr: [('t1', '