diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a55a9bb..bb2982c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,7 +37,7 @@ jobs: strategy: matrix: os: [ubuntu-20.04, macOS-11] - python-version: ['3.7', '3.8', '3.9', '3.10'] + python-version: ['3.8', '3.9', '3.10'] steps: - uses: actions/checkout@v2 @@ -67,11 +67,6 @@ jobs: matrix: include: # Window 64 bit - - os: windows-2019 - python: 37 - python-version: '3.7' - bitness: 64 - platform_id: win_amd64 - os: windows-2019 python: 38 python-version: '3.8' diff --git a/README.md b/README.md index 8a07a06..27da1ab 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # This package is moving to the aeon-toolkit. -Starting from v0.2.7, this package will not be updated. You can already find an updated version of RDST in the Aeon package at https://github.com/aeon-toolkit/aeon/blob/main/aeon/transformations/panel/dilated_shapelet_transform.py . This version should allow for faster transformation times due to using online normalization and better normalized distance computation, it uses the euclidean distance by default instead of the manhattan distance in convst. All the functionnalities of this package will be ported into Aeon when I got some time, for now, only the transformer for univariate and multivariate series of even length have been implemented. +Starting from v0.3.0, this package will not be updated; bugfixes will still be included if issues are raised. +You can already find an updated version of RDST in the Aeon package at https://github.com/aeon-toolkit/ . Further improvements to speed up RDST are planned; these improvements will only be implemented in aeon. +All the functionalities of this package will be ported into Aeon when I get some time; for now, only the transformers for univariate and multivariate series of equal length have been implemented. # Readme Welcome to the convst repository. It contains the implementation of the `Random Dilated Shapelet Transform (RDST)` along with other works in the same area. @@ -34,17 +36,18 @@ We recommend doing this in a new virtual environment using anaconda to avoid any An optional dependency that can help speed up numba, which is used in our implementation, is the Intel vector math library (SVML). When using conda it can be installed by running `conda install -c numba icc_rt`. I didn't test the behavior with AMD processors, but I suspect it won't work. -If you are using RDST in some specific settings such as an HPC cluster and are getting errors, take a loot at [issue #24](https://github.com/baraline/convst/issues/24), you may need to change the numba compilation settings to not using function caching (see [this example](https://github.com/baraline/convst/blob/main/examples/Changing_numba_options.py)). +If you are using RDST in specific settings such as an HPC cluster and are getting errors, take a look at [issue #24](https://github.com/baraline/convst/issues/24); you may need to change the numba compilation settings to not use function caching (see [this example](https://github.com/baraline/convst/blob/main/examples/Changing_numba_options.py)). 
This should be fixed with v0.3.0. + ## Tutorial -We give here a minimal example to run the `RDST` algorithm on any dataset of the UCR archive using the sktime API to get datasets: +Below is a minimal example that runs the `RDST` algorithm on any dataset of the UCR archive, using the aeon API to get the data: ```python from convst.classifiers import R_DST_Ridge -from convst.utils.dataset_utils import load_sktime_dataset_split +from convst.utils.dataset_utils import load_UCR_UEA_dataset_split -X_train, X_test, y_train, y_test, _ = load_sktime_dataset_split('GunPoint') +X_train, X_test, y_train, y_test, _ = load_UCR_UEA_dataset_split('GunPoint') # First run may be slow due to numba compilations on the first call. # Run a small dataset like GunPoint if this is the first time you call RDST on your system. diff --git a/convst/__init__.py b/convst/__init__.py index 7fd2bc0..974d655 100644 --- a/convst/__init__.py +++ b/convst/__init__.py @@ -1,6 +1,6 @@ __author__ = 'Antoine Guillaume antoine.guillaume45@gmail.com' -__version__ = "0.2.7" +__version__ = "0.3.0" __all__ = ['transformers', 'classifiers', 'utils', 'interpreters'] diff --git a/convst/classifiers/rdst_ensemble.py b/convst/classifiers/rdst_ensemble.py index e072167..5580582 100644 --- a/convst/classifiers/rdst_ensemble.py +++ b/convst/classifiers/rdst_ensemble.py @@ -76,10 +76,6 @@ class R_DST_Ensemble(BaseEstimator, ClassifierMixin): phase_invariance : bool, optional Wheter to use phase invariance for shapelet sampling and distance computation. The default is False. - distance : str, optional - The distance function to use whe computing distances between shapelets - and time series. Choose between 'euclidean','manhattan' and 'squared_euclidean'. - The default is 'manhattan'. alpha : float, optional The alpha similarity parameter, the higher the value, the lower the allowed number of common indexes with previously sampled shapelets @@ -159,7 +155,6 @@ def __init__( self, phase_invariance=False, input_transformers=None, transform_type='auto', - distance='manhattan', normalize_output=False, percentiles=[5,10], max_channels=None, @@ -170,7 +165,6 @@ ): self.transform_type = transform_type self.phase_invariance = check_is_boolean(phase_invariance) - self.distance = distance self.normalize_output = check_is_boolean(normalize_output) self.n_samples = check_is_numeric(n_samples) if n_samples is not None else n_samples self.shapelet_lengths_bounds = shapelet_lengths_bounds diff --git a/convst/classifiers/rdst_ridge.py b/convst/classifiers/rdst_ridge.py index e0a5b21..9d1bfdf 100644 --- a/convst/classifiers/rdst_ridge.py +++ b/convst/classifiers/rdst_ridge.py @@ -38,10 +38,6 @@ class R_DST_Ridge(BaseEstimator, ClassifierMixin): phase_invariance : bool, optional Wheter to use phase invariance for shapelet sampling and distance computation. The default is False. - distance : str, optional - The distance function to use whe computing distances between shapelets - and time series. Choose between 'euclidean','manhattan' and 'squared_euclidean'. - The default is 'manhattan'. 
alpha : float, optional The alpha similarity parameter, the higher the value, the lower the allowed number of common indexes with previously sampled shapelets @@ -109,7 +105,6 @@ def __init__( self, transform_type='auto', phase_invariance=False, - distance='manhattan', alpha=0.5, normalize_output=False, n_samples=None, @@ -133,7 +128,6 @@ def __init__( self.transform_type=transform_type self.phase_invariance=phase_invariance self.prime_dilations=prime_dilations - self.distance=distance self.alpha=alpha self.normalize_output=normalize_output self.n_samples=n_samples @@ -170,7 +164,6 @@ def _init_components(self): self.transformer = R_DST( transform_type=self.transform_type, phase_invariance=self.phase_invariance, - distance=self.distance, alpha=self.alpha, prime_dilations=self.prime_dilations, shapelet_lengths_bounds=self.shapelet_lengths_bounds, diff --git a/convst/transformers/_commons.py b/convst/transformers/_commons.py index 8a3dbc6..dfad845 100644 --- a/convst/transformers/_commons.py +++ b/convst/transformers/_commons.py @@ -299,23 +299,23 @@ def _get_subsequence_phase(X, i_start, length, d, normalize): cache=__USE_NUMBA_CACHE__, nogil=__USE_NUMBA_NOGIL__ ) def compute_shapelet_dist_vector( - x, values, length, dilation, dist_func, normalize, use_phase + x, values, length, dilation, normalize, use_phase ): if normalize and use_phase: return _compute_shapelet_dist_vector_norm_phase( - x, values, length, dilation, dist_func + x, values, length, dilation ) elif normalize and not use_phase: return _compute_shapelet_dist_vector_norm( - x, values, length, dilation, dist_func + x, values, length, dilation ) elif not normalize and use_phase: return _compute_shapelet_dist_vector_phase( - x, values, length, dilation, dist_func + x, values, length, dilation ) elif not normalize and not use_phase: return _compute_shapelet_dist_vector( - x, values, length, dilation, dist_func + x, values, length, dilation ) else: raise ValueError('Wrong parameter for normalize or phase') @@ -324,7 +324,7 @@ def compute_shapelet_dist_vector( @njit( cache=__USE_NUMBA_CACHE__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__ ) -def _compute_shapelet_dist_vector(x, values, length, dilation, dist_func): +def _compute_shapelet_dist_vector(x, values, length, dilation): """ Compute a shapelet distance vector from an univariate time series and a dilated shapelet. Shapelet should be already normalized if normalizing @@ -353,14 +353,14 @@ def _compute_shapelet_dist_vector(x, values, length, dilation, dist_func): c = _generate_strides_1D(x, length, dilation) x_conv = zeros(c.shape[0]) for i in prange(x_conv.shape[0]): - x_conv[i] = dist_func(c[i], values) + x_conv[i] = manhattan(c[i], values) return x_conv @njit( cache=__USE_NUMBA_CACHE__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__ ) -def _compute_shapelet_dist_vector_norm(x, values, length, dilation, dist_func): +def _compute_shapelet_dist_vector_norm(x, values, length, dilation): """ Compute a shapelet distance vector from an univariate time series and a dilated shapelet. 
Shapelet should be already normalized if normalizing @@ -390,14 +390,14 @@ def _compute_shapelet_dist_vector_norm(x, values, length, dilation, dist_func): x_conv = zeros(c.shape[0]) for i in prange(x_conv.shape[0]): x0 = (c[i] - c[i].mean())/(c[i].std()+1e-8) - x_conv[i] = dist_func(x0, values) + x_conv[i] = manhattan(x0, values) return x_conv @njit( cache=__USE_NUMBA_CACHE__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__ ) -def _compute_shapelet_dist_vector_phase(x, values, length, dilation, dist_func): +def _compute_shapelet_dist_vector_phase(x, values, length, dilation): """ Compute a shapelet distance vector from an univariate time series and a dilated shapelet. Shapelet should be already normalized if normalizing @@ -426,14 +426,14 @@ def _compute_shapelet_dist_vector_phase(x, values, length, dilation, dist_func): c = _generate_strides_1D_phase(x, length, dilation) x_conv = zeros(c.shape[0]) for i in prange(x_conv.shape[0]): - x_conv[i] = dist_func(c[i], values) + x_conv[i] = manhattan(c[i], values) return x_conv @njit( cache=__USE_NUMBA_CACHE__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__ ) -def _compute_shapelet_dist_vector_norm_phase(x, values, length, dilation, dist_func): +def _compute_shapelet_dist_vector_norm_phase(x, values, length, dilation): """ Compute a shapelet distance vector from an univariate time series and a dilated shapelet. Shapelet should be already normalized if normalizing @@ -463,7 +463,7 @@ def _compute_shapelet_dist_vector_norm_phase(x, values, length, dilation, dist_f x_conv = zeros(c.shape[0]) for i in prange(x_conv.shape[0]): x0 = (c[i] - c[i].mean())/(c[i].std()+1e-8) - x_conv[i] = dist_func(x0, values) + x_conv[i] = manhattan(x0, values) return x_conv @@ -471,7 +471,7 @@ def _compute_shapelet_dist_vector_norm_phase(x, values, length, dilation, dist_f @njit( cache=__USE_NUMBA_CACHE__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__ ) -def apply_one_shapelet_one_sample_univariate(x, values, threshold, dist_func): +def apply_one_shapelet_one_sample_univariate(x, values, threshold): """ Extract the three features from the distance between a shapelet and the strides of an input time series generated by the length and dilation @@ -508,7 +508,7 @@ def apply_one_shapelet_one_sample_univariate(x, values, threshold, dist_func): #For each step of the moving window in the shapelet distance for i in prange(n_candidates): - _dist = dist_func(x[i], values) + _dist = manhattan(x[i], values) if _dist < _min or _min==-1.: _min = _dist @@ -523,7 +523,7 @@ def apply_one_shapelet_one_sample_univariate(x, values, threshold, dist_func): @njit( cache=__USE_NUMBA_CACHE__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__ ) -def apply_one_shapelet_one_sample_multivariate(x, values, threshold, dist_func): +def apply_one_shapelet_one_sample_multivariate(x, values, threshold): """ Extract the three features from the distance between a shapelet and the strides of an input time series generated by the length and dilation @@ -562,7 +562,7 @@ def apply_one_shapelet_one_sample_multivariate(x, values, threshold, dist_func): for i in prange(n_candidates): _dist = 0 for ft in prange(n_ft): - _dist += dist_func(x[ft, i], values[ft]) + _dist += manhattan(x[ft, i], values[ft]) if _dist < _min or _min == -1.: _min = _dist diff --git a/convst/transformers/_multivariate_same_length.py b/convst/transformers/_multivariate_same_length.py index 56118ed..ba301be 100644 --- a/convst/transformers/_multivariate_same_length.py +++ 
b/convst/transformers/_multivariate_same_length.py @@ -105,7 +105,7 @@ def M_SL_init_random_shapelet_params( @njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, nogil=__USE_NUMBA_NOGIL__) def M_SL_generate_shapelet( X, y, n_shapelets, shapelet_sizes, r_seed, p_norm, p_min, p_max, alpha, - dist_func, use_phase, max_channels, prime_scheme + use_phase, max_channels, prime_scheme ): """ Given a time series dataset and parameters of the method, generate the @@ -131,9 +131,6 @@ def M_SL_generate_shapelet( Upper bound for the percentile during the choice of threshold alpha : float Alpha similarity parameter - dist_func: function - A distance function implemented with Numba taking two 1D vectors as - input. use_phase: bool Wheter to use phase invariance prime_scheme : bool @@ -246,7 +243,7 @@ def M_SL_generate_shapelet( #Compute distance vector x_dist += compute_shapelet_dist_vector( X[id_test, _channel_ids[k]], _v, _length, _dilation, - dist_func, norm, use_phase + norm, use_phase ) _values[a3:b3] = _v @@ -296,7 +293,7 @@ def M_SL_generate_shapelet( @njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__) def M_SL_apply_all_shapelets( - X, shapelets, dist_func, use_phase + X, shapelets, use_phase ): """ Apply a set of generated shapelet using the parameter arrays previously @@ -319,9 +316,6 @@ def M_SL_apply_all_shapelets( Threshold parameter of the shapelets normalize : array, shape=(n_shapelets) Normalization indicator of the shapelets - dist_func: function - A distance function implemented with Numba taking two 1D vectors as - input. use_phase: bool Wheter to use phase invariance @@ -388,7 +382,7 @@ def M_SL_apply_all_shapelets( X_new[i_sample, (n_features * i_shp):(n_features * i_shp + n_features)] = \ apply_one_shapelet_one_sample_multivariate( - strides[_channels], _values, threshold[i_shp], dist_func + strides[_channels], _values, threshold[i_shp] ) _idx_norm = _idx_shp[where(normalize[_idx_shp] == True)[0]] @@ -409,6 +403,6 @@ def M_SL_apply_all_shapelets( X_new[i_sample, (n_features * i_shp):(n_features * i_shp + n_features)] = \ apply_one_shapelet_one_sample_multivariate( - strides[_channels], _values, threshold[i_shp], dist_func + strides[_channels], _values, threshold[i_shp] ) return X_new \ No newline at end of file diff --git a/convst/transformers/_multivariate_variable_length.py b/convst/transformers/_multivariate_variable_length.py index bd1aaa1..025781c 100644 --- a/convst/transformers/_multivariate_variable_length.py +++ b/convst/transformers/_multivariate_variable_length.py @@ -105,7 +105,7 @@ def M_VL_init_random_shapelet_params( @njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, nogil=__USE_NUMBA_NOGIL__) def M_VL_generate_shapelet( X, y, n_shapelets, shapelet_sizes, r_seed, p_norm, p_min, p_max, alpha, - dist_func, use_phase, max_channels, min_len, X_len, prime_scheme + use_phase, max_channels, min_len, X_len, prime_scheme ): """ Given a time series dataset and parameters of the method, generate the @@ -131,9 +131,6 @@ def M_VL_generate_shapelet( Upper bound for the percentile during the choice of threshold alpha : float Alpha similarity parameter - dist_func: function - A distance function implemented with Numba taking two 1D vectors as - input. 
use_phase: bool Wheter to use phase invariance min_len : int @@ -276,7 +273,7 @@ def M_VL_generate_shapelet( #Compute distance vector x_dist += compute_shapelet_dist_vector( X[id_test, _channel_ids[k], :X_len[id_test]], _v, _length, _dilation, - dist_func, norm, use_phase + norm, use_phase ) _values[a3:b3] = _v @@ -327,7 +324,7 @@ def M_VL_generate_shapelet( @njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__) def M_VL_apply_all_shapelets( - X, shapelets, dist_func, use_phase, X_len + X, shapelets, use_phase, X_len ): """ Apply a set of generated shapelet using the parameter arrays previously @@ -350,9 +347,6 @@ def M_VL_apply_all_shapelets( Threshold parameter of the shapelets normalize : array, shape=(n_shapelets) Normalization indicatorr of the shapelets - dist_func: function - A distance function implemented with Numba taking two 1D vectors as - input. use_phase: bool Wheter to use phase invariance X_len : array, shape=(n_samples) @@ -419,7 +413,7 @@ def M_VL_apply_all_shapelets( X_new[i_sample, (n_features * i_shp):(n_features * i_shp + n_features)] = \ apply_one_shapelet_one_sample_multivariate( - strides[_channels], _values, threshold[i_shp], dist_func + strides[_channels], _values, threshold[i_shp] ) _idx_norm = _idx_shp[where(normalize[_idx_shp] == True)[0]] @@ -440,6 +434,6 @@ def M_VL_apply_all_shapelets( X_new[i_sample, (n_features * i_shp):(n_features * i_shp + n_features)] = \ apply_one_shapelet_one_sample_multivariate( - strides[_channels], _values, threshold[i_shp], dist_func + strides[_channels], _values, threshold[i_shp] ) return X_new \ No newline at end of file diff --git a/convst/transformers/_univariate_same_length.py b/convst/transformers/_univariate_same_length.py index 28da2a1..292d6d5 100644 --- a/convst/transformers/_univariate_same_length.py +++ b/convst/transformers/_univariate_same_length.py @@ -89,7 +89,7 @@ def U_SL_init_random_shapelet_params( @njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, nogil=__USE_NUMBA_NOGIL__) def U_SL_generate_shapelet( X, y, n_shapelets, shapelet_sizes, r_seed, p_norm, p_min, p_max, alpha, - dist_func, use_phase, prime_scheme + use_phase, prime_scheme ): """ Given a time series dataset and parameters of the method, generate the @@ -116,9 +116,6 @@ def U_SL_generate_shapelet( alpha : float Alpha similarity parameter, higher values (close to 1) means higher similarity prunning. - dist_func: function - A distance function implemented with Numba taking two 1D vectors as - input. use_phase: bool Wheter to use phase invariance prime_scheme : bool @@ -207,7 +204,7 @@ def U_SL_generate_shapelet( #Compute distance vector x_dist = compute_shapelet_dist_vector( - X[id_test, 0], v, _length, _dilation, dist_func, norm, + X[id_test, 0], v, _length, _dilation, norm, use_phase ) @@ -234,7 +231,7 @@ def U_SL_generate_shapelet( @njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__) def U_SL_apply_all_shapelets( - X, shapelets, dist_func, use_phase + X, shapelets, use_phase ): """ Apply a set of generated shapelet using the parameter arrays previously @@ -257,9 +254,6 @@ def U_SL_apply_all_shapelets( Threshold parameter of the shapelets normalize : array, shape=(n_shapelets) Normalization indicatorr of the shapelets - dist_func: function - A distance function implemented with Numba taking two 1D vectors as - input. 
use_phase: bool Wheter to use phase invariance @@ -316,7 +310,7 @@ def U_SL_apply_all_shapelets( X_new[i_sample, (n_features * i_shp):(n_features * i_shp + n_features)] = \ apply_one_shapelet_one_sample_univariate( - strides, _values, threshold[i_shp], dist_func + strides, _values, threshold[i_shp] ) _idx_norm = _idx_shp[where(normalize[_idx_shp] == True)[0]] @@ -331,7 +325,7 @@ def U_SL_apply_all_shapelets( X_new[i_sample, (n_features * i_shp):(n_features * i_shp + n_features)] = \ apply_one_shapelet_one_sample_univariate( - strides, _values, threshold[i_shp], dist_func + strides, _values, threshold[i_shp] ) return X_new \ No newline at end of file diff --git a/convst/transformers/_univariate_variable_length.py b/convst/transformers/_univariate_variable_length.py index 1dd66a5..13e5afd 100644 --- a/convst/transformers/_univariate_variable_length.py +++ b/convst/transformers/_univariate_variable_length.py @@ -93,7 +93,7 @@ def U_VL_init_random_shapelet_params( @njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, nogil=__USE_NUMBA_NOGIL__) def U_VL_generate_shapelet( X, y, n_shapelets, shapelet_sizes, r_seed, p_norm, p_min, p_max, alpha, - dist_func, use_phase, min_len, X_len, prime_scheme + use_phase, min_len, X_len, prime_scheme ): """ Given a time series dataset and parameters of the method, generate the @@ -120,9 +120,6 @@ def U_VL_generate_shapelet( Upper bound for the percentile during the choice of threshold alpha : float Alpha similarity parameter - dist_func: function - A distance function implemented with Numba taking two 1D vectors as - input. use_phase: bool Wheter to use phase invariance min_len : int @@ -238,7 +235,7 @@ def U_VL_generate_shapelet( #Compute distance vector x_dist = compute_shapelet_dist_vector( X[id_test, 0, :X_len[id_test]], v, _length, - _dilation, dist_func, norm, use_phase + _dilation, norm, use_phase ) #Extract value between two percentile as threshold for SO @@ -264,7 +261,7 @@ def U_VL_generate_shapelet( @njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__) def U_VL_apply_all_shapelets( - X, shapelets, dist_func, use_phase, X_len + X, shapelets, use_phase, X_len ): """ Apply a set of generated shapelet using the parameter arrays previously @@ -288,9 +285,6 @@ def U_VL_apply_all_shapelets( Threshold parameter of the shapelets normalize : array, shape=(n_shapelets) Normalization indicatorr of the shapelets - dist_func: function - A distance function implemented with Numba taking two 1D vectors as - input. 
use_phase: bool Wheter to use phase invariance X_len : array, shape=(n_samples) @@ -348,7 +342,7 @@ def U_VL_apply_all_shapelets( X_new[i_sample, (n_features * i_shp):(n_features * i_shp + n_features)] = \ apply_one_shapelet_one_sample_univariate( - strides, _values, threshold[i_shp], dist_func + strides, _values, threshold[i_shp] ) _idx_norm = _idx_shp[where(normalize[_idx_shp] == True)[0]] @@ -363,7 +357,7 @@ def U_VL_apply_all_shapelets( X_new[i_sample, (n_features * i_shp):(n_features * i_shp + n_features)] = \ apply_one_shapelet_one_sample_univariate( - strides, _values, threshold[i_shp], dist_func + strides, _values, threshold[i_shp] ) return X_new \ No newline at end of file diff --git a/convst/transformers/rdst.py b/convst/transformers/rdst.py index b2b4f89..fdeac01 100644 --- a/convst/transformers/rdst.py +++ b/convst/transformers/rdst.py @@ -13,10 +13,10 @@ from sklearn.utils.validation import check_is_fitted, check_random_state from convst.utils.checks_utils import ( - check_array_3D, check_array_1D, check_is_numeric, + check_array_3D, check_array_1D, check_is_numeric, check_is_boolean, check_n_jobs ) -from convst.transformers._commons import manhattan, euclidean, squared_euclidean + from numba import set_num_threads @@ -25,6 +25,7 @@ STR_UNIVARIATE_VARIABLE = 'univariate_variable' STR_MULTIVARIATE_VARIABLE = 'multivariate_variable' + class R_DST(BaseEstimator, TransformerMixin): """ Base class for RDST transformer. Depending on the parameters and of the @@ -47,10 +48,6 @@ class R_DST(BaseEstimator, TransformerMixin): phase_invariance : bool, optional Wheter to use phase invariance for shapelet sampling and distance computation. The default is False. - distance : str, optional - The distance function to use whe computing distances between shapelets - and time series. Choose between 'euclidean','manhattan' and 'squared_euclidean'. - The default is 'manhattan'. 
alpha : float, optional The alpha similarity parameter, the higher the value, the lower the allowed number of common indexes with previously sampled shapelets @@ -119,7 +116,6 @@ def __init__( self, transform_type='auto', phase_invariance=False, - distance='manhattan', alpha=0.5, normalize_output=False, n_samples=None, @@ -137,7 +133,6 @@ def __init__( ): self.transform_type = self._validate_transform_type(transform_type) self.phase_invariance = check_is_boolean(phase_invariance) - self.distance = self._validate_distances(distance) self.alpha = check_is_numeric(alpha) self.normalize_output = check_is_boolean(normalize_output) self.n_samples = check_is_numeric(n_samples) if n_samples is not None else n_samples @@ -230,7 +225,7 @@ def fit(self, X, y): if self.max_channels is None: self.max_channels = n_features - + self.shapelet_lengths = self._set_lengths() shapelet_lengths, seed = self._check_params(self.min_len) @@ -239,28 +234,26 @@ def fit(self, X, y): self.shapelets_ = self.fitter( X, y, self.n_shapelets, shapelet_lengths, seed, self.proba_norm, self.percentiles[0], self.percentiles[1], self.alpha, - self._get_distance_function(), self.phase_invariance, - self.min_len, X_len, self.prime_dilations + self.phase_invariance, self.min_len, X_len, self.prime_dilations ) elif self.transform_type == STR_MULTIVARIATE_VARIABLE: self.shapelets_ = self.fitter( X, y, self.n_shapelets, shapelet_lengths, seed, self.proba_norm, self.percentiles[0], self.percentiles[1], self.alpha, - self._get_distance_function(), self.phase_invariance, - self.max_channels, self.min_len, X_len, self.prime_dilations + self.phase_invariance, self.max_channels, self.min_len, X_len, + self.prime_dilations ) elif self.transform_type == STR_MUTLIVARIATE: self.shapelets_ = self.fitter( X, y, self.n_shapelets, shapelet_lengths, seed, self.proba_norm, self.percentiles[0], self.percentiles[1], self.alpha, - self._get_distance_function(), self.phase_invariance, - self.max_channels, self.prime_dilations + self.phase_invariance, self.max_channels, self.prime_dilations ) elif self.transform_type == STR_UNIVARIATE: self.shapelets_ = self.fitter( X, y, self.n_shapelets, shapelet_lengths, seed, self.proba_norm, self.percentiles[0], self.percentiles[1], self.alpha, - self._get_distance_function(), self.phase_invariance, self.prime_dilations + self.phase_invariance, self.prime_dilations ) else: raise ValueError('Unknown value for transform type parameter') @@ -292,14 +285,12 @@ def transform(self, X): X, X_len = self._format_uneven_timestamps(X) X = check_array_3D(X).astype(np.float64) X_new = self.transformer( - X, self.shapelets_ , self._get_distance_function(), - self.phase_invariance, X_len + X, self.shapelets_, self.phase_invariance, X_len ) else: X = check_array_3D(X).astype(np.float64) X_new = self.transformer( - X, self.shapelets_, self._get_distance_function(), - self.phase_invariance + X, self.shapelets_, self.phase_invariance ) return X_new @@ -391,32 +382,6 @@ def _set_fit_transform(self, X): else: raise ValueError('Unknwon transform type parameter') - - - def _get_distance_function(self): - """ - Based on the distance parameter, return the distance function to be - used during the shapelet generation and transform. - - Raises - ------ - ValueError - If the value of the distance parameter is not in ['euclidean', - 'squared','manhattan'], raise a ValueError. - - Returns - ------- - function - Return the numba function based on the distance parameter. 
- -        """ -        if self.distance == 'euclidean': -            return euclidean -        if self.distance == 'squared': -            return squared_euclidean -        if self.distance == 'manhattan': -            return manhattan -        raise ValueError('Wrong distance parameter value, got {}'.format(self.distance)) def _format_uneven_timestamps(self, X): """ @@ -531,10 +496,4 @@ def _validate_percentiles(self, percentiles): if percentiles[0] <= percentiles[1]: return percentiles raise ValueError('Wrong percetniles parameter value, got {}, expected a numerical array of size 2'.format(percentiles)) - -    def _validate_distances(self, distance_str): -        distance_str = distance_str.lower() -        valid = ['euclidean','squared','manhattan'] -        if distance_str not in valid: -            raise ValueError('Wrong distance parameter value, got {}, valid ones are {}'.format(distance_str, valid)) -        return distance_str \ No newline at end of file + \ No newline at end of file diff --git a/convst/utils/checks_utils.py b/convst/utils/checks_utils.py index e381500..1777bab 100644 --- a/convst/utils/checks_utils.py +++ b/convst/utils/checks_utils.py @@ -3,8 +3,6 @@ import numpy as np import pandas as pd from os import cpu_count -from sktime.datatypes._panel._convert import from_nested_to_3d_numpy -from sktime.datatypes._panel._check import is_nested_dataframe def is_int(x): """Check if x is of integer type, but not boolean.""" @@ -65,7 +63,6 @@ def check_array_3D(X, coerce_to_numpy=True, is_univariate=False, min_timestamps= Raises ------ ValueError - Returns ------- @@ -90,13 +87,7 @@ def check_array_3D(X, coerce_to_numpy=True, is_univariate=False, min_timestamps= ", found only: {}".format(min_timestamps,X.shape[2]) ) if isinstance(X, pd.DataFrame): - if not is_nested_dataframe(X): - raise ValueError( - "If passed as a pd.DataFrame, X must be a nested " - "pd.DataFrame, with pd.Series or np.arrays inside cells." - ) - if coerce_to_numpy: - X = from_nested_to_3d_numpy(X) + raise ValueError('Only numpy arrays are accepted as 3D inputs; pandas DataFrames are no longer supported') if is_univariate: if X.shape[1] != 1: raise ValueError( diff --git a/convst/utils/dataset_utils.py b/convst/utils/dataset_utils.py index bf6dc18..5ee0608 100644 --- a/convst/utils/dataset_utils.py +++ b/convst/utils/dataset_utils.py @@ -1,25 +1,14 @@ # -*- coding: utf-8 -*- import numpy as np -from sktime.datasets import (load_UCR_UEA_dataset, load_from_tsfile_to_dataframe, load_from_arff_to_dataframe, - -) -from sktime.datatypes._panel._convert import (from_multiindex_to_dflist, from_nested_to_multi_index ) + +from aeon.datasets._data_loaders import _load_dataset + from sklearn.preprocessing import LabelEncoder from numba import njit, prange from convst import __USE_NUMBA_CACHE__ -def _custom_from_nested_to_3d_numpy(X): - X = from_multiindex_to_dflist(from_nested_to_multi_index(X)) - if all([X[i].shape[0] == X[0].shape[0] for i in range(len(X))]): - return np.array([X[i].values.T for i in range(len(X))]) - else: - return [X[i].values.T for i in range(len(X))] - @njit(cache=__USE_NUMBA_CACHE__) def z_norm_3D(X): """ @@ -66,10 +55,10 @@ def z_norm_3D_list(X): return X -def load_sktime_dataset_split(name, normalize=False): +def load_UCR_UEA_dataset_split(name, normalize=False): """ Load the original train and test splits of a dataset - from the UCR/UEA archive by name using sktime API. + from the UCR/UEA archive by name using the aeon API. 
Parameters ---------- @@ -95,12 +84,8 @@ def load_sktime_dataset_split(name, normalize=False): """ #Load datasets - X_train, y_train = load_UCR_UEA_dataset(name, return_X_y=True, split='train') - X_test, y_test = load_UCR_UEA_dataset(name, return_X_y=True, split='test') - - #Convert pandas DataFrames to numpy arrays - X_train = _custom_from_nested_to_3d_numpy(X_train) - X_test = _custom_from_nested_to_3d_numpy(X_test) + X_train, y_train = _load_dataset(name, return_X_y=True, split='train') + X_test, y_test = _load_dataset(name, return_X_y=True, split='test') #Convert class labels to make sure they are between 0,n_classes le = LabelEncoder().fit(y_train) @@ -120,163 +105,9 @@ def load_sktime_dataset_split(name, normalize=False): return X_train, X_test, y_train, y_test, min_len -def load_sktime_arff_file(path, normalize=False): +def load_UCR_UEA_dataset(name, normalize=False): """ - Load a dataset from .arff files. - - Parameters - ---------- - path : string - Path to the folder containing the .ts file. Dataset name - should be specified at the end of path to find files as - "dataset_TRAIN.arff" and "dataset_TEST.arff" - normalize : boolean, optional - If True, time series will be z-normalized. The default is True. - - - Returns - ------- - X_train : array, shape=(n_samples_train, n_features, n_timestamps) - Training data from the dataset specified by path. - X_test : array, shape=(n_samples_test, n_features, n_timestamps) - Testing data from the dataset specified by path. - y_train : array, shape=(n_samples_train) - Class of the training data. - y_test : array, shape=(n_samples_test) - Class of the testing data. - le : LabelEncoder - LabelEncoder object used to uniformize the class labels - - """ - #Load datasets - X_train, y_train = load_from_arff_to_dataframe(path+'_TRAIN.arff') - X_test, y_test = load_from_arff_to_dataframe(path+'_TEST.arff') - - #Convert pandas DataFrames to numpy arrays - X_train = _custom_from_nested_to_3d_numpy(X_train) - X_test = _custom_from_nested_to_3d_numpy(X_test) - - #Convert class labels to make sure they are between 0,n_classes - le = LabelEncoder().fit(y_train) - y_train = le.transform(y_train) - y_test = le.transform(y_test) - - #Z-Normalize the data - if normalize: - X_train = (X_train - X_train.mean(axis=-1, keepdims=True)) / ( - X_train.std(axis=-1, keepdims=True) + 1e-8) - X_test = (X_test - X_test.mean(axis=-1, keepdims=True)) / ( - X_test.std(axis=-1, keepdims=True) + 1e-8) - - return X_train, X_test, y_train, y_test, le - - -def load_sktime_arff_file_resample_id(path, rs_id, normalize=False): - """ - Load a dataset resample from .arff files and the identifier of the - resample. - - Parameters - ---------- - path : string - Path to the folder containing the .ts file. Dataset name - should be specified at the end of path to find files as - "dataset_{rs_id}_TRAIN.arff" and "dataset_{rs_id}_TEST.arff" - rs_id : int or str - Identifier of the resample. - normalize : boolean, optional - If True, time series will be z-normalized. The default is True. - - - Returns - ------- - X_train : array, shape=(n_samples_train, n_features, n_timestamps) - Training data from the dataset specified by path. - X_test : array, shape=(n_samples_test, n_features, n_timestamps) - Testing data from the dataset specified by path. - y_train : array, shape=(n_samples_train) - Class of the training data. - y_test : array, shape=(n_samples_test) - Class of the testing data. 
- le : LabelEncoder - LabelEncoder object used to uniformize the class labels - - """ - #Load datasets - X_train, y_train = load_from_arff_to_dataframe(path+'_{}_TRAIN.arff'.format(rs_id)) - X_test, y_test = load_from_arff_to_dataframe(path+'_{}_TEST.arff'.format(rs_id)) - - #Convert pandas DataFrames to numpy arrays - X_train = _custom_from_nested_to_3d_numpy(X_train) - X_test = _custom_from_nested_to_3d_numpy(X_test) - - #Convert class labels to make sure they are between 0,n_classes - le = LabelEncoder().fit(y_train) - y_train = le.transform(y_train) - y_test = le.transform(y_test) - - #Z-Normalize the data - if normalize: - X_train = (X_train - X_train.mean(axis=-1, keepdims=True)) / ( - X_train.std(axis=-1, keepdims=True) + 1e-8) - X_test = (X_test - X_test.mean(axis=-1, keepdims=True)) / ( - X_test.std(axis=-1, keepdims=True) + 1e-8) - - return X_train, X_test, y_train, y_test, le - -def load_sktime_ts_file(path, normalize=False): - """ - Load a dataset from .ts files - - Parameters - ---------- - path : string - Path to the folder containing the .ts file. Dataset name - should be specified at the end of path to find files as - "dataset_TRAIN.ts" and "dataset_TEST.ts" - normalize : boolean, optional - If True, time series will be z-normalized. The default is True. - - Returns - ------- - X_train : array, shape=(n_samples_train, n_features, n_timestamps) - Training data from the dataset specified by path. - X_test : array, shape=(n_samples_test, n_features, n_timestamps) - Testing data from the dataset specified by path. - y_train : array, shape=(n_samples_train) - Class of the training data. - y_test : array, shape=(n_samples_test) - Class of the testing data. - le : LabelEncoder - LabelEncoder object used to uniformize the class labels - - """ - - #Load datasets - X_train, y_train = load_from_tsfile_to_dataframe(path+'_TRAIN.ts') - X_test, y_test = load_from_tsfile_to_dataframe(path+'_TEST.ts') - - #Convert pandas DataFrames to numpy arrays - X_train = _custom_from_nested_to_3d_numpy(X_train) - X_test = _custom_from_nested_to_3d_numpy(X_test) - - #Convert class labels to make sure they are between 0,n_classes - le = LabelEncoder().fit(y_train) - y_train = le.transform(y_train) - y_test = le.transform(y_test) - - #Z-Normalize the data - if normalize: - X_train = (X_train - X_train.mean(axis=-1, keepdims=True)) / ( - X_train.std(axis=-1, keepdims=True) + 1e-8) - X_test = (X_test - X_test.mean(axis=-1, keepdims=True)) / ( - X_test.std(axis=-1, keepdims=True) + 1e-8) - - return X_train, X_test, y_train, y_test, le - -def load_sktime_dataset(name, normalize=False): - """ - Load a dataset from the UCR/UEA archive by name using sktime API + Load a dataset from the UCR/UEA archive by name using aeon API Parameters ---------- @@ -291,17 +122,12 @@ def load_sktime_dataset(name, normalize=False): Time series data from the dataset specified by name. 
y : array, shape=(n_samples) Class of the time series - le : LabelEncoder - LabelEncoder object used to uniformize the class labels - - """ #Load datasets - X_train, X_test, y_train, y_test, le = load_sktime_dataset_split( + X_train, X_test, y_train, y_test, _ = load_UCR_UEA_dataset_split( name, normalize=normalize ) - - return np.concatenate((X_train, X_test),axis=0), np.concatenate((y_train, y_test),axis=0), le + return np.concatenate((X_train, X_test),axis=0), np.concatenate((y_train, y_test),axis=0) def return_all_dataset_names(): return np.concatenate(( diff --git a/convst/utils/experiments_utils.py b/convst/utils/experiments_utils.py index 4a1dc15..3f89a64 100644 --- a/convst/utils/experiments_utils.py +++ b/convst/utils/experiments_utils.py @@ -15,11 +15,11 @@ from timeit import default_timer as timer -from convst.utils.dataset_utils import load_sktime_arff_file_resample_id, load_sktime_dataset_split +from convst.utils.dataset_utils import load_UCR_UEA_dataset_split #Reuse of the sktime function, modified for numpy array inputs rather than dataframes # and to handle variable length series -def _sktime_resample(X_train, y_train, X_test, y_test, random_state): +def _resample(X_train, y_train, X_test, y_test, random_state): """Stratified resample data without replacement using a random state. Reproducable resampling. Combines train and test, resamples to get the same class @@ -108,7 +108,7 @@ def __init__(self, n_split, dataset_name, scorers={"accuracy":accuracy_score}): self.scorers = scorers def score(self, pipeline): - X_train_0, X_test_0, y_train_0, y_test_0, _ = load_sktime_dataset_split( + X_train_0, X_test_0, y_train_0, y_test_0, _ = load_UCR_UEA_dataset_split( self.dataset_name ) _score = pd.DataFrame() @@ -125,7 +125,7 @@ def score(self, pipeline): y_train = np.copy(y_train_0) y_test = np.copy(y_test_0) else: - X_train, y_train, X_test, y_test = _sktime_resample( + X_train, y_train, X_test, y_test = _resample( X_train_0, y_train_0, X_test_0, y_test_0, i ) t0 = timer() @@ -138,13 +138,13 @@ def score(self, pipeline): return _score -class _sklearn_sktime_cv: +class _sklearn_cv: def __init__(self, n_splits, dataset_name): self.n_splits=n_splits self.dataset_name=dataset_name def split(self, X, y=None, groups=None): - X_train_0, X_test_0, y_train_0, y_test_0, _ = load_sktime_dataset_split( + X_train_0, X_test_0, y_train_0, y_test_0, _ = load_UCR_UEA_dataset_split( self.dataset_name ) for i in range(self.n_splits): @@ -154,7 +154,7 @@ def split(self, X, y=None, groups=None): y_train = np.copy(y_train_0) y_test = np.copy(y_test_0) else: - X_train, y_train, X_test, y_test = _sktime_resample( + X_train, y_train, X_test, y_test = _resample( X_train_0, y_train_0, X_test_0, y_test_0, i ) idx_Train = [np.where((X == X_train[j]).all(axis=2))[0][0] for j in range(X_train.shape[0])] @@ -174,10 +174,10 @@ def __init__(self, n_split, dataset_name, n_jobs, scorers=make_scorer(accuracy_s def score(self, pipeline, params): cv = GridSearchCV( pipeline(), params, scoring=self.scorers, n_jobs=self.n_jobs, - cv=_sklearn_sktime_cv(self.n_split, self.dataset_name), pre_dispatch='n_jobs', + cv=_sklearn_cv(self.n_split, self.dataset_name), pre_dispatch='n_jobs', verbose=3 ) - X_train_0, X_test_0, y_train_0, y_test_0, _ = load_sktime_dataset_split( + X_train_0, X_test_0, y_train_0, y_test_0, _ = load_UCR_UEA_dataset_split( self.dataset_name ) X = np.concatenate((X_train_0, X_test_0),axis=0) @@ -185,140 +185,3 @@ def score(self, pipeline, params): cv.fit(X, y) return pd.DataFrame(cv.cv_results_) - - - 
-class ARFF_stratified_resample: - """ - Class used as a splitter for sklearn cross validation tools. - It will take previsouly resampled arff files at a location and - return a resample based on the identifier of the current cross - validation step. - - It is used to reproduce the exact same splits made in the original UCR/UEA - archive. The arff files can be produced using the tsml java implementation. - - Parameters - ---------- - n_splits : int - Number of cross validation step planed. - path : string - Path to the arff files. - - """ - def __init__(self, n_splits, path, normalize=False): - self.n_splits=n_splits - self.path=path - self.normalize=normalize - - def split(self, X, y=None, groups=None): - """ - - - Parameters - ---------- - X : array, shape=(n_samples, n_features, n_timestamps) - Time series data to split - y : ignored - - groups : ignored - - - Yields - ------ - idx_Train : array, shape(n_samples_train) - Index of the training data in the original dataset X. - idx_Test : array, shape(n_samples_test) - Index of the testing data in the original dataset X. - - """ - for i in range(self.n_splits): - X_train, X_test, y_train, y_test, _ = load_sktime_arff_file_resample_id(self.path, i, normalize=self.normalize) - idx_Train = [np.where((X == X_train[j]).all(axis=2))[0][0] for j in range(X_train.shape[0])] - idx_Test = [np.where((X == X_test[j]).all(axis=2))[0][0] for j in range(X_test.shape[0])] - yield idx_Train, idx_Test - - def get_n_splits(self, X=None, y=None, groups=None): - """ - Return the number of split made by the splitter. - - - Parameters - ---------- - X : ignored - - y : ignored - - groups : ignored - - - Returns - ------- - n_splits : int - The n_splits attribute of the object. - - """ - return self.n_splits - -# To be used with ARFF_stratified_resample -def run_pipeline(pipeline, X_train, X_test, y_train, y_test, splitter, n_jobs=1): - """ - Run a sklearn compatible model or pipeline on the specified dataset. - - Parameters - ---------- - pipeline : object - A sklearn compatible model or pipeline - X_train : array, shape=(n_samples, n_feature, n_timestamps) - Input training data - X_test : array, shape=(n_samples, n_feature, n_timestamps) - Input testing data - y_train : array, shape=(n_samples) - Class of the input training data. - y_test : array, shape=(n_samples) - Class of the input testing data. - splitter : object - A sklearn compatible splitter for cross-validation. - n_jobs : int, optional - Number of parallel validation round. The default is 1. - - Raises - ------ - ValueError - If the number of split of the splitter is an invalid value, raise a - ValueError exception. 
- - Returns - ------- - float - Mean accuracy over all validation splits - float - Std of accuracy over all validation splits - float - Mean F1-score over all validation splits - float - Std F1-score over all validation splits - float - Mean runtime over all validation splits - float - Std runtime over all validation splits - - """ - if splitter.n_splits > 1: - X = np.concatenate([X_train, X_test], axis=0).astype(np.float64) - y = np.concatenate([y_train, y_test], axis=0).astype(np.int64) - cv = cross_validate(pipeline, X, y, cv=splitter, n_jobs=n_jobs, - scoring={'f1': make_scorer(f1_score, average='macro'), - 'acc':make_scorer(accuracy_score)}) - return np.mean(cv['test_acc']), np.std(cv['test_acc']), np.mean(cv['test_f1']), np.std(cv['test_f1']), np.mean(cv['fit_time'] + cv['score_time']), np.std(cv['fit_time'] + cv['score_time']) - - if splitter.n_splits == 1: - #change datetime to context accurate timing - t0 = timer() - pipeline = pipeline.fit(X_train, y_train) - pred = pipeline.predict(X_test) - t1 = timer() - return accuracy_score(y_test, pred), 0, f1_score(y_test, pred, average='macro'), 0, (t1-t0), 0 - - raise ValueError("Invalid value for n_split in splitter," - " got {} splits".format(splitter.n_splits)) diff --git a/pyproject.toml b/pyproject.toml index 2bce2d5..877e84b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "convst" -version = "0.2.7" +version = "0.3.0" description = "The Random Dilation Shapelet Transform algorithm and associated works" readme = "README.md" @@ -27,19 +27,19 @@ classifiers = [ "Operating System :: POSIX", "Operating System :: Unix", "Operating System :: MacOS", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", ] -requires-python = ">=3.7,<3.11" +requires-python = ">=3.8,<3.11" dependencies = [ - "sktime>=0.15", + "aeon>=0.3", "numba>=0.55", "numpy>=1.21.0,<1.25", + "scikit-learn>=1.0.0,<1.3.0", + "scipy<2.0.0,>=1.2.0", "pandas>=1.1.0,<1.6.0", "joblib>=1.1.1", - "scikit-learn>=0.24.0,<1.3.0", "statsmodels>=0.12.1", "scipy<2.0.0,>=1.2.0", "matplotlib>=3.1", diff --git a/tests/test_dataset_utils.py b/tests/test_dataset_utils.py index b758cd4..a31e2c5 100644 --- a/tests/test_dataset_utils.py +++ b/tests/test_dataset_utils.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from convst.utils.dataset_utils import ( - load_sktime_dataset_split, load_sktime_dataset, z_norm_3D + load_UCR_UEA_dataset_split, load_UCR_UEA_dataset, z_norm_3D ) import numpy as np import pytest @@ -27,8 +27,8 @@ def test_z_norm_3D(): @pytest.mark.parametrize("name", [ ('GunPoint'), ('SmoothSubspace'), ]) -def test_load_sktime_dataset_split(name): - X_train, X_test, y_train, y_test, le = load_sktime_dataset_split( +def test_load_UCR_UEA_dataset_split(name): + X_train, X_test, y_train, y_test, le = load_UCR_UEA_dataset_split( name=name, normalize=False ) if name == 'GunPoint': @@ -44,8 +44,8 @@ def test_load_sktime_dataset_split(name): @pytest.mark.parametrize("name", [ ('GunPoint'), ('SmoothSubspace'), ]) -def test_load_sktime_dataset(name): - X, y, le = load_sktime_dataset( +def test_load_UCR_UEA_dataset(name): + X, y = load_UCR_UEA_dataset( name=name, normalize=False ) if name == 'GunPoint': diff --git a/tests/test_rdst.py b/tests/test_rdst.py index 0e65d49..3dfb8cc 100644 --- a/tests/test_rdst.py +++ b/tests/test_rdst.py @@ -10,7 +10,7 @@ from convst.classifiers import R_DST_Ridge from convst.transformers import R_DST -from 
convst.utils.dataset_utils import load_sktime_dataset_split +from convst.utils.dataset_utils import load_UCR_UEA_dataset_split from convst.utils.experiments_utils import cross_validate_UCR_UEA from convst.transformers._commons import is_prime @@ -25,7 +25,7 @@ ('AsphaltObstaclesCoordinates','multivariate_variable') ]) def test_auto_type(name, expected): - X_train, X_test, y_train, y_test, min_len = load_sktime_dataset_split( + X_train, X_test, y_train, y_test, min_len = load_UCR_UEA_dataset_split( name=name ) rdst = R_DST(n_shapelets=2, min_len=min_len).fit(X_train, y_train) @@ -51,7 +51,7 @@ def test_auto_type(name, expected): ('AsphaltObstaclesCoordinates',[0.05,0.08,0.1]) ]) def test_mutliple_lengths(name, lengths): - X_train, X_test, y_train, y_test, min_len = load_sktime_dataset_split( + X_train, X_test, y_train, y_test, min_len = load_UCR_UEA_dataset_split( name=name ) try: @@ -68,7 +68,7 @@ def test_mutliple_lengths(name, lengths): ('FordB') ]) def test_prime_dilations(name): - X_train, X_test, y_train, y_test, min_len = load_sktime_dataset_split( + X_train, X_test, y_train, y_test, min_len = load_UCR_UEA_dataset_split( name=name ) try: @@ -91,7 +91,7 @@ def test_prime_dilations(name): ('GunPoint', [0.1, 0.15], 0.5, [15, 17, 19, 21]) ]) def test_length_bounds(name, bounds, reduction, expected): - X_train, X_test, y_train, y_test, min_len = load_sktime_dataset_split( + X_train, X_test, y_train, y_test, min_len = load_UCR_UEA_dataset_split( name=name ) try: @@ -116,7 +116,7 @@ def test_length_bounds(name, bounds, reduction, expected): ('AsphaltObstaclesCoordinates',0.78) ]) def test_performance(name, expected): - X_train, X_test, y_train, y_test, min_len = load_sktime_dataset_split( + X_train, X_test, y_train, y_test, min_len = load_UCR_UEA_dataset_split( name=name ) rdst = R_DST_Ridge(n_shapelets=1,min_len=min_len).fit(X_train, y_train)
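For readers migrating to v0.3.0, the following sketch summarizes the public API after this patch. It is a minimal sketch assembled from the README tutorial and the updated tests above, assuming convst v0.3.0 with aeon installed; the `n_shapelets` values are illustrative only, not tuned recommendations.

```python
# Minimal usage sketch of the v0.3.0 API as changed by this patch.
# Assumptions: convst v0.3.0 with aeon installed; `n_shapelets` values
# are illustrative, not tuned recommendations.
from convst.classifiers import R_DST_Ridge
from convst.transformers import R_DST
from convst.utils.dataset_utils import load_UCR_UEA_dataset_split

# The aeon-backed loader keeps the former return signature, including
# the minimum series length of the dataset.
X_train, X_test, y_train, y_test, min_len = load_UCR_UEA_dataset_split('GunPoint')

# The `distance` parameter is gone: the Manhattan distance is now
# hardcoded in the numba kernels, so passing distance='euclidean'
# should raise a TypeError.
clf = R_DST_Ridge(n_shapelets=10_000, min_len=min_len).fit(X_train, y_train)
print(clf.score(X_test, y_test))

# The standalone transformer follows the same convention.
X_new = R_DST(n_shapelets=10_000, min_len=min_len).fit(X_train, y_train).transform(X_test)
```

Note that the merged-split loader `load_UCR_UEA_dataset` now returns only `(X, y)`: the `LabelEncoder` is no longer part of its return values, as reflected in the updated `tests/test_dataset_utils.py`.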