Merge pull request #48 from baraline/key-error

correcting key error issue, removing sktime as dependency, replaced by aeon
baraline · Jun 12, 2023 · 597d852 · 597d852
2 parents 6344cf4 + eb58938
commit 597d852
Show file tree

Hide file tree

Showing 17 changed files with 96 additions and 496 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -37,7 +37,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-20.04, macOS-11]
-        python-version: ['3.7', '3.8', '3.9', '3.10']
+        python-version: ['3.8', '3.9', '3.10']
 
     steps:
       - uses: actions/checkout@v2
@@ -67,11 +67,6 @@ jobs:
       matrix:
         include:
           # Window 64 bit
-          - os: windows-2019
-            python: 37
-            python-version: '3.7'
-            bitness: 64
-            platform_id: win_amd64
           - os: windows-2019
             python: 38
             python-version: '3.8'

diff --git a/README.md b/README.md
@@ -1,5 +1,7 @@
 # This package is moving to the aeon-toolkit.
-Starting from v0.2.7, this package will not be updated. You can already find an updated version of RDST in the Aeon package at https://github.com/aeon-toolkit/aeon/blob/main/aeon/transformations/panel/dilated_shapelet_transform.py . This version should allow for faster transformation times due to using online normalization and better normalized distance computation, it uses the euclidean distance by default instead of the manhattan distance in convst. All the functionnalities of this package will be ported into Aeon when I got some time, for now, only the transformer for univariate and multivariate series of even length have been implemented.
+Starting from v0.3.0, this package will no longer receive new features; bug fixes will still be included if issues are raised.
+You can already find an updated version of RDST in the Aeon package at https://github.com/aeon-toolkit/ . Further improvements are planned to speed up RDST; these improvements will only be implemented in aeon.
+All the functionalities of this package will be ported into Aeon when I get some time; for now, only the transformer for univariate and multivariate series of even length has been implemented.
 
 # Readme
 Welcome to the convst repository. It contains the implementation of the `Random Dilated Shapelet Transform (RDST)` along with other works in the same area.
@@ -34,17 +36,18 @@ We recommend doing this in a new virtual environment using anaconda to avoid any
 
 An optional dependency that can help speed up numba, which is used in our implementation, is the Intel vector math library (SVML). When using conda it can be installed by running `conda install -c numba icc_rt`. I didn't test the behavior with AMD processors, but I suspect it won't work.
 
-If you are using RDST in some specific settings such as an HPC cluster and are getting errors, take a loot at [issue #24](https://github.com/baraline/convst/issues/24), you may need to change the numba compilation settings to not using function caching (see [this example](https://github.com/baraline/convst/blob/main/examples/Changing_numba_options.py)).
+If you are using RDST in some specific settings such as an HPC cluster and are getting errors, take a look at [issue #24](https://github.com/baraline/convst/issues/24); you may need to change the numba compilation settings to disable function caching (see [this example](https://github.com/baraline/convst/blob/main/examples/Changing_numba_options.py)). THIS SHOULD BE FIXED WITH v0.3.0.
+
 
 ## Tutorial
-We give here a minimal example to run the `RDST` algorithm on any dataset of the UCR archive using the sktime API to get datasets:
+We give here a minimal example to run the `RDST` algorithm on any dataset of the UCR archive using the aeon API to get datasets:
 
 ```python
 
 from convst.classifiers import R_DST_Ridge
-from convst.utils.dataset_utils import load_sktime_dataset_split
+from convst.utils.dataset_utils import load_UCR_UEA_dataset_split
 
-X_train, X_test, y_train, y_test, _ = load_sktime_dataset_split('GunPoint')
+X_train, X_test, y_train, y_test, _ = load_UCR_UEA_dataset_split('GunPoint')
 
 # First run may be slow due to numba compilations on the first call. 
 # Run a small dataset like GunPoint if this is the first time you call RDST on your system.

diff --git a/convst/__init__.py b/convst/__init__.py
@@ -1,6 +1,6 @@
 
 __author__ = 'Antoine Guillaume antoine.guillaume45@gmail.com'
-__version__ = "0.2.7"
+__version__ = "0.3.0"
 
 __all__ = ['transformers', 'classifiers', 'utils', 'interpreters']
 

diff --git a/convst/classifiers/rdst_ensemble.py b/convst/classifiers/rdst_ensemble.py
@@ -76,10 +76,6 @@ class R_DST_Ensemble(BaseEstimator, ClassifierMixin):
     phase_invariance : bool, optional
         Wheter to use phase invariance for shapelet sampling and distance 
         computation. The default is False.
-    distance : str, optional
-        The distance function to use whe computing distances between shapelets
-        and time series. Choose between 'euclidean','manhattan' and 'squared_euclidean'.
-        The default is 'manhattan'.
     alpha : float, optional
         The alpha similarity parameter, the higher the value, the lower the 
         allowed number of common indexes with previously sampled shapelets 
@@ -159,7 +155,6 @@ def __init__(
         phase_invariance=False,
         input_transformers=None,
         transform_type='auto',
-        distance='manhattan',
         normalize_output=False,
         percentiles=[5,10],
         max_channels=None,
@@ -170,7 +165,6 @@ def __init__(
     ):
         self.transform_type = transform_type
         self.phase_invariance = check_is_boolean(phase_invariance)
-        self.distance = distance
         self.normalize_output = check_is_boolean(normalize_output)
         self.n_samples = check_is_numeric(n_samples) if n_samples is not None else n_samples
         self.shapelet_lengths_bounds = shapelet_lengths_bounds

diff --git a/convst/classifiers/rdst_ridge.py b/convst/classifiers/rdst_ridge.py
@@ -38,10 +38,6 @@ class R_DST_Ridge(BaseEstimator, ClassifierMixin):
     phase_invariance : bool, optional
         Wheter to use phase invariance for shapelet sampling and distance 
         computation. The default is False.
-    distance : str, optional
-        The distance function to use whe computing distances between shapelets
-        and time series. Choose between 'euclidean','manhattan' and 'squared_euclidean'.
-        The default is 'manhattan'.
     alpha : float, optional
         The alpha similarity parameter, the higher the value, the lower the 
         allowed number of common indexes with previously sampled shapelets 
@@ -109,7 +105,6 @@ def __init__(
         self, 
         transform_type='auto',
         phase_invariance=False,
-        distance='manhattan',
         alpha=0.5,
         normalize_output=False,
         n_samples=None,
@@ -133,7 +128,6 @@ def __init__(
         self.transform_type=transform_type
         self.phase_invariance=phase_invariance
         self.prime_dilations=prime_dilations
-        self.distance=distance
         self.alpha=alpha
         self.normalize_output=normalize_output
         self.n_samples=n_samples
@@ -170,7 +164,6 @@ def _init_components(self):
         self.transformer = R_DST(
             transform_type=self.transform_type,
             phase_invariance=self.phase_invariance,
-            distance=self.distance,
             alpha=self.alpha,
             prime_dilations=self.prime_dilations,
             shapelet_lengths_bounds=self.shapelet_lengths_bounds,

diff --git a/convst/transformers/_commons.py b/convst/transformers/_commons.py
@@ -299,23 +299,23 @@ def _get_subsequence_phase(X, i_start, length, d, normalize):
   cache=__USE_NUMBA_CACHE__, nogil=__USE_NUMBA_NOGIL__
 )
 def compute_shapelet_dist_vector(
-    x, values, length, dilation, dist_func, normalize, use_phase
+    x, values, length, dilation, normalize, use_phase
 ):
     if normalize and use_phase:
         return _compute_shapelet_dist_vector_norm_phase(
-            x, values, length, dilation, dist_func
+            x, values, length, dilation
         )
     elif normalize and not use_phase:
         return _compute_shapelet_dist_vector_norm(
-            x, values, length, dilation, dist_func
+            x, values, length, dilation
         )
     elif not normalize and use_phase:
         return _compute_shapelet_dist_vector_phase(
-            x, values, length, dilation, dist_func
+            x, values, length, dilation
         )
     elif not normalize and not use_phase:
         return _compute_shapelet_dist_vector(
-            x, values, length, dilation, dist_func
+            x, values, length, dilation
         )
     else:
         raise ValueError('Wrong parameter for normalize or phase')
@@ -324,7 +324,7 @@ def compute_shapelet_dist_vector(
 @njit(
   cache=__USE_NUMBA_CACHE__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__
 )
-def _compute_shapelet_dist_vector(x, values, length, dilation, dist_func):
+def _compute_shapelet_dist_vector(x, values, length, dilation):
     """
     Compute a shapelet distance vector from an univariate time series 
     and a dilated shapelet. Shapelet should be already normalized if normalizing
@@ -353,14 +353,14 @@ def _compute_shapelet_dist_vector(x, values, length, dilation, dist_func):
     c = _generate_strides_1D(x, length, dilation)
     x_conv = zeros(c.shape[0])
     for i in prange(x_conv.shape[0]):
-        x_conv[i] = dist_func(c[i], values)
+        x_conv[i] = manhattan(c[i], values)
     return x_conv
 
 
 @njit(
   cache=__USE_NUMBA_CACHE__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__
 )
-def _compute_shapelet_dist_vector_norm(x, values, length, dilation, dist_func):
+def _compute_shapelet_dist_vector_norm(x, values, length, dilation):
     """
     Compute a shapelet distance vector from an univariate time series 
     and a dilated shapelet. Shapelet should be already normalized if normalizing
@@ -390,14 +390,14 @@ def _compute_shapelet_dist_vector_norm(x, values, length, dilation, dist_func):
     x_conv = zeros(c.shape[0])
     for i in prange(x_conv.shape[0]):
         x0 = (c[i] - c[i].mean())/(c[i].std()+1e-8)
-        x_conv[i] = dist_func(x0, values)
+        x_conv[i] = manhattan(x0, values)
     return x_conv
 
 
 @njit(
   cache=__USE_NUMBA_CACHE__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__
 )
-def _compute_shapelet_dist_vector_phase(x, values, length, dilation, dist_func):
+def _compute_shapelet_dist_vector_phase(x, values, length, dilation):
     """
     Compute a shapelet distance vector from an univariate time series 
     and a dilated shapelet. Shapelet should be already normalized if normalizing
@@ -426,14 +426,14 @@ def _compute_shapelet_dist_vector_phase(x, values, length, dilation, dist_func):
     c = _generate_strides_1D_phase(x, length, dilation)
     x_conv = zeros(c.shape[0])
     for i in prange(x_conv.shape[0]):
-        x_conv[i] = dist_func(c[i], values)
+        x_conv[i] = manhattan(c[i], values)
     return x_conv
 
 
 @njit(
   cache=__USE_NUMBA_CACHE__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__
 )
-def _compute_shapelet_dist_vector_norm_phase(x, values, length, dilation, dist_func):
+def _compute_shapelet_dist_vector_norm_phase(x, values, length, dilation):
     """
     Compute a shapelet distance vector from an univariate time series 
     and a dilated shapelet. Shapelet should be already normalized if normalizing
@@ -463,15 +463,15 @@ def _compute_shapelet_dist_vector_norm_phase(x, values, length, dilation, dist_f
     x_conv = zeros(c.shape[0])
     for i in prange(x_conv.shape[0]):
         x0 = (c[i] - c[i].mean())/(c[i].std()+1e-8)
-        x_conv[i] = dist_func(x0, values)
+        x_conv[i] = manhattan(x0, values)
     return x_conv
 
 
 
 @njit(
   cache=__USE_NUMBA_CACHE__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__
 )
-def apply_one_shapelet_one_sample_univariate(x, values, threshold, dist_func):
+def apply_one_shapelet_one_sample_univariate(x, values, threshold):
     """
     Extract the three features from the distance between a shapelet and the 
     strides of an input time series generated by the length and dilation 
@@ -508,7 +508,7 @@ def apply_one_shapelet_one_sample_univariate(x, values, threshold, dist_func):
 
     #For each step of the moving window in the shapelet distance
     for i in prange(n_candidates):
-        _dist = dist_func(x[i], values)
+        _dist = manhattan(x[i], values)
 
         if _dist < _min or _min==-1.:
             _min = _dist
@@ -523,7 +523,7 @@ def apply_one_shapelet_one_sample_univariate(x, values, threshold, dist_func):
 @njit(
   cache=__USE_NUMBA_CACHE__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__
 )
-def apply_one_shapelet_one_sample_multivariate(x, values, threshold, dist_func):
+def apply_one_shapelet_one_sample_multivariate(x, values, threshold):
     """
     Extract the three features from the distance between a shapelet and the 
     strides of an input time series generated by the length and dilation 
@@ -562,7 +562,7 @@ def apply_one_shapelet_one_sample_multivariate(x, values, threshold, dist_func):
     for i in prange(n_candidates):
         _dist = 0
         for ft in prange(n_ft):
-            _dist += dist_func(x[ft, i], values[ft])
+            _dist += manhattan(x[ft, i], values[ft])
 
         if _dist < _min or _min == -1.:
             _min = _dist

diff --git a/convst/transformers/_multivariate_same_length.py b/convst/transformers/_multivariate_same_length.py
@@ -105,7 +105,7 @@ def M_SL_init_random_shapelet_params(
 @njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, nogil=__USE_NUMBA_NOGIL__)
 def M_SL_generate_shapelet(
     X, y, n_shapelets, shapelet_sizes, r_seed, p_norm, p_min, p_max, alpha,
-    dist_func, use_phase, max_channels, prime_scheme
+    use_phase, max_channels, prime_scheme
 ):
     """
     Given a time series dataset and parameters of the method, generate the
@@ -131,9 +131,6 @@ def M_SL_generate_shapelet(
         Upper bound for the percentile during the choice of threshold
     alpha : float
         Alpha similarity parameter
-    dist_func: function
-        A distance function implemented with Numba taking two 1D vectors as
-        input.
     use_phase: bool
         Wheter to use phase invariance
     prime_scheme : bool
@@ -246,7 +243,7 @@ def M_SL_generate_shapelet(
                     #Compute distance vector
                     x_dist += compute_shapelet_dist_vector(
                         X[id_test, _channel_ids[k]], _v, _length, _dilation,
-                        dist_func, norm, use_phase
+                        norm, use_phase
                     )
 
                     _values[a3:b3] = _v
@@ -296,7 +293,7 @@ def M_SL_generate_shapelet(
 
 @njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__)
 def M_SL_apply_all_shapelets(
-    X, shapelets, dist_func, use_phase
+    X, shapelets, use_phase
 ):
     """
     Apply a set of generated shapelet using the parameter arrays previously 
@@ -319,9 +316,6 @@ def M_SL_apply_all_shapelets(
             Threshold parameter of the shapelets
         normalize : array, shape=(n_shapelets)
             Normalization indicator of the shapelets
-    dist_func: function
-        A distance function implemented with Numba taking two 1D vectors as
-        input.
     use_phase: bool
         Wheter to use phase invariance
     
@@ -388,7 +382,7 @@ def M_SL_apply_all_shapelets(
 
                 X_new[i_sample, (n_features * i_shp):(n_features * i_shp + n_features)] = \
                 apply_one_shapelet_one_sample_multivariate(
-                    strides[_channels], _values, threshold[i_shp], dist_func
+                    strides[_channels], _values, threshold[i_shp]
                 )
 
             _idx_norm = _idx_shp[where(normalize[_idx_shp] == True)[0]]
@@ -409,6 +403,6 @@ def M_SL_apply_all_shapelets(
 
                     X_new[i_sample, (n_features * i_shp):(n_features * i_shp + n_features)] = \
                     apply_one_shapelet_one_sample_multivariate(
-                        strides[_channels], _values, threshold[i_shp], dist_func
+                        strides[_channels], _values, threshold[i_shp]
                     )
     return X_new
diff --git a/convst/transformers/_multivariate_variable_length.py b/convst/transformers/_multivariate_variable_length.py
@@ -105,7 +105,7 @@ def M_VL_init_random_shapelet_params(
 @njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, nogil=__USE_NUMBA_NOGIL__)
 def M_VL_generate_shapelet(
     X, y, n_shapelets, shapelet_sizes, r_seed, p_norm, p_min, p_max, alpha,
-    dist_func, use_phase, max_channels, min_len, X_len, prime_scheme
+    use_phase, max_channels, min_len, X_len, prime_scheme
 ):
     """
     Given a time series dataset and parameters of the method, generate the
@@ -131,9 +131,6 @@ def M_VL_generate_shapelet(
         Upper bound for the percentile during the choice of threshold
     alpha : float
         Alpha similarity parameter
-    dist_func: function
-        A distance function implemented with Numba taking two 1D vectors as
-        input.
     use_phase: bool
         Wheter to use phase invariance
     min_len : int
@@ -276,7 +273,7 @@ def M_VL_generate_shapelet(
                     #Compute distance vector
                     x_dist += compute_shapelet_dist_vector(
                         X[id_test, _channel_ids[k], :X_len[id_test]], _v, _length, _dilation,
-                        dist_func, norm, use_phase
+                        norm, use_phase
                     )
 
                     _values[a3:b3] = _v
@@ -327,7 +324,7 @@ def M_VL_generate_shapelet(
 
 @njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__)
 def M_VL_apply_all_shapelets(
-    X, shapelets, dist_func, use_phase, X_len
+    X, shapelets, use_phase, X_len
 ):
     """
     Apply a set of generated shapelet using the parameter arrays previously 
@@ -350,9 +347,6 @@ def M_VL_apply_all_shapelets(
             Threshold parameter of the shapelets
         normalize : array, shape=(n_shapelets)
             Normalization indicatorr of the shapelets
-    dist_func: function
-        A distance function implemented with Numba taking two 1D vectors as
-        input.
     use_phase: bool
         Wheter to use phase invariance
     X_len : array, shape=(n_samples)
@@ -419,7 +413,7 @@ def M_VL_apply_all_shapelets(
 
                 X_new[i_sample, (n_features * i_shp):(n_features * i_shp + n_features)] = \
                 apply_one_shapelet_one_sample_multivariate(
-                    strides[_channels], _values, threshold[i_shp], dist_func
+                    strides[_channels], _values, threshold[i_shp]
                 )
 
             _idx_norm = _idx_shp[where(normalize[_idx_shp] == True)[0]]
@@ -440,6 +434,6 @@ def M_VL_apply_all_shapelets(
 
                     X_new[i_sample, (n_features * i_shp):(n_features * i_shp + n_features)] = \
                     apply_one_shapelet_one_sample_multivariate(
-                        strides[_channels], _values, threshold[i_shp], dist_func
+                        strides[_channels], _values, threshold[i_shp]
                     )
     return X_new