From fe588667442943da936a43e689518cf7f0c7adc3 Mon Sep 17 00:00:00 2001 From: Antoine Guillaume Date: Tue, 19 Jul 2022 10:37:35 +0200 Subject: [PATCH] Updating dependencies, removing experimental from main --- convst/experimental/G_RDST.py | 600 ---------------------- convst/experimental/RDST_no_norm.py | 296 ----------- convst/experimental/cst_c_ensemble.py | 125 ----- convst/experimental/cst_c_random.py | 103 ---- convst/experimental/dtw.py | 252 --------- convst/experimental/input_transformers.py | 154 ------ convst/experimental/rotationforest.py | 484 ----------------- convst/experimental/visualisations.py | 73 --- setup.py | 9 +- 9 files changed, 4 insertions(+), 2092 deletions(-) delete mode 100644 convst/experimental/G_RDST.py delete mode 100644 convst/experimental/RDST_no_norm.py delete mode 100644 convst/experimental/cst_c_ensemble.py delete mode 100644 convst/experimental/cst_c_random.py delete mode 100644 convst/experimental/dtw.py delete mode 100644 convst/experimental/input_transformers.py delete mode 100644 convst/experimental/rotationforest.py delete mode 100644 convst/experimental/visualisations.py diff --git a/convst/experimental/G_RDST.py b/convst/experimental/G_RDST.py deleted file mode 100644 index 0f07ec1..0000000 --- a/convst/experimental/G_RDST.py +++ /dev/null @@ -1,600 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Thu Nov 18 13:17:07 2021 - -@author: a694772 -""" -from numba import njit, prange -import numpy as np -import seaborn as sns -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_is_fitted, check_random_state -from convst.utils.checks_utils import check_array_3D, check_array_1D -from convst.utils.shapelets_utils import generate_strides_1D -import pandas as pd -from matplotlib import pyplot as plt - -""" -from convst.utils.dataset_utils import load_sktime_dataset_split -X, Xt, y, yt, _ = load_sktime_dataset_split("StandWalkJump", normalize=True) -X = X.astype(np.float64) -Xt = Xt.astype(np.float64) -a = FR_DST(n_shapelets=2).fit(X, y) -b = a.transform(X) -bt = a.transform(Xt) -from sklearn.linear_model import RidgeClassifierCV -from sklearn.preprocessing import StandardScaler -from sklearn.metrics import accuracy_score -from sklearn.pipeline import make_pipeline -r = make_pipeline( - StandardScaler(with_mean=False), - RidgeClassifierCV(alphas=np.logspace(-6,6,20)) -).fit(b, y) -p = r.predict(bt) -print(accuracy_score(yt, p)) -""" - - - -@njit(fastmath=True, cache=True, error_model='numpy') -def compute_shapelet_dist_vector(x, values, length, dilation, normalize): - """ - Compute a shapelet distance vector from an univariate time series - and a dilated shapelet. 
- - Parameters - ---------- - x : array, shape=(n_timestamps) - An input time series - values : array, shape=(max_shapelet_length) - The value array of the shapelet - length : int - Length of the shapelet - dilation : int - Dilation of the shapelet - normalize : float - Boolean converted as float (to avoid if statement) indicating - wheter or not the distance will be z-normalized - - Returns - ------- - x_conv : array, shape=(n_timestamps - (length-1) * dilation) - The resulting distance vector - - """ - c = generate_strides_1D(x, length, dilation) - x_conv = np.empty(c.shape[0]) - for i in range(x_conv.shape[0]): - s = 0 - mean = c[i].mean()*normalize - std = (c[i].std()+1e-8)*normalize - x0 = (c[i] - mean)/(std + 1.0-normalize) - for j in range(length): - s += abs(x0[j] - values[j]) - x_conv[i] = s - return x_conv - - -@njit(cache=True, parallel=True) -def _init_random_shapelet_params(n_shapelets, shapelet_sizes, n_timestamps, p_norm, - n_features, max_channels): - """ - Initialize the parameters of the shapelets. - - Parameters - ---------- - n_shapelets : int - Number of shapelet to initialize - shapelet_sizes : array, shape=() - Set of possible length for the shapelets - n_timestamps : int - Number of timestamps in the input data - p_norm : float - A value in the range [0,1] indicating the chance for each - shapelet to use z-normalized distance - - Returns - ------- - values : array, shape=(n_shapelet, max(shapelet_sizes)) - An initialized (empty) value array for each shapelet - lengths : array, shape=(n_shapelet) - The randomly initialized length of each shapelet - dilations : array, shape=(n_shapelet) - The randomly initialized dilation of each shapelet - threshold : array, shape=(n_shapelet) - An initialized (empty) value array for each shapelet - normalize : array, shape=(n_shapelet) - The randomly initialized normalization indicator of each shapelet - - """ - - # Lengths of the shapelets - lengths = np.random.choice(shapelet_sizes, size=n_shapelets).astype(np.int64) - - # Channels (features to consider) - channels = np.zeros((n_shapelets, n_features), dtype=np.bool_) - n_channels = np.random.choice(np.arange(1,max_channels+1), size=n_shapelets) - - # Dilations - upper_bounds = np.log2(np.floor_divide(n_timestamps - 1, lengths - 1)) - powers = np.empty(n_shapelets) - for i in prange(n_shapelets): - powers[i] = np.random.uniform(0, upper_bounds[i]) - - i_channels = np.random.choice( - n_features, size=n_channels[i], replace=False - ) - for j in i_channels: - channels[i, j] = True - - - dilations = np.floor(np.power(2, powers)).astype(np.int64) - - # Init threshold array - threshold = np.zeros(n_shapelets, dtype=np.float64) - - # Init values array - values = np.zeros( - (n_shapelets, n_features, np.int64(np.max(shapelet_sizes))), - dtype=np.float64) - - # Is shapelet using z-normalization ? - normalize = np.random.random(size=n_shapelets) - normalize = (normalize < p_norm).astype(np.float64) - - return values, lengths, dilations, threshold, normalize, channels - - -@njit(cache=True, fastmath=True, error_model='numpy') -def _get_subsequence(X, i_start, l, d, normalize): - """ - Given a set of length and dilation, fetch a subsequence from an input - time series given a starting index. - - Parameters - ---------- - X : array, shape=(n_timestamps) - Input time series. - i_start : int - A starting index between [0, n_timestamps - (l-1)*d] - l : int - Length parameter. - d : int - Dilation parameter. 
- normalize : float - Boolean converted as float (to avoid if statement) indicating - wheter or not the distance will be z-normalized - - Returns - ------- - v : array, shape=(l) - The resulting subsequence. - - """ - v = np.empty(l, dtype=np.float64) - _idx = i_start - _sum = 0 - _sum2 = 0 - for j in range(l): - v[j] = X[_idx] - _sum += X[_idx] - _sum2 += X[_idx]**2 - _idx += d - #0 if normalize, seems faster than adding a if statement - mean = (_sum/l)*normalize - _std = (_sum2/l) - (mean**2) - if _std < 0: - _std = 0 - std = (np.sqrt(_std)+1e-8)*normalize - # divided by 1 if non normalized - v = (v - mean)/(std + 1-normalize) - return v - - -@njit(cache=True, parallel=True) -def generate_shapelet(X, y, n_shapelets, shapelet_sizes, - seed, p_norm, p_min, p_max, X_len, max_channels): - """ - Given a time series dataset and parameters of the method, generate the - set of random shapelet that will be used in the rest of the algorithm. - - Parameters - ---------- - X : array, shape=(n_samples, n_features, n_timestamps) - Time series dataset - y : array, shape=(n_samples) - Class of each input time series - n_shapelets : int - Number of shapelet to generate - shapelet_sizes : array, shape=() - An array of possible shapelet length. - seed : int - Random seed generator for numpy - p_norm : float - Probability of each shapelet to use z-normalized distance - p_min : float - Lower bound for the percentile during the choice of threshold - p_max : float - Upper bound for the percentile during the choice of threshold - X_len : array, shape=(n_samples) - Real number of timestamp of each input time series - - Returns - ------- - set of arrays - Return the values, lengths, dilations, thresholds and normalization - indicators as array with first dimension of size (n_shapelets) - """ - n_samples, n_features, _ = X.shape - - # Fix the random see - np.random.seed(seed) - - values, lengths, dilations, threshold, normalize, channels = _init_random_shapelet_params( - n_shapelets, shapelet_sizes, X_len.min(), p_norm, n_features, max_channels - ) - - samples_pool = np.arange(X.shape[0]).astype(np.int64) - np.random.shuffle(samples_pool) - # For Values, draw from random uniform (0,n_samples*(n_ts-(l-1)*d)) - # for each l,d combinations. Then take by index the values instead - # of generating strides. - for i in prange(n_shapelets): - id_sample = samples_pool[i % X.shape[0]] - index = np.int64(np.random.choice( - X_len[id_sample] - (lengths[i]-1)*dilations[i] - )) - id_test = np.random.choice(np.where(y == y[id_sample])[0]) - - x_dist = np.zeros(X_len[id_test] - (lengths[i]-1)*dilations[i]) - for j in range(n_features): - if channels[i,j]: - values[i, j, :lengths[i]] = _get_subsequence( - X[id_sample, j, :X_len[id_sample]], - index, lengths[i], dilations[i], normalize[i] - ) - x_dist += compute_shapelet_dist_vector( - X[id_test, j, :X_len[id_test]], - values[i,j], lengths[i], dilations[i], normalize[i] - ) - - threshold[i] = np.random.uniform( - np.percentile(x_dist, p_min), np.percentile(x_dist, p_max) - ) - - return values, lengths, dilations, threshold, normalize.astype(np.int64), channels - - -@njit(cache=True, parallel=True, fastmath=True, error_model='numpy') -def apply_all_shapelets(X, values, lengths, dilations, threshold, - normalize, channels, X_len): - """ - Apply a set of generated shapelet using the parameter arrays previously - generated to a set of time series. 
- - Parameters - ---------- - X : array, shape=(n_samples, n_features, n_timestamps) - Input time series - values : array, shape=(n_shapelets, max(shapelet_sizes)) - Values of the shapelets. If the shapelet use z-normalized distance, - those values are already z-normalized by the shapelet - initialization step. - lengths : array, shape=(n_shapelets) - Length parameter of the shapelets - dilations : array, shape=(n_shapelets) - Dilation parameter of the shapelets - threshold : array, shape=(n_shapelets) - Threshold parameter of the shapelets - normalize : array, shape=(n_shapelets) - Normalization indicator of the shapelets - channels : array, shape=(n_shapelets, n_features) - Features targeted by each shapelet - X_len : array, shape=(n_samples) - Real number of timestamp of each input time series - - Returns - ------- - X_new : array, shape=(n_samples, 3*n_shapelets) - The transformed input time series with each shapelet extracting 3 - feature from the distance vector computed on each time series. - - """ - n_samples, n_ft, _ = X.shape - n_shapelets = len(lengths) - n_features = 3 - - unique_lengths = np.unique(lengths) - unique_dilations = np.unique(dilations) - - X_new = np.empty((n_samples, n_features * n_shapelets), dtype=np.float64) - for index_l in range(unique_lengths.shape[0]): - l = unique_lengths[index_l] - for index_d in prange(unique_dilations.shape[0]): - d = unique_dilations[index_d] - ix_shapelets = np.where((lengths == l) & (dilations == d))[0] - if len(ix_shapelets) > 0: - for i in prange(n_samples): - d_shape = X_len[i] - (l-1)*d - X_dist = np.zeros((len(ix_shapelets), d_shape)) - for i_ft in range(n_ft): - - X_sample = np.zeros((2, d_shape, l)) - strides = generate_strides_1D(X[i, i_ft, :X_len[i]], l, d) - X_sample[0] = strides - X_sample[1] = strides - for j in range(d_shape): - X_sample[1, j] = (X_sample[1, j] - np.mean(X_sample[1, j]))/( - np.std(X_sample[1, j])+1e-8 - ) - - for j in range(len(ix_shapelets)): - i_shp = ix_shapelets[j] - if channels[i_shp,i_ft]: - X_dist[j] += _get_dist_vect(X_sample[normalize[i_shp]], values[i_shp, i_ft]) - - for j in range(len(ix_shapelets)): - i_shp = ix_shapelets[j] - X_new[i, (n_features * i_shp):(n_features * i_shp + n_features)] = extract_features( - X_dist[j], threshold[i_shp] - ) - return X_new - -@njit(fastmath=True, cache=True, error_model='numpy') -def _get_dist_vect(x, values): - n_candidates, length = x.shape - x_dist = np.zeros(n_candidates) - #For each step of the moving window in the shapelet distance - for i in range(n_candidates): - _dist = 0 - #For each value of the shapelet - for j in prange(length): - _dist += abs(x[i, j] - values[j]) - x_dist[i] += _dist - return x_dist - -@njit(fastmath=True, cache=True) -def extract_features(x, threshold): - """ - Extract the three features from the distance between a shapelet and the - strides of an input time series generated by the length and dilation - parameter of the shapelet. If normalization should be used, all strides - of should be z-normalized independently. - - Parameters - ---------- - x : array, shape=(n_timestamps - (length-1)*dilation, length) - Strides of an input time series - values : array, shape=(max(shapelet_sizes)) - Values of the shapelet - threshold : float - The threshold to compute the shapelet occurence feature. 
-
-    Returns
-    -------
-    _min : float
-        The minimum of the distance vector between the shapelet and the input
-        time series.
-    float
-        The location of that minimum divided by the length of the distance
-        vector (i.e. scaled between [0,1]).
-    float
-        The number of points in the distance vector below the threshold,
-        divided by the length of the distance vector (i.e. scaled between [0,1]).
-
-    """
-
-    return np.min(x), np.argmin(x)/x.shape[0], np.sum(x < threshold)/x.shape[0]

[... text lost to extraction here: the estimator class used as FR_DST in the module docstring example (its __init__, fit and transform methods, and the start of _check_params); the surviving tail of _check_params resumes below ...]

-        """
-        raise ValueError("All the values in 'shapelet_sizes' must be lower "
-                         "than or equal to 'n_timestamps//2' ({} > {})."
-                         .format(shapelet_sizes.max(), n_timestamps//2))
-        """
-        rng = check_random_state(self.random_state)
-        seed = rng.randint(np.iinfo(np.uint32).max, dtype='u8')
-
-        return shapelet_sizes, seed
-
-    def _get_shp_params(self, id_shp):
-        # values, length, dilation, threshold, normalize
-        return (self.values_[id_shp], self.length_[id_shp],
-                self.dilation_[id_shp], self.threshold_[id_shp],
-                self.normalize_[id_shp])
-
-    def visualise_one_shapelet(self, id_shp, X, y, target_class, figs=(15, 10)):
-        # For visualisation, if argmin is important, draw a bar on x axis
-        # If min, highlight on series (red)
-        # If #match, highlight all parts which match on series (blue)
-        sns.set()
-        sns.set_context('talk')
-        fig, ax = plt.subplots(ncols=3, nrows=2, figsize=figs)
-        values, length, dilation, r, norm = self._get_shp_params(id_shp)
-        values = values[:length]
-        X_new = np.zeros((X.shape[0], 3))
-        yc = (y == target_class).astype(int)
-        for i in range(X.shape[0]):
-            x_dist = compute_shapelet_dist_vector(
-                X[i, 0], values, length, dilation, norm)
-            X_new[i, 0] = np.min(x_dist)
-            X_new[i, 1] = np.argmin(x_dist)
-            X_new[i, 2] = np.mean(x_dist < r)
-
-        sns.boxplot(x=yc, y=X_new[:, 0], ax=ax[0, 0])
-        sns.boxplot(x=yc, y=X_new[:, 1], ax=ax[0, 1])
-        sns.boxplot(x=yc, y=X_new[:, 2], ax=ax[0, 2])
-
-        i0 = np.random.choice(np.where(yc == 0)[0])
-        i1 = np.random.choice(np.where(yc == 1)[0])
-        ax[1, 1].scatter(np.arange(length)*dilation, values)
-        ax[1, 1].plot(np.arange(length)*dilation, values, linestyle='--')
-        ax[1, 2].plot(compute_shapelet_dist_vector(X[i1, 0], values, length, dilation, norm),
-                      c='C1', alpha=0.75, label='distance vector of sample of class {}'.format(target_class))
-        ax[1, 2].plot(compute_shapelet_dist_vector(X[i0, 0], values, length, dilation, norm),
-                      c='C0', alpha=0.75, label='distance vector of sample of class {}'.format(y[i0]))
-        ax[0, 0].set_xticks([0, 1])
-        ax[0, 0].set_xticklabels(
-            ['other classes', 'class {}'.format(target_class)])
-        ax[0, 1].set_xticks([0, 1])
-        ax[0, 1].set_xticklabels(
-            ['other classes', 'class {}'.format(target_class)])
-        ax[0, 2].set_xticks([0, 1])
-        ax[0, 2].set_xticklabels(
-            ['other classes', 'class {}'.format(target_class)])
-        ax[1, 0].set_xlabel('timestamps')
-        ax[1, 1].set_xlabel('timestamps')
-        ax[1, 2].set_xlabel('timestamps')
-        ix_start = X_new[i0, 1]
-        ix = np.arange(ix_start, ix_start+length)
-        if norm == 1:
-            ix = np.zeros(length)
-            for i in range(length):
-                ix[i] = ix_start + (i*dilation)
-            ix = ix.astype(int)
-            v = values[:length] * X[i0, 0, ix].std() + X[i0, 0, ix].mean()
-        else:
-            v = values[:length]
-        ax[1, 0].scatter(ix, v, c='C0', alpha=0.75)
-
-        ix_start = X_new[i1, 1]
-        ix = np.arange(ix_start, ix_start+length)
-        if norm == 1:
-            ix = np.zeros(length)
-            for i in range(length):
-                ix[i] = ix_start + (i*dilation)
-            ix = ix.astype(int)
-            v = values[:length] * X[i1, 0, ix].std() + X[i1, 0, ix].mean()
-        else:
-            v = values[:length]
-        ax[1, 0].scatter(ix, v, c='C1', alpha=0.75)
-
-        ax[1, 2].axhline(r, c='C2', linestyle='--')
-        """
-        ax[i,1].set_xticks(np.arange(length), rotation=70)
-
ax[i,1].set_xticklabels(np.arange(length)*dilation, rotation=70) - ax[i,2].plot(x_conv, color = 'C'+str(i)) - ax[i,2].axhline(r) - """ - ax[1, 0].plot(X[i1, 0], c='C1', alpha=0.75, - label='sample of class {}'.format(target_class)) - ax[1, 0].plot(X[i0, 0], c='C0', alpha=0.75, - label='sample of class {}'.format(y[i0])) - - ax[0, 0].set_title("Boxplot of min") - ax[1, 0].set_title("Location of the minimum") - ax[0, 1].set_title("Boxplot of argmin") - ax[1, 1].set_title("Shapelet n°{} (d={})".format(id_shp, dilation)) - ax[0, 2].set_title("Boxplot of shapelet occurences") - ax[1, 2].set_title("Distance vector and lambda threshold") - #ax[2,1].set_title("0 : {}; 1 : {}".format(str(X_new[i0,2])[0:5],str(X_new[i1,2])[0:5])) - ax[1, 0].legend() - ax[1, 1].legend() - #fig.suptitle("Shapelet l={}, d={}, n={}".format(length,dilation,norm)) - plt.show() diff --git a/convst/experimental/RDST_no_norm.py b/convst/experimental/RDST_no_norm.py deleted file mode 100644 index 0d93070..0000000 --- a/convst/experimental/RDST_no_norm.py +++ /dev/null @@ -1,296 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Thu Nov 18 13:17:07 2021 - -@author: a694772 -""" -from numba import njit, prange -import numpy as np -import seaborn as sns -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_is_fitted, check_random_state -from convst.utils.checks_utils import check_array_3D, check_array_1D -from convst.utils.shapelets_utils import generate_strides_1D -from matplotlib import pyplot as plt - -@njit(fastmath=True, cache=True, error_model='numpy') -def compute_shapelet_dist_vector(x, values, length, dilation): - c = generate_strides_1D(x, length, dilation) - x_conv = np.empty(c.shape[0]) - for i in range(x_conv.shape[0]): - s = 0 - for j in range(length): - s += abs(c[i,j] - values[j]) - x_conv[i] = s - return x_conv - - -@njit(cache=True, parallel=True) -def _init_random_shapelet_params(n_shapelets, shapelet_sizes, n_timestamps): - # Lengths of the shapelets - lengths = np.random.choice(shapelet_sizes, size=n_shapelets).astype(np.int64) - - # Dilations - upper_bounds = np.log2(np.floor_divide(n_timestamps - 1, lengths - 1)) - powers = np.empty(n_shapelets) - for i in prange(n_shapelets): - powers[i] = np.random.uniform(0, upper_bounds[i]) - dilations = np.floor(np.power(2, powers)).astype(np.int64) - - # Init ranges array - ranges = np.zeros(n_shapelets,dtype=np.float64) - - # Init values array - values = np.zeros((n_shapelets, np.int64(np.max(shapelet_sizes))), - dtype=np.float64) - - return values, lengths, dilations, ranges - -@njit(cache=True, fastmath=True, error_model='numpy') -def _get_subsequence(X, i_start, l, d): - v = np.empty(l,dtype=np.float64) - _idx = i_start - for j in range(l): - v[j] = X[_idx] - _idx += d - return v - -@njit(cache=True, parallel=True) -def generate_shapelet(X, y, n_shapelets, shapelet_sizes, seed, p_min, p_max): - n_samples, n_features, n_timestamps = X.shape - - # Fix the random see - np.random.seed(seed) - - values, lengths, dilations, ranges = _init_random_shapelet_params( - n_shapelets, shapelet_sizes, n_timestamps - ) - - samples_pool = np.arange(X.shape[0]).astype(np.int64) - np.random.shuffle(samples_pool) - # For Values, draw from random uniform (0,n_samples*(n_ts-(l-1)*d)) - # for each l,d combinations. Then take by index the values instead - # of generating strides. 
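# [Editor's note] A minimal, self-contained sketch of the dilated distance
# vector computed by compute_shapelet_dist_vector above (pure NumPy, no
# numba): position i of the vector is the L1 distance between the shapelet
# values and the subsequence x[i], x[i+d], ..., x[i+(length-1)*d]. The
# function name is illustrative, not part of the original file.
import numpy as np

def dist_vector_sketch(x, values, length, dilation):
    n_out = x.shape[0] - (length - 1) * dilation
    out = np.empty(n_out)
    for i in range(n_out):
        # gather the dilated subsequence starting at i, then take L1 distance
        window = x[i:i + (length - 1) * dilation + 1:dilation]
        out[i] = np.abs(window - values[:length]).sum()
    return out

# e.g. a length-3 shapelet with dilation 2 over a toy series
dist_vector_sketch(np.arange(10.), np.array([0., 1., 2.]), 3, 2)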
- for i in prange(n_shapelets): - id_sample = samples_pool[i%X.shape[0]] - index = np.int64(np.random.choice( - n_timestamps - (lengths[i]-1)*dilations[i] - )) - v = _get_subsequence( - X[id_sample,0], index, lengths[i],dilations[i] - ) - - values[i, :lengths[i]] = v - - id_test=np.random.choice(np.where(y==y[id_sample])[0]) - - x_dist = compute_shapelet_dist_vector( - X[id_test,0], values[i], lengths[i], dilations[i] - ) - ranges[i] = np.random.uniform( - np.percentile(x_dist,p_min), np.percentile(x_dist,p_max) - ) - - return values, lengths, dilations, ranges - - -@njit(cache=True, parallel=True, fastmath=True, error_model='numpy') -def apply_all_shapelets(X, values, lengths, dilations, ranges): - n_samples, n_ft, n_timestamps = X.shape - n_shapelets = len(lengths) - n_features = 3 - - unique_lengths = np.unique(lengths) - unique_dilations = np.unique(dilations) - - X_new = np.empty((n_samples, n_features * n_shapelets), dtype=np.float64) - for index_l in range(unique_lengths.shape[0]): - l=unique_lengths[index_l] - for index_d in prange(unique_dilations.shape[0]): - d = unique_dilations[index_d] - ix_shapelets = np.where((lengths==l)&(dilations==d))[0] - if len(ix_shapelets)>0: - for i in prange(n_samples): - strides = generate_strides_1D(X[i,0], l, d) - for j in prange(len(ix_shapelets)): - i_shp = ix_shapelets[j] - X_new[i, (n_features * i_shp):(n_features * i_shp + n_features)] = apply_one_shapelet_one_sample( - strides, values[i_shp], ranges[i_shp] - ) - return X_new - -@njit(fastmath=True, cache=True) -def apply_one_shapelet_one_sample(x, values, r): - n_candidates, length = x.shape - - _n_match = 0 - _min = 1e+100 - _argmin = 0 - - #For each step of the moving window in the shapelet distance - for i in range(n_candidates): - _dist = 0 - #For each value of the shapelet - for j in prange(length): - _dist += abs(x[i,j] - values[j]) - - if _dist < _min: - _min = _dist - _argmin = i - - if _dist < r: - _n_match += 1 - - return _min, _argmin/n_candidates, _n_match/n_candidates - -class R_DST_NN(BaseEstimator, TransformerMixin): - def __init__(self, n_shapelets=10000, shapelet_sizes=[11], - random_state=None, percentiles=[5,10]): - self.n_shapelets = n_shapelets - self.shapelet_sizes = np.asarray(shapelet_sizes) - self.random_state = random_state - self.percentiles = percentiles - - def fit(self, X, y=None): - X = check_array_3D(X).astype(np.float64) - n_samples, n_features, n_timestamps = X.shape - if self.shapelet_sizes.dtype == float: - self.shapelet_sizes = np.floor(n_timestamps*self.shapelet_sizes) - for i in range(self.shapelet_sizes.shape[0]): - if self.shapelet_sizes[i] < 5: - self.shapelet_sizes[i] = 5 - shapelet_sizes, seed = self._check_params(n_timestamps) - # Generate the shapelets - - values, lengths, dilations, ranges = generate_shapelet( - X, y, self.n_shapelets, shapelet_sizes, seed, - self.percentiles[0], self.percentiles[1] - ) - self.values_ = values - self.length_ = lengths - self.dilation_ = dilations - self.ranges_ = ranges - - return self - - def transform(self, X): - X = check_array_3D(X).astype(np.float64) - check_is_fitted(self, ['values_', 'length_', 'dilation_', 'ranges_']) - X_new = apply_all_shapelets( - X, self.values_, self.length_, self.dilation_, self.ranges_ - ) - return X_new - - def _check_params(self, n_timestamps): - if not isinstance(self.n_shapelets, (int, np.integer)): - raise TypeError("'n_shapelets' must be an integer (got {})." 
- .format(self.n_shapelets)) - - if not isinstance(self.shapelet_sizes, (list, tuple, np.ndarray)): - raise TypeError("'shapelet_sizes' must be a list, a tuple or " - "an array (got {}).".format(self.shapelet_sizes)) - shapelet_sizes = check_array_1D(self.shapelet_sizes).astype(np.int64) - if not np.all(1 <= shapelet_sizes): - raise ValueError("All the values in 'shapelet_sizes' must be " - "greater than or equal to 1 ({} < 1)." - .format(shapelet_sizes.min())) - if not np.all(shapelet_sizes <= n_timestamps//2): - shapelet_sizes = shapelet_sizes[shapelet_sizes <= n_timestamps//2] - if shapelet_sizes.shape[0] == 0: - shapelet_sizes = np.array([3]) - """ - raise ValueError("All the values in 'shapelet_sizes' must be lower " - "than or equal to 'n_timestamps//2' ({} > {})." - .format(shapelet_sizes.max(), n_timestamps//2)) - """ - rng = check_random_state(self.random_state) - seed = rng.randint(np.iinfo(np.uint32).max, dtype='u8') - - return shapelet_sizes, seed - - - def _get_shp_params(self, id_shp): - #values, length, dilation, padding, range - return (self.values_[id_shp], self.length_[id_shp], - self.dilation_[id_shp], self.ranges_[id_shp], - self.normalize_[id_shp]) - - def visualise_one_shapelet(self, id_shp, X, y, target_class): - # For visualisation, if argmin is important, draw a bar on x axis - # If min, highligh on series (red) - # If #match, hihgligh all parts which match on series (blue) - sns.set() - sns.set_context('talk') - fig, ax = plt.subplots(ncols=2, nrows=3, figsize=(15,15)) - values, length, dilation, r, norm = self._get_shp_params(id_shp) - X_new = np.zeros((X.shape[0], 3)) - yc = (y==target_class).astype(int) - for i in range(X.shape[0]): - x_dist = compute_shapelet_dist_vector(X[i,0], values, length, dilation, norm) - X_new[i,0] = np.min(x_dist) - X_new[i,1] = np.argmin(x_dist) - X_new[i,2] = np.mean(x_dist < r) - - sns.boxplot(x=yc, y=X_new[:,0], ax=ax[0,0]) - sns.boxplot(x=yc, y=X_new[:,1], ax=ax[1,0]) - sns.boxplot(x=yc, y=X_new[:,2], ax=ax[2,0]) - - i0 = np.random.choice(np.where(yc==0)[0]) - i1 = np.random.choice(np.where(yc==1)[0]) - ax[0,1].plot(X[i1, 0], c='C1', alpha=0.75, label='sample of class {}'.format(target_class)) - ax[0,1].plot(X[i0, 0], c='C0', alpha=0.75, label='sample of class {}'.format(y[i0])) - ax[1,1].plot(X[i1, 0], c='C1', alpha=0.75, label='sample of class {}'.format(target_class)) - ax[1,1].plot(X[i0, 0], c='C0', alpha=0.75, label='sample of class {}'.format(y[i0])) - ax[2,1].plot(compute_shapelet_dist_vector(X[i1,0], values, length, dilation, norm), c='C1', alpha=0.75, label='distance vector of sample of class {}'.format(target_class)) - ax[2,1].plot(compute_shapelet_dist_vector(X[i0,0], values, length, dilation, norm), c='C0', alpha=0.75, label='distance vector of sample of class {}'.format(y[i0])) - ax[0,1].legend() - ax[1,1].legend() - ax[2,1].legend() - ax[0,0].set_xticks([0,1]) - ax[0,0].set_xticklabels(['other classes','class {}'.format(target_class)]) - ax[1,0].set_xticks([0,1]) - ax[1,0].set_xticklabels(['other classes','class {}'.format(target_class)]) - ax[2,0].set_xticks([0,1]) - ax[2,0].set_xticklabels(['other classes','class {}'.format(target_class)]) - - ix_start = X_new[i0, 1] - if norm == 1: - ix = np.zeros(length) - for i in range(length): - ix[i] = ix_start + (i*dilation) - ix = ix.astype(int) - v = values[:length] * X[i0, 0, ix].std() + X[i0, 0, ix].mean() - else: - v = values[:length] - ax[0,1].scatter(ix, v, c='C0', alpha=0.75) - - ix_start = X_new[i1, 1] - if norm == 1: - ix = np.zeros(length) - for i in 
range(length): - ix[i] = ix_start + (i*dilation) - ix = ix.astype(int) - v = values[:length] * X[i1, 0, ix].std() + X[i1, 0, ix].mean() - else: - v = values[:length] - ax[0,1].scatter(ix, v, c='C1', alpha=0.75) - - ax[1,1].scatter(int(X_new[i0,1]), X[i0,0, int(X_new[i0,1])], c='C0', alpha=0.75) - ax[1,1].scatter(int(X_new[i1,1]), X[i1,0, int(X_new[i1,1])], c='C1', alpha=0.75) - - ax[2,1].axhline(r, c='C2', linestyle='--') - """ - ax[i,1].set_xticks(np.arange(length), rotation=70) - ax[i,1].set_xticklabels(np.arange(length)*dilation, rotation=70) - ax[i,2].plot(x_conv, color = 'C'+str(i)) - ax[i,2].axhline(r) - """ - ax[0,0].set_title("Min") - ax[0,1].set_title("0 : {}; 1 : {}".format(str(X_new[i0,0])[0:5],str(X_new[i1,0])[0:5])) - ax[1,0].set_title("Argmin") - ax[1,1].set_title("0 : {}; 1 : {}".format(str(X_new[i0,1])[0:5],str(X_new[i1,1])[0:5])) - ax[2,0].set_title("# Match") - ax[2,1].set_title("0 : {}; 1 : {}".format(str(X_new[i0,2])[0:5],str(X_new[i1,2])[0:5])) - - fig.suptitle("Shapelet l={}, d={}, n={}".format(length,dilation,norm)) - plt.show() \ No newline at end of file diff --git a/convst/experimental/cst_c_ensemble.py b/convst/experimental/cst_c_ensemble.py deleted file mode 100644 index 3611a5e..0000000 --- a/convst/experimental/cst_c_ensemble.py +++ /dev/null @@ -1,125 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Fri Dec 10 15:00:13 2021 - -@author: a694772 - -from sklearn.metrics import accuracy_score, make_scorer - -import numpy as np - -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.utils.validation import check_is_fitted - -from sklearn.pipeline import make_pipeline -from sklearn.linear_model import RidgeClassifierCV -from sklearn.ensemble import RandomForestClassifier - -from convst.transformers import Raw, CST_Random -from joblib import Parallel, delayed -from numba import set_num_threads - -#from convst.classifiers.rotationforest import RotationForest -from sklearn.base import clone -from sklearn.model_selection import cross_val_score - -def _parallel_fit(estimator, X, y, n_thread_per_job, nn_cv=None): - set_num_threads(n_thread_per_job) - X_cv = estimator[:-1].fit_transform(X, y) - estimator[-1].fit(X_cv, y) - n_cv = max(2,np.bincount(y).max()) - if nn_cv: - score = np.mean(cross_val_score( - clone(estimator[-1]), X_cv, y, cv=min(n_cv,nn_cv), - scoring=make_scorer(accuracy_score), - n_jobs=min(n_cv, n_thread_per_job)) - ) - else: - score = 1 - return estimator, score - -def _parallel_predict(estimator, X, n_thread_per_job): - set_num_threads(n_thread_per_job) - return estimator.predict_proba(X) - -class RidgeClassifierCV_Proba(RidgeClassifierCV): - def predict_proba(self, X): - d = self.decision_function(X) - n_classes = self.classes_.size - d = np.exp(d) / (1+np.exp(d)) - if n_classes == 2: - return np.vstack((1-d, d)).T - return d - -class CST_EC(BaseEstimator, ClassifierMixin): - def __init__(self, n_shapelets=10000, transformations=[Raw()], - shapelet_sizes=[7,9,11], random_state=None, - n_jobs=1, n_thread_per_job=1, - P=98, add_padding=True, BaseClassifier=None, - z_norm_input=True): - self.n_shapelets = n_shapelets - self.shapelet_sizes = shapelet_sizes - self.random_state = random_state - if BaseClassifier is None : - self.BaseClassifier = RandomForestClassifier( - n_estimators=200, ccp_alpha=0.005, n_jobs=n_thread_per_job - ) - - else: - self.BaseClassifier = BaseClassifier - self.transformations = np.asarray(transformations) - self.n_jobs = n_jobs - self.n_thread_per_job = n_thread_per_job - self.P = P - self.add_padding = 
add_padding - self.z_norm_input = z_norm_input - - def _initialize_estimator_pool(self): - n_trans = self.transformations.size - estimators = np.empty(n_trans, dtype=object) - n_shapelets_per_estimator = self.n_shapelets // n_trans - for i in range(n_trans): - estimators[i] = make_pipeline( - clone(self.transformations[i]), - clone(CST_Random(n_shapelets=n_shapelets_per_estimator, - shapelet_sizes=self.shapelet_sizes, - P = self.P, z_norm_input=self.z_norm_input, - add_padding = self.add_padding)), - clone(self.BaseClassifier) - ) - return estimators - - def fit(self, X, y, n_cv=10): - estimators = self._initialize_estimator_pool() - estimators = np.asarray(Parallel( - n_jobs=self.n_jobs, - )( - delayed(_parallel_fit)( - e, X, y, self.n_thread_per_job, nn_cv=n_cv - ) - for i, e in enumerate(estimators) - ),dtype=object) - - self.estimators_ = estimators[:,0] - self.cv_scores_ = estimators[:,1].astype(float) - return self - - def predict(self, X, alpha=4): - check_is_fitted(self, ['estimators_', 'cv_scores_']) - preds = np.asarray(Parallel( - n_jobs=self.n_jobs, - )( - delayed(_parallel_predict)( - e, X, self.n_thread_per_job - ) - for i, e in enumerate(self.estimators_) - )) - preds = preds ** alpha - #n_estim - for i in range(preds.shape[0]): - preds[i,:] *= self.cv_scores_[i] - return np.argmax(preds.sum(axis=0),axis=1) - - def get_shapelet_importances(self): - pass -""" \ No newline at end of file diff --git a/convst/experimental/cst_c_random.py b/convst/experimental/cst_c_random.py deleted file mode 100644 index e1a9d4f..0000000 --- a/convst/experimental/cst_c_random.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Fri Dec 10 15:00:13 2021 -DSTEC -@author: a694772 - -import numpy as np - -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.utils.validation import check_is_fitted -from sklearn.pipeline import make_pipeline -from sklearn.base import clone -from sklearn.linear_model import RidgeClassifierCV -from sklearn.preprocessing import StandardScaler - -from convst.transformers import Raw, R_DST, FourrierCoefs, Derivate -from joblib import Parallel, delayed -from numba import set_num_threads - -def _parallel_transformer_fit(transformer, X, y, n_thread_per_job): - set_num_threads(n_thread_per_job) - return transformer.fit(X, y) - -def _parallel_transformer_transform(transformer, X, n_thread_per_job): - set_num_threads(n_thread_per_job) - return transformer.transform(X) - -class CST_C_Random(BaseEstimator, ClassifierMixin): - def __init__(self, - input_dict = { - '0': - {'transformer':Raw(),'n_shapelets':8000, - 'p_norm':0.9, 'shapelet_sizes':[7,9,11]}, - '1': - {'transformer':FourrierCoefs(),'n_shapelets':1000, - 'p_norm':0.1, 'shapelet_sizes':[3,5,7]}, - '2': - {'transformer':Derivate(),'n_shapelets':1000, - 'p_norm':0.75, 'shapelet_sizes':[7,9,11]}, - }, - random_state=None, - n_jobs=1, n_thread_per_job=1, - BaseClassifier=None, - ): - - self.input_dict = input_dict - self.random_state = random_state - if BaseClassifier is None : - self.BaseClassifier = make_pipeline( - StandardScaler(with_mean=False), - RidgeClassifierCV(alphas=np.logspace(-6,6,20)) - ) - else: - self.BaseClassifier = BaseClassifier - self.n_jobs = n_jobs - self.n_thread_per_job = n_thread_per_job - - def _initialize_transformer_pool(self): - keys = list(self.input_dict.keys()) - n_trans = len(keys) - transformers = np.empty(n_trans, dtype=object) - for i in range(n_trans): - key = keys[i] - transformers[i] = make_pipeline( - clone(self.input_dict[key]['transformer']), 
- clone(R_DST(n_shapelets=self.input_dict[key]['n_shapelets'], - shapelet_sizes=self.input_dict[key]['shapelet_sizes'], - p_norm=self.input_dict[key]['p_norm'])) - ) - return transformers - - def fit(self, X, y): - transformers = self._initialize_transformer_pool() - transformers = Parallel( - n_jobs=self.n_jobs, - )( - delayed(_parallel_transformer_fit)( - e, X, y, self.n_thread_per_job - ) - for e in transformers - ) - self.transformers = transformers - X = self.transform(X) - self.BaseClassifier.fit(X, y) - return self - - def transform(self, X): - check_is_fitted(self, ['BaseClassifier', 'transformers']) - X_new = Parallel( - n_jobs=self.n_jobs, - )( - delayed(_parallel_transformer_transform)( - e, X, self.n_thread_per_job - ) - for e in self.transformers - ) - return np.concatenate(X_new, axis=1) - - def predict(self, X, alpha=4): - check_is_fitted(self, ['BaseClassifier', 'transformers']) - X = self.transform(X) - return self.BaseClassifier.predict(X) -""" diff --git a/convst/experimental/dtw.py b/convst/experimental/dtw.py deleted file mode 100644 index ae17398..0000000 --- a/convst/experimental/dtw.py +++ /dev/null @@ -1,252 +0,0 @@ -import numpy as np -from numba import njit, prange -from scipy.stats import mode -from sklearn.metrics import accuracy_score - -__all__ = ['dtw_distance','dtw_distance_set','CF','CE','self_dtwcid_distance'] - -@njit(cache=True, fastmath=True) -def CF(Q,C): - ce_q = CE(Q) - ce_c = CE(C) - return max(ce_q, ce_c)/min(ce_q, ce_c) - -@njit(cache=True, fastmath=True) -def CE(Q): - _sum = 0 - for i in prange(Q.shape[0]-1): - _sum += (Q[i] - Q[i+1])**2 - return np.sqrt(_sum) - -@njit(parallel=True, nogil=True) -def self_dtwcid_distance(dataset1): - """ - Computes the dataset DTW distance matrix using multiprocessing. - Args: - dataset1: timeseries dataset of shape [N1, T1] - dataset2: timeseries dataset of shape [N2, T2] - Returns: - Distance matrix of shape [N1, N2] - """ - n1 = dataset1.shape[0] - dist = np.empty((n1, n1), dtype=np.float64) - for i in prange(n1): - for j in prange(i+1,n1): - d = dtw_distance(dataset1[i], dataset1[j]) - c = CF(dataset1[i], dataset1[j]) - d *= c - dist[i,j] = d - dist[j,i] = d - return dist - -@njit(parallel=True, nogil=True) -def dtw_distance_set(dataset1, dataset2): - """ - Computes the dataset DTW distance matrix using multiprocessing. - Args: - dataset1: timeseries dataset of shape [N1, T1] - dataset2: timeseries dataset of shape [N2, T2] - Returns: - Distance matrix of shape [N1, N2] - """ - n1 = dataset1.shape[0] - n2 = dataset2.shape[0] - dist = np.empty((n1, n2), dtype=np.float64) - - for i in prange(n1): - for j in prange(n2): - dist[i][j] = dtw_distance(dataset1[i], dataset2[j]) - - return dist - - -@njit(cache=True) -def dtw_distance(series1, series2): - """ - Returns the DTW similarity distance between two 1-D - timeseries numpy arrays. - Args: - series1, series2 : array of shape [n_timepoints] - Two arrays containing n_samples of timeseries data - whose DTW distance between each sample of A and B - will be compared. 
- Returns: - DTW distance between A and B - """ - l1 = series1.shape[0] - l2 = series2.shape[0] - E = np.empty((l1, l2)) - - # Fill First Cell - v = series1[0] - series2[0] - E[0][0] = v * v - - # Fill First Column - for i in range(1, l1): - v = series1[i] - series2[0] - E[i][0] = E[i - 1][0] + v * v - - # Fill First Row - for i in range(1, l2): - v = series1[0] - series2[i] - E[0][i] = E[0][i - 1] + v * v - - for i in range(1, l1): - for j in range(1, l2): - v = series1[i] - series2[j] - v = v * v - - v1 = E[i - 1][j] - v2 = E[i - 1][j - 1] - v3 = E[i][j - 1] - - if v1 <= v2 and v1 <= v3: - E[i][j] = v1 + v - elif v2 <= v1 and v2 <= v3: - E[i][j] = v2 + v - else: - E[i][j] = v3 + v - - return np.sqrt(E[-1][-1]) - - -# Modified from https://github.com/markdregan/K-Nearest-Neighbors-with-Dynamic-Time-Warping -class KnnDTW(object): - """K-nearest neighbor classifier using dynamic time warping - as the distance measure between pairs of time series arrays - Arguments - --------- - n_neighbors : int, optional (default = 1) - Number of neighbors to use by default for KNN - """ - - def __init__(self, n_neighbors=1): - self.n_neighbors = n_neighbors - - def fit(self, x, y): - """Fit the model using x as training data and y as class labels - Arguments - --------- - x : array of shape [n_samples, n_timepoints] - Training data set for input into KNN classifer - y : array of shape [n_samples] - Training labels for input into KNN classifier - """ - - self.x = np.copy(x) - self.y = np.copy(y) - - def _dist_matrix(self, x, y): - """Computes the M x N distance matrix between the training - dataset and testing dataset (y) using the DTW distance measure - Arguments - --------- - x : array of shape [n_samples, n_timepoints] - y : array of shape [n_samples, n_timepoints] - Returns - ------- - Distance matrix between each item of x and y with - shape [training_n_samples, testing_n_samples] - """ - dm = dtw_distance_set(x, y) - - return dm - - def predict(self, x): - """Predict the class labels or probability estimates for - the provided data - Arguments - --------- - x : array of shape [n_samples, n_timepoints] - Array containing the testing data set to be classified - Returns - ------- - 2 arrays representing: - (1) the predicted class labels - (2) the knn label count probability - """ - np.random.seed(0) - dm = self._dist_matrix(x, self.x) - - # Identify the k nearest neighbors - knn_idx = dm.argsort()[:, :self.n_neighbors] - - # Identify k nearest labels - knn_labels = self.y[knn_idx] - - # Model Label - mode_data = mode(knn_labels, axis=1) - mode_label = mode_data[0] - mode_proba = mode_data[1] / self.n_neighbors - - return mode_label.ravel(), mode_proba.ravel() - - def evaluate(self, x, y): - """ - Predict the class labels or probability estimates for - the provided data and then evaluates the accuracy score. 
- Arguments - --------- - x : array of shape [n_samples, n_timepoints] - Array containing the testing data set to be classified - y : array of shape [n_samples] - Array containing the labels of the testing dataset to be classified - Returns - ------- - 1 floating point value representing the accuracy of the classifier - """ - # Predict the labels and the probabilities - pred_labels, pred_probas = self.predict(x) - - # Ensure labels are integers - y = y.astype('int32') - pred_labels = pred_labels.astype('int32') - - # Compute accuracy measure - accuracy = accuracy_score(y, pred_labels) - return accuracy - - def predict_proba(self, x): - """Predict the class labels probability estimates for - the provided data - Arguments - --------- - x : array of shape [n_samples, n_timepoints] - Array containing the testing data set to be classified - Returns - ------- - 2 arrays representing: - (1) the predicted class probabilities - (2) the knn labels - """ - np.random.seed(0) - dm = self._dist_matrix(x, self.x) - - # Invert the distance matrix - dm = -dm - - classes = np.unique(self.y) - class_dm = [] - - # Partition distance matrix by class - for i, cls in enumerate(classes): - idx = np.argwhere(self.y == cls)[:, 0] - cls_dm = dm[:, idx] # [N_test, N_train_c] - - # Take maximum distance vector due to softmax probabilities - cls_dm = np.max(cls_dm, axis=-1) # [N_test,] - - class_dm.append([cls_dm]) - - # Concatenate the classwise distance matrices and transpose - class_dm = np.concatenate(class_dm, axis=0) # [C, N_test] - class_dm = class_dm.transpose() # [N_test, C] - - # Compute softmax probabilities - class_dm_exp = np.exp(class_dm - class_dm.max()) - class_dm = class_dm_exp / np.sum(class_dm_exp, axis=-1, keepdims=True) - - probabilities = class_dm - knn_labels = np.argmax(class_dm, axis=-1) - - return probabilities, knn_labels \ No newline at end of file diff --git a/convst/experimental/input_transformers.py b/convst/experimental/input_transformers.py deleted file mode 100644 index 222eee5..0000000 --- a/convst/experimental/input_transformers.py +++ /dev/null @@ -1,154 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Fri Dec 10 15:03:19 2021 - -@author: a694772 -""" -from sklearn.base import BaseEstimator, TransformerMixin -from convst.utils.checks_utils import check_array_3D -from numba import vectorize, int32, int64, float32, float64, njit, prange -from pyts.approximation import DiscreteFourierTransform, SymbolicAggregateApproximation -import numpy as np -from scipy.signal import periodogram - -@vectorize([int32(int32, int32), int64(int64, int64), - float32(float32, float32), float64(float64, float64)], - nopython=True, cache=True, fastmath=True) -def _diff(a, b): - return a - b - -@njit(cache=True) -def z_norm_one_sample(x): - n_features, n_timestamps = x.shape - x_new = np.empty((n_features, n_timestamps)) - for i in prange(n_features): - x_new[i] = (x[i] - x[i].mean()) / (x[i].std() + 1e-8) - return x_new - -@njit(cache=True, parallel=True) -def z_norm_all_samples(X): - n_samples, n_features, n_timestamps = X.shape - X_new = np.empty((n_samples, n_features, n_timestamps)) - for i in prange(n_samples): - X_new[i] = z_norm_one_sample(X[i]) - return X_new - -class Z_normalizer(BaseEstimator, TransformerMixin): - def __init__(self): - self.is_univariate=False - - def fit(self, X, y=None): - return self - - def transform(self, X): - X = check_array_3D(X, is_univariate=self.is_univariate) - return z_norm_all_samples(X) - -class Raw(BaseEstimator, TransformerMixin): - def __init__(self): - 
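# [Editor's note] The z_norm_one_sample/z_norm_all_samples helpers above
# normalise each channel of each sample independently. A hedged NumPy
# equivalent for a single (n_features, n_timestamps) sample, without numba
# (the function name is illustrative):
import numpy as np

def z_norm_sketch(x):
    mean = x.mean(axis=1, keepdims=True)
    std = x.std(axis=1, keepdims=True) + 1e-8  # same epsilon guard as above
    return (x - mean) / std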
self.is_univariate=False - - def fit(self, X, y=None): - return self - - def transform(self, X): - X = check_array_3D(X, is_univariate=self.is_univariate) - return X - -class Derivate(BaseEstimator, TransformerMixin): - def __init__(self, order=1, random=False): - self.is_univariate=False - self.order = order - self.random = random - - def fit(self, X, y=None): - if self.random: - self._random_init() - return self - - def transform(self, X): - X = check_array_3D(X, is_univariate=self.is_univariate) - for i in range(self.order): - if X.shape[0]>3: - X = _diff(X[:,:,1:], X[:,:,:-1]) - return X - - def _random_init(self): - self.set_params(**{"order":np.random.choice(np.arange(1,5))}) - -class Periodigram(BaseEstimator, TransformerMixin): - def __init__(self, window_type="boxcar", random=False): - self.window_type = window_type - self.random = random - self.is_univariate=False - - def fit(self, X, y=None): - return self - - def transform(self, X): - X = check_array_3D(X, is_univariate=self.is_univariate)[:,0,:] - if self.random: - self._random_init() - return periodogram(X, detrend=False, window=self.window_type)[1][:, np.newaxis, :] - - def _random_init(self): - self.set_params(**{"window_type":np.random.choice(self._get_windows())}) - - def _get_windows(self): - return np.asarray( - ["boxcar","triang","blackman","hamming","hann", - "bartlett","flattop","parzen","bohman", - "blackmanharris","nuttall","barthann", - "cosine","exponential","tukey","taylor"] - ) - -class Sax(BaseEstimator, TransformerMixin): - def __init__(self, n_bins=10, strategy="quantile", random=False): - # To fix __repr__ ... - self.random = random - self.n_bins = n_bins - self.strategy = strategy - self.is_univariate = True - - def fit(self, X, y=None): - X = check_array_3D(X, is_univariate=self.is_univariate)[:,0,:] - if self.random: - self._random_init(X.shape[1]) - self.transformer = SymbolicAggregateApproximation( - n_bins=self.n_bins, strategy=self.strategy, alphabet='ordinal' - ) - self.transformer.fit(X) - return self - - def transform(self, X): - X = check_array_3D(X, is_univariate=self.is_univariate)[:,0,:] - X = self.transformer.transform(X) - return X[:, np.newaxis, :] - - def _random_init(self, n_timestamps): - self.set_params(**{"n_bins":np.random.choice(np.arange(2,min(n_timestamps,26)))}) - -class FourrierCoefs(BaseEstimator, TransformerMixin): - def __init__(self, n_coefs=None, drop_sum=False, anova=False, - norm_mean=False, norm_std=False): - # To fix __repr__ ... 
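# [Editor's note] For reference, the Periodigram transformer above boils down
# to scipy.signal.periodogram applied along the time axis; a hedged usage
# sketch, following this module's (n_samples, 1, n_timestamps) convention
# (demo variable names are illustrative):
import numpy as np
from scipy.signal import periodogram

X_demo = np.random.randn(4, 1, 128)
freqs, pxx = periodogram(X_demo[:, 0, :], detrend=False, window="boxcar")
X_trans = pxx[:, np.newaxis, :]  # back to a 3D (n_samples, 1, n_freqs) array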
- self.n_coefs = n_coefs - self.drop_sum = drop_sum - self.anova = anova - self.norm_mean = norm_mean - self.norm_std = norm_std - self.is_univariate=True - - def fit(self, X, y=None): - X = check_array_3D(X, is_univariate=self.is_univariate)[:,0,:] - self.transformer = DiscreteFourierTransform( - n_coefs=self.n_coefs, drop_sum=self.drop_sum, anova=self.anova, - norm_std=self.norm_std, norm_mean=self.norm_mean, - ) - self.transformer.fit(X) - return self - - def transform(self, X): - X = check_array_3D(X, is_univariate=self.is_univariate)[:,0,:] - X = self.transformer.transform(X) - return X[:, np.newaxis, :] \ No newline at end of file diff --git a/convst/experimental/rotationforest.py b/convst/experimental/rotationforest.py deleted file mode 100644 index a09f6fe..0000000 --- a/convst/experimental/rotationforest.py +++ /dev/null @@ -1,484 +0,0 @@ -# -*- coding: utf-8 -*- - -import time - -import numpy as np -from joblib import Parallel, delayed -from sklearn.base import BaseEstimator -from sklearn.decomposition import PCA -from sklearn.tree import DecisionTreeClassifier -from sklearn.utils import check_random_state, check_X_y - -from sktime.base._base import _clone_estimator -from sktime.exceptions import NotFittedError -from sktime.utils.validation import check_n_jobs - -class RotationForest(BaseEstimator): - """Rotation Forest Classifier. - Implementation of the Rotation Forest classifier described in Rodriguez et al - (2013). [1]_ - Intended as a benchmark for time series data and a base classifier for - transformation based appraoches such as ShapeletTransformClassifier, this sktime - implementation only works with continuous attributes. - Parameters - ---------- - n_estimators : int, default=200 - Number of estimators to build for the ensemble. - min_group : int, default=3 - The minimum size of a group. - max_group : int, default=3 - The maximum size of a group. - remove_proportion : float, default=0.5 - The proportion of cases to be removed. - base_estimator : BaseEstimator or None, default="None" - Base estimator for the ensemble. By default uses the sklearn - DecisionTreeClassifier using entropy as a splitting measure. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding n_estimators. - Default of 0 means n_estimators is used. - contract_max_n_estimators : int, default=500 - Max number of estimators when time_limit_in_minutes is set. - save_transformed_data : bool, default=False - Save the data transformed in fit for use in _get_train_probs. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - random_state : int or None, default=None - Seed for random number generation. - Attributes - ---------- - n_classes : int - The number of classes. - n_instances : int - The number of train cases. - n_atts : int - The number of attributes in each train case. - classes_ : list - The classes labels. - estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. - transformed_data : list of shape (n_estimators) of ndarray - The transformed dataset for all classifiers. Only saved when - save_transformed_data is true. - See Also - -------- - ShapeletTransformClassifier - Notes - ----- - For the Java version, see - `TSML `_. - References - ---------- - .. [1] Rodriguez, Juan José, Ludmila I. Kuncheva, and Carlos J. Alonso. "Rotation - forest: A new classifier ensemble method." 
IEEE transactions on pattern analysis - and machine intelligence 28.10 (2006). - .. [2] Bagnall, A., et al. "Is rotation forest the best classifier for problems - with continuous features?." arXiv preprint arXiv:1809.06705 (2018). - Examples - -------- - >>> from sktime.contrib.vector_classifiers._rotation_forest import RotationForest - >>> from sktime.datasets import load_unit_test - >>> from sktime.datatypes._panel._convert import from_nested_to_3d_numpy - >>> X_train, y_train = load_unit_test(split="train", return_X_y=True) - >>> X_test, y_test = load_unit_test(split="test", return_X_y=True) - >>> X_train = from_nested_to_3d_numpy(X_train) - >>> X_test = from_nested_to_3d_numpy(X_test) - >>> clf = RotationForest(n_estimators=10) - >>> clf.fit(X_train, y_train) - RotationForest(...) - >>> y_pred = clf.predict(X_test) - """ - - def __init__( - self, - n_estimators=200, - min_group=3, - max_group=3, - remove_proportion=0.5, - base_estimator=None, - time_limit_in_minutes=0.0, - contract_max_n_estimators=500, - save_transformed_data=False, - n_jobs=1, - random_state=None, - ): - self.n_estimators = n_estimators - self.min_group = min_group - self.max_group = max_group - self.remove_proportion = remove_proportion - self.base_estimator = base_estimator - - self.time_limit_in_minutes = time_limit_in_minutes - self.contract_max_n_estimators = contract_max_n_estimators - self.save_transformed_data = save_transformed_data - - self.n_jobs = n_jobs - self.random_state = random_state - - self.n_classes = 0 - self.n_instances = 0 - self.n_atts = 0 - self.classes_ = [] - self.estimators_ = [] - self.transformed_data = [] - - self._n_estimators = n_estimators - self._base_estimator = base_estimator - self._min = 0 - self._ptp = 0 - self._useful_atts = [] - self._pcas = [] - self._groups = [] - self._class_dictionary = {} - self._n_jobs = n_jobs - self._n_atts = 0 - # We need to add is-fitted state when inheriting from scikit-learn - self._is_fitted = False - - super(RotationForest, self).__init__() - - def fit(self, X, y): - """Fit a forest of trees on cases (X,y), where y is the target variable. - Parameters - ---------- - X : ndarray of shape = [n_instances,n_attributes] - The training input samples. - y : array-like, shape = [n_instances] - The class labels. - Returns - ------- - self : object - """ - if isinstance(X, np.ndarray) and len(X.shape) == 3 and X.shape[1] == 1: - X = np.reshape(X, (X.shape[0], -1)) - elif not isinstance(X, np.ndarray) or len(X.shape) > 2: - raise ValueError( - "RotationForest is not a time series classifier. " - "A 2d numpy array is required." 
- ) - X, y = check_X_y(X, y) - - self._n_jobs = check_n_jobs(self.n_jobs) - - self.n_instances, self.n_atts = X.shape - self.classes_ = np.unique(y) - self.n_classes = self.classes_.shape[0] - for index, classVal in enumerate(self.classes_): - self._class_dictionary[classVal] = index - - time_limit = self.time_limit_in_minutes * 60 - start_time = time.time() - train_time = 0 - - if self.base_estimator is None: - self._base_estimator = DecisionTreeClassifier(criterion="entropy") - - # replace missing values with 0 and remove useless attributes - X = np.nan_to_num(X, False, 0, 0, 0) - self._useful_atts = ~np.all(X[1:] == X[:-1], axis=0) - X = X[:, self._useful_atts] - - self._n_atts = X.shape[1] - - # normalise attributes - self._min = X.min(axis=0) - self._ptp = X.max(axis=0) - self._min - X = (X - self._min) / self._ptp - - X_cls_split = [X[np.where(y == i)] for i in self.classes_] - - if time_limit > 0: - self._n_estimators = 0 - self.estimators_ = [] - self._pcas = [] - self._groups = [] - - while ( - train_time < time_limit - and self._n_estimators < self.contract_max_n_estimators - ): - fit = Parallel(n_jobs=self._n_jobs)( - delayed(self._fit_estimator)( - X, - X_cls_split, - y, - i, - ) - for i in range(self._n_jobs) - ) - - estimators, pcas, groups, transformed_data = zip(*fit) - - self.estimators_ += estimators - self._pcas += pcas - self._groups += groups - self.transformed_data += transformed_data - - self._n_estimators += self._n_jobs - train_time = time.time() - start_time - else: - fit = Parallel(n_jobs=self._n_jobs)( - delayed(self._fit_estimator)( - X, - X_cls_split, - y, - i, - ) - for i in range(self._n_estimators) - ) - - self.estimators_, self._pcas, self._groups, self.transformed_data = zip( - *fit - ) - - self._is_fitted = True - return self - - def predict(self, X): - """Predict for all cases in X. Built on top of predict_proba. - Parameters - ---------- - X : ndarray of shape = [n_instances,n_attributes] - Returns - ------- - output : array of shape = [n_test_instances] - """ - rng = check_random_state(self.random_state) - return np.array( - [ - self.classes_[int(rng.choice(np.flatnonzero(prob == prob.max())))] - for prob in self.predict_proba(X) - ] - ) - - def predict_proba(self, X): - """Probability estimates for each class for all cases in X. - Parameters - ---------- - X : ndarray of shape = [n_instances,n_attributes] - Returns - ------- - output : array of shape = [n_test_instances, num_classes] of - probabilities - """ - if not self._is_fitted: - raise NotFittedError( - f"This instance of {self.__class__.__name__} has not " - f"been fitted yet; please call `fit` first." - ) - if isinstance(X, np.ndarray) and len(X.shape) == 3 and X.shape[1] == 1: - X = np.reshape(X, (X.shape[0], -1)) - elif not isinstance(X, np.ndarray) or len(X.shape) > 2: - raise ValueError( - "RotationForest is not a time series classifier. " - "A 2d numpy array is required." - ) - - # replace missing values with 0 and remove useless attributes - X = np.nan_to_num(X, False, 0, 0, 0) - X = X[:, self._useful_atts] - - # normalise the data. 
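# [Editor's note] fit() dropped constant attributes (so ptp > 0 on training
# data) and stored self._min / self._ptp; the line below reuses those
# training statistics on the test data. A hedged standalone sketch of the
# two steps (names illustrative, not the sktime implementation):
import numpy as np

def fit_scaling(X_train):
    x_min = X_train.min(axis=0)
    x_ptp = X_train.max(axis=0) - x_min  # > 0 once constant columns are dropped
    return x_min, x_ptp

def apply_scaling(X_test, x_min, x_ptp):
    return (X_test - x_min) / x_ptp      # reuse *training* min/ptp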
- X = (X - self._min) / self._ptp - - y_probas = Parallel(n_jobs=self._n_jobs)( - delayed(self._predict_proba_for_estimator)( - X, - self.estimators_[i], - self._pcas[i], - self._groups[i], - ) - for i in range(self._n_estimators) - ) - - output = np.sum(y_probas, axis=0) / ( - np.ones(self.n_classes) * self._n_estimators - ) - return output - - def _get_train_probs(self, X, y): - if not self._is_fitted: - raise NotFittedError( - f"This instance of {self.__class__.__name__} has not " - f"been fitted yet; please call `fit` first." - ) - if isinstance(X, np.ndarray) and len(X.shape) == 3 and X.shape[1] == 1: - X = np.reshape(X, (X.shape[0], -1)) - elif not isinstance(X, np.ndarray) or len(X.shape) > 2: - raise ValueError( - "RotationForest is not a time series classifier. " - "A 2d numpy array is required." - ) - - n_instances, n_atts = X.shape - - if n_instances != self.n_instances or n_atts != self.n_atts: - raise ValueError( - "n_instances, n_dims, series_length mismatch. X should be " - "the same as the training data used in fit for generating train " - "probabilities." - ) - - if not self.save_transformed_data: - raise ValueError("Currently only works with saved transform data from fit.") - - p = Parallel(n_jobs=self._n_jobs)( - delayed(self._train_probas_for_estimator)( - y, - i, - ) - for i in range(self._n_estimators) - ) - y_probas, oobs = zip(*p) - - results = np.sum(y_probas, axis=0) - divisors = np.zeros(n_instances) - for oob in oobs: - for inst in oob: - divisors[inst] += 1 - - for i in range(n_instances): - results[i] = ( - np.ones(self.n_classes) * (1 / self.n_classes) - if divisors[i] == 0 - else results[i] / (np.ones(self.n_classes) * divisors[i]) - ) - - return results - - def _fit_estimator(self, X, X_cls_split, y, idx): - rs = 255 if self.random_state == 0 else self.random_state - rs = ( - None - if self.random_state is None - else (rs * 37 * (idx + 1)) % np.iinfo(np.int32).max - ) - rng = check_random_state(rs) - - groups = self._generate_groups(rng) - pcas = [] - - # construct the slices to fit the PCAs too. - for group in groups: - classes = rng.choice( - range(self.n_classes), - size=rng.randint(1, self.n_classes + 1), - replace=False, - ) - - # randomly add the classes with the randomly selected attributes. - X_t = np.zeros((0, len(group))) - for cls_idx in classes: - c = X_cls_split[cls_idx] - X_t = np.concatenate((X_t, c[:, group]), axis=0) - - sample_ind = rng.choice( - X_t.shape[0], - int(X_t.shape[0] * self.remove_proportion), - replace=False, - ) - X_t = X_t[sample_ind] - - # try to fit the PCA if it fails, remake it, and add 10 random data instances. - while True: - # ignore err state on PCA because we account if it fails. - with np.errstate(divide="ignore", invalid="ignore"): - # differences between os occasionally. seems to happen when there - # are low amounts of cases in the fit - pca = PCA(random_state=rs).fit(X_t) - - if not np.isnan(pca.explained_variance_ratio_).all(): - break - X_t = np.concatenate( - (X_t, rng.random_sample((10, X_t.shape[1]))), axis=0 - ) - - pcas.append(pca) - - # merge all the pca_transformed data into one instance and build a classifier on it. 
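# [Editor's note] A minimal sketch of the rotation step performed below:
# project each attribute group through its fitted PCA and stack the results
# column-wise (demo names are hypothetical; sklearn's PCA assumed):
import numpy as np
from sklearn.decomposition import PCA

rng_demo = np.random.default_rng(0)
X_demo = rng_demo.normal(size=(20, 6))
groups_demo = [np.array([0, 1, 2]), np.array([3, 4, 5])]
pcas_demo = [PCA().fit(X_demo[:, g]) for g in groups_demo]
X_rot = np.concatenate(
    [p.transform(X_demo[:, g]) for p, g in zip(pcas_demo, groups_demo)], axis=1
)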
-        X_t = np.concatenate(
-            [pcas[i].transform(X[:, group]) for i, group in enumerate(groups)], axis=1
-        )
-        tree = _clone_estimator(self._base_estimator, random_state=rs)
-        tree.fit(X_t, y)
-
-        return tree, pcas, groups, X_t if self.save_transformed_data else None
-
-    def _predict_proba_for_estimator(self, X, clf, pcas, groups):
-        X_t = np.concatenate(
-            [pcas[i].transform(X[:, group]) for i, group in enumerate(groups)], axis=1
-        )
-        probas = clf.predict_proba(X_t)
-
-        # re-align the probabilities when the tree saw only a subset of classes.
-        if probas.shape[1] != self.n_classes:
-            new_probas = np.zeros((probas.shape[0], self.n_classes))
-            for i, cls in enumerate(clf.classes_):
-                cls_idx = self._class_dictionary[cls]
-                new_probas[:, cls_idx] = probas[:, i]
-            probas = new_probas
-
-        return probas
-
-    def _train_probas_for_estimator(self, y, idx):
-        rs = 255 if self.random_state == 0 else self.random_state
-        rs = (
-            None
-            if self.random_state is None
-            else (rs * 37 * (idx + 1)) % np.iinfo(np.int32).max
-        )
-        rng = check_random_state(rs)
-
-        # bootstrap with replacement; instances never drawn are out-of-bag.
-        indices = range(self.n_instances)
-        subsample = rng.choice(self.n_instances, size=self.n_instances)
-        oob = [n for n in indices if n not in subsample]
-
-        clf = _clone_estimator(self._base_estimator, rs)
-        clf.fit(self.transformed_data[idx][subsample], y[subsample])
-        probas = clf.predict_proba(self.transformed_data[idx][oob])
-
-        if probas.shape[1] != self.n_classes:
-            new_probas = np.zeros((probas.shape[0], self.n_classes))
-            for i, cls in enumerate(clf.classes_):
-                cls_idx = self._class_dictionary[cls]
-                new_probas[:, cls_idx] = probas[:, i]
-            probas = new_probas
-
-        results = np.zeros((self.n_instances, self.n_classes))
-        for n, proba in enumerate(probas):
-            results[oob[n]] += proba
-
-        return [results, oob]
-
-    def _generate_groups(self, rng):
-        permutation = rng.permutation(np.arange(0, self._n_atts))
-
-        # select the size of each group.
-        group_size_count = np.zeros(self.max_group - self.min_group + 1)
-        n_attributes = 0
-        n_groups = 0
-        while n_attributes < self._n_atts:
-            n = rng.randint(group_size_count.shape[0])
-            group_size_count[n] += 1
-            n_attributes += self.min_group + n
-            n_groups += 1
-
-        groups = []
-        current_attribute = 0
-        current_size = 0
-        for i in range(0, n_groups):
-            while group_size_count[current_size] == 0:
-                current_size += 1
-            group_size_count[current_size] -= 1
-
-            n = self.min_group + current_size
-            groups.append(np.zeros(n, dtype=np.int64))
-            for k in range(0, n):
-                if current_attribute < permutation.shape[0]:
-                    groups[i][k] = permutation[current_attribute]
-                else:
-                    groups[i][k] = permutation[rng.randint(permutation.shape[0])]
-                current_attribute += 1
-
-        return groups
-
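Note for readers of this patch: the removed rotationforest.py implemented a tabular rotation-forest classifier. Below is a minimal sketch of the rotation step it was built around; this is a simplification for reference, not the deleted code itself (fixed group size, no class or case subsampling, no PCA retry loop).

import numpy as np
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 12))
y = rng.integers(0, 2, size=100)

# Permute the attributes and cut them into random groups (fixed size 3 here;
# the deleted code drew group sizes between min_group and max_group).
groups = np.array_split(rng.permutation(X.shape[1]), 4)

# One PCA per group; the deleted code fit each PCA on a random subset of
# classes and cases, which is skipped here for brevity.
pcas = [PCA().fit(X[:, g]) for g in groups]

# Rotate: concatenate every group's PCA projection and fit one tree on it.
X_rot = np.concatenate([p.transform(X[:, g]) for p, g in zip(pcas, groups)], axis=1)
tree = DecisionTreeClassifier(criterion="entropy").fit(X_rot, y)
print(tree.score(X_rot, y))
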
diff --git a/convst/experimental/visualisations.py b/convst/experimental/visualisations.py
deleted file mode 100644
index 140f60f..0000000
--- a/convst/experimental/visualisations.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Nov 18 13:17:07 2021
-
-@author: a694772
-"""
-# Totally Random Shapelets vs Rocket as the two baselines
-# Compare performance vs number of shapelets
-# Threshold to be determined by progressively increasing it: look at how many
-# occurrences there are between two series, take the absolute difference and
-# then the argmax
-
-import seaborn as sns
-from matplotlib import pyplot as plt
-import numpy as np
-
-def class_vis(rdg, i_class, c):
-    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(23,5))
-    ix = np.zeros(rdg.coef_[0].shape[0])
-    ix[1::3] = 1
-    ix[2::3] = 2
-
-    sns.boxplot(x=ix, y=rdg.coef_[i_class], ax=ax[0])
-    ax[0].set_xticks([0,1,2])
-    ax[0].set_xticklabels(['min','argmin','#match'])
-    ax[0].set_ylabel('Ridge coef for class '+str(i_class))
-    ax[1].set_ylabel('Ridge coef for class '+str(i_class))
-    ax[1].set_xlabel('dilation')
-    coef_sums = (rdg.coef_[i_class][0::3] + rdg.coef_[i_class][1::3] + rdg.coef_[i_class][2::3])/3
-    sns.boxplot(x=c.dilation_, y=coef_sums, ax=ax[1])
-
-    #sns.boxplot(x=c.length_, y=coef_sums, ax=ax[2])
-
-def report(shp, ridge, X, y, xt=None, k=1, only_global=True):
-    coefs = ridge.coef_
-    if np.bincount(y).shape[0] == 2:
-        coefs = np.append(-coefs, coefs, axis=0)
-
-    for i_class in range(coefs.shape[0]):
-        c = coefs[i_class]
-        shp_coefs = c[0::3] + c[1::3] + c[2::3]
-        idx = c.argsort()
-        coefs_min = c[0::3]
-        coefs_argmin = c[1::3]
-        coefs_match = c[2::3]
-        fig, ax = plt.subplots(ncols=2, figsize=(15,5))
-        sns.distplot(coefs_min, label='min', ax=ax[0])
-        sns.distplot(coefs_argmin, label='argmin', ax=ax[0])
-        sns.distplot(coefs_match, label='match', ax=ax[0])
-        ax[0].legend()
-        ax[0].set_title('feature importance class {}'.format(i_class))
-        sns.distplot(shp_coefs, ax=ax[1])
-        plt.show()
-        if not only_global:
-            top_k = idx[-k:]//3
-            low_k = idx[:k]//3
-            for i_k in range(k):
-                shp.visualise_one_shapelet(top_k[i_k], X, y, title='Top {} shapelet of class {}'.format(k, i_class))
-                shp.visualise_one_shapelet(low_k[i_k], X, y, title='Worst {} shapelet of class {}'.format(k, i_class))
-                if xt is not None:
-                    fig, ax = plt.subplots(ncols=2, figsize=(15,5))
-                    for j_class in range(coefs.shape[0]):
-                        ax[0].scatter(xt[y==j_class, (top_k[i_k]*3)+0]*c[(top_k[i_k]*3)+0],
-                                      xt[y==j_class, (top_k[i_k]*3)+2]*c[(top_k[i_k]*3)+2],
-                                      c='C'+str(j_class),
-                                      s=45, alpha=0.85)
-                        ax[1].scatter(xt[y==j_class, (low_k[i_k]*3)+0]*c[(low_k[i_k]*3)+0],
-                                      xt[y==j_class, (low_k[i_k]*3)+2]*c[(low_k[i_k]*3)+2],
-                                      c='C'+str(j_class),
-                                      s=45, alpha=0.85, label=j_class)
-                    ax[1].legend()
-                    plt.show()
-
-#report(c, rf['ridgeclassifiercv'], X_test, y_test, rf['standardscaler'].transform(xt))
\ No newline at end of file
diff --git a/setup.py b/setup.py
index e498ff7..0a643a8 100644
--- a/setup.py
+++ b/setup.py
@@ -27,13 +27,12 @@
 install_requires=[
 "matplotlib >= 3.5",
 "numba >= 0.55",
-"pandas >= 1.3",
-"scikit_learn >= 1.0",
+"pandas >= 1.4",
+"scikit_learn >= 1.1",
 "joblib >= 1.1.0",
-"pyts >= 0.12",
-"scipy >= 1.7",
+"scipy >= 1.8",
 "seaborn >= 0.11",
-"sktime >= 0.11",
+"sktime >= 0.12",
 "numpy < 1.22, >=1.18",
 "networkx >= 2.6.3",
 "pytest >= 7.0",
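
A note on the deleted visualisations.py above: report() and class_vis() rely on the RDST feature layout, in which each shapelet contributes three interleaved features (min distance, argmin, and number of matches), so strided slices of the ridge coefficients recover the per-feature-type importances. A minimal sketch of that slicing, using a stand-in coefficient vector rather than a fitted model:

import numpy as np

n_shapelets = 4
coef = np.arange(3 * n_shapelets, dtype=float)  # stand-in for ridge.coef_[i_class]

coefs_min = coef[0::3]     # weight of each shapelet's min-distance feature
coefs_argmin = coef[1::3]  # weight of each shapelet's argmin feature
coefs_match = coef[2::3]   # weight of each shapelet's #match feature

# Mean importance per shapelet, as used for the dilation boxplot in class_vis.
per_shapelet = (coefs_min + coefs_argmin + coefs_match) / 3
print(per_shapelet)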