From 18ecf1f9593dcad300b647ecfe919219c6ef2949 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Tue, 6 Jun 2023 20:58:28 +0200 Subject: [PATCH 1/2] improve api, bump version --- README.md | 112 +++++++++--------- datasieve/pipeline.py | 14 +++ datasieve/transforms/__init__.py | 4 +- datasieve/transforms/base_transform.py | 22 ++++ datasieve/transforms/dbscan.py | 31 ++--- datasieve/transforms/dissimilarity_index.py | 3 +- datasieve/transforms/minmax_scaler.py | 32 ----- datasieve/transforms/pca.py | 15 +-- datasieve/transforms/sklearn_wrapper.py | 33 ++++++ datasieve/transforms/svm_outlier_extractor.py | 9 +- datasieve/transforms/variance_threshold.py | 9 +- pyproject.toml | 4 +- tests/test_pipeline.py | 32 ++--- tests/test_transforms.py | 5 +- 14 files changed, 189 insertions(+), 136 deletions(-) create mode 100644 datasieve/transforms/base_transform.py delete mode 100644 datasieve/transforms/minmax_scaler.py create mode 100644 datasieve/transforms/sklearn_wrapper.py diff --git a/README.md b/README.md index 927df57..6485f4c 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ DataSieve is very similar to the SKlearn Pipeline in that it: - transforms subsequent arrays of the same dimension according to the fit from the original X - inverse transforms arrays by inverting the series of transformations -This means that it follows the SKLearn API very closely, and in fact most of the methods inherit directly from SKLearn methods. +This means that it follows the SKLearn API very closely, and in fact users can use SKLearn transforms directly without making any modifications. The main **difference** is that DataSieve allows for the manipulation of the y and sample_weight arrays in addition to the X array. This is useful if you find yourself wishing to use the SKLearn pipeline for: @@ -20,66 +20,21 @@ The main **difference** is that DataSieve allows for the manipulation of the y a These improved flexibilities allow for more customized/creative transformations. For example, the included `DataSieveDBSCAN` has automated parameter fitting and outlier removal based on clustering. -An example would be someone who wants to use `SGDOneClassSVM` to detect and remove outliers from their data set before training: - -```python -class SVMOutlierExtractor(SGDOneClassSVM): - """ - A subclass of the SKLearn SGDOneClassSVM that adds a transform() method - for removing detected outliers from X (as well as the associated y and - sample_weight if they are also furnished. - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): - self.fit(X, y, sample_weight=sample_weight) - return self.transform(X, y, sample_weight=sample_weight) - - def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): - super().fit(X, y=y, sample_weight=sample_weight) - return X, y, sample_weight, feature_list - - def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): - y_pred = self.predict(X) - - X, y, sample_weight = remove_outliers(X, y, sample_weight, y_pred) - - num_tossed = len(y_pred) - len(X) - if num_tossed > 0: - logger.info( - f"SVM detected {num_tossed} data points " - "as outliers." 
-            )
-
-        return X, y, sample_weight, feature_list
-
-    def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
-        """
-        Unused, pass through X, y, sample_weight, and feature_list
-        """
-        return X, y, sample_weight, feature_list
-```
-
-
-As shown here, the `fit()` method is actually identical to the SKLearn `fit()` method, but the `transform()` removes data points from X, y, and sample_weight for any outliers detected in the `X` array.
-
-# Usage
-The user builds the pipeline similarly to SKLearn:
+## Usage
+The user builds the pipeline similarly to SKLearn, and can even use SKLearn transforms directly with the `SKLearnWrapper`:
 
 ```python
 from datasieve.pipeline import Pipeline
-from datasieve.transforms import DataSieveMinMaxScaler, DataSievePCA, DataSieveVarianceThreshold, SVMOutlierExtractor
+from datasieve.transforms import DataSievePCA, DataSieveVarianceThreshold, SVMOutlierExtractor
+from datasieve.transforms import SKLearnWrapper
+from sklearn.preprocessing import MinMaxScaler
 
 feature_pipeline = Pipeline([
     ("detect_constants", DataSieveVarianceThreshold(threshold=0)),
-    ("pre_svm_scaler", DataSieveMinMaxScaler(feature_range=(-1, 1))),
+    ("pre_svm_scaler", SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))),
     ("svm", SVMOutlierExtractor()),
-    ("pre_pca_scaler", DataSieveMinMaxScaler(feature_range=(-1, 1))),
-    ("pca", DataSievePCA(n_components=0.95),
-    ("post_pca_scaler", DataSieveMinMaxScaler(feature_range=(-1, 1)))
+    ("pca", DataSievePCA(n_components=0.95)),
+    ("post_pca_scaler", SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))))
 ])
 ```
@@ -96,7 +51,6 @@ Next, the `feature_pipeline` can then be used to transform other datasets with t
 ```python
 X2, _, _ = feature_pipeline.transform(X2)
-
 ```
 
-Finally, similar to SKLearn's pipeline, the `feature_pipeline` can be used to inverse_transform an array `X3` array that has the same dimensions as the returned `X` array from the pipeline:
+Finally, similar to SKLearn's pipeline, the `feature_pipeline` can be used to inverse_transform an `X3` array that has the same dimensions as the returned `X` array from the pipeline:
@@ -105,6 +59,58 @@ Finally, similar to SKLearn's pipeline, the `feature_pipeline` can be used to in
-Xinv, _ ,_ = feature_pipeline.inverse_transform(X)
+Xinv, _, _ = feature_pipeline.inverse_transform(X3)
 ```
+
+## Creating a custom transform
+
+For example, suppose you want to use `SGDOneClassSVM` to detect and remove outliers from your data set before training:
+
+```python
+import logging
+
+import numpy as np
+from sklearn.linear_model import SGDOneClassSVM
+
+from datasieve.transforms.base_transform import BaseTransform
+from datasieve.utils import remove_outliers
+
+logger = logging.getLogger(__name__)
+
+
+class SVMOutlierExtractor(BaseTransform):
+    """
+    Wraps the SKLearn SGDOneClassSVM, adding a transform() method
+    for removing detected outliers from X (as well as the associated y and
+    sample_weight, if they are also furnished).
+    """
+
+    def __init__(self, **kwargs):
+        self._skl = SGDOneClassSVM(**kwargs)
+
+    def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
+        self.fit(X, y, sample_weight=sample_weight)
+        return self.transform(X, y, sample_weight, feature_list)
+
+    def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
+        self._skl.fit(X, y=y, sample_weight=sample_weight)
+        return X, y, sample_weight, feature_list
+
+    def transform(self, X, y=None, sample_weight=None, feature_list=None,
+                  outlier_check=False, **kwargs):
+        y_pred = self._skl.predict(X)
+        y_pred = np.where(y_pred == -1, 0, y_pred)
+        if not outlier_check:
+            X, y, sample_weight = remove_outliers(X, y, sample_weight, y_pred)
+            num_tossed = len(y_pred) - len(X)
+            if num_tossed > 0:
+                logger.info(
+                    f"SVM detected {num_tossed} data points "
+                    "as outliers."
+                )
+        else:
+            y += y_pred
+            y -= 1
+
+        return X, y, sample_weight, feature_list
+
+    def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
+        """
+        Unused, pass X, y, sample_weight, and feature_list through unchanged
+        """
+        return X, y, sample_weight, feature_list
+```
+
+As shown here, `fit()` simply delegates to the wrapped SKLearn `fit()`, while `transform()` removes data points from X, y, and sample_weight for any outliers detected in the `X` array.
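+
+Once defined, the custom transform can be added to a pipeline like any other step, either when the pipeline is constructed or afterwards with `append()`. A minimal sketch (assuming the class above and pre-loaded `X`, `y`, and `sample_weight` arrays; the step names are arbitrary labels):
+
+```python
+from datasieve.pipeline import Pipeline
+
+feature_pipeline = Pipeline([
+    ("svm", SVMOutlierExtractor()),
+])
+
+# step names must be unique, otherwise append() raises a ValueError
+feature_pipeline.append(("svm_strict", SVMOutlierExtractor(nu=0.01)))
+
+X, y, sample_weight = feature_pipeline.fit_transform(X, y, sample_weight)
+```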
+ ) + else: + y += y_pred + y -= 1 + + return X, y, sample_weight, feature_list + + def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): + """ + Unused + """ + return X, y, sample_weight, feature_list +``` + + +As shown here, the `fit()` method is actually identical to the SKLearn `fit()` method, but the `transform()` removes data points from X, y, and sample_weight for any outliers detected in the `X` array. + ## Data removal The command `feature_pipeline.fit_transform(X, y, sample_weight)` fits each pipeline step to `X`, and transforms `X` according to each step's `transform()` method. In some cases, this will not affect `y` or `sample_weight`. For example, `FlowdaptMinMaxScaler` simply scales `X` and saves the normalization information. Meanwhile, in the `SVMOutlierExtractor`, `.fit()` will fit an SVM to `X` and `.transform()` will remove any detected outliers from `X`. Typical `Scikit-Learn` pipelines do not remove those data points from `y` and `sample_weight`. Luckily, the `FlowdaptPipeline` takes care of the "associated removal" of the same outlier data points from `y` and `sample_weight`. diff --git a/datasieve/pipeline.py b/datasieve/pipeline.py index a76f0dc..b2683fa 100644 --- a/datasieve/pipeline.py +++ b/datasieve/pipeline.py @@ -23,6 +23,7 @@ def __init__(self, steps: List[Tuple] = [], self.pandas_types: bool = False self.feature_list: list = [] self.label_list: list = [] + self.step_strings: list = [] def _validate_fitparams(self, fitparams: Dict[str, dict], steps: List[Tuple]): for _, (name, _) in enumerate(steps): @@ -38,6 +39,19 @@ def __getitem__(self, name: str): logger.warning(f"Could not find step {name} in pipeline, returning None") return None + + def append(self, step: Tuple[str, object], fitparams: dict = {}): + """ + Append a step to the pipeline + :param step: tuple of (str, transform()) + :param fitparams: dictionary of parameters to pass to fit + """ + if step[0] in self.step_strings: + raise ValueError(f"Step name {step[0]} already exists in pipeline." + "Ensure each step has a unique name.") + self.step_strings.append(step[0]) + self.steps += [step] + self.fitparams[step[0]] = fitparams def fit_transform(self, X, y=None, sample_weight=None) -> Tuple[npt.ArrayLike, npt.ArrayLike, diff --git a/datasieve/transforms/__init__.py b/datasieve/transforms/__init__.py index 1bb1f42..1a19143 100644 --- a/datasieve/transforms/__init__.py +++ b/datasieve/transforms/__init__.py @@ -2,14 +2,14 @@ from datasieve.transforms.svm_outlier_extractor import SVMOutlierExtractor from datasieve.transforms.pca import DataSievePCA from datasieve.transforms.dbscan import DataSieveDBSCAN -from datasieve.transforms.minmax_scaler import DataSieveMinMaxScaler from datasieve.transforms.variance_threshold import DataSieveVarianceThreshold +from datasieve.transforms.sklearn_wrapper import SKLearnWrapper __all__ = ( "DissimilarityIndex", "SVMOutlierExtractor", "DataSievePCA", "DataSieveDBSCAN", - "DataSieveMinMaxScaler", "DataSieveVarianceThreshold", + "SKLearnWrapper", ) diff --git a/datasieve/transforms/base_transform.py b/datasieve/transforms/base_transform.py new file mode 100644 index 0000000..319ab2f --- /dev/null +++ b/datasieve/transforms/base_transform.py @@ -0,0 +1,22 @@ +from abc import ABC, abstractmethod + + +class BaseTransform(ABC): + """ + Base class for all transforms. 
+ """ + + def __init__(self, name: str): + self.name = name + + def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): + return X, y, sample_weight, feature_list + + def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): + return X, y, sample_weight, feature_list + + def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): + return X, y, sample_weight, feature_list + + def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): + return X, y, sample_weight, feature_list diff --git a/datasieve/transforms/dbscan.py b/datasieve/transforms/dbscan.py index d2f5e2e..1ef7353 100644 --- a/datasieve/transforms/dbscan.py +++ b/datasieve/transforms/dbscan.py @@ -5,11 +5,12 @@ from sklearn.cluster import DBSCAN from sklearn.neighbors import NearestNeighbors from datasieve.utils import remove_outliers +from datasieve.transforms.base_transform import BaseTransform logger = logging.getLogger('datasieve.pipeline') -class DataSieveDBSCAN(DBSCAN): +class DataSieveDBSCAN(BaseTransform): """ A subclass of the SKLearn DBSCAN that ensures fit, transform, fit_transform and inverse_transform all take the full set of params X, y, sample_weight (even if they @@ -22,7 +23,7 @@ class DataSieveDBSCAN(DBSCAN): """ def __init__(self, backend="loky", n_jobs=-1, **kwargs) -> None: - super().__init__(**kwargs) + self._skl: DBSCAN = DBSCAN(**kwargs) self.train_features: npt.ArrayLike = np.array([]) self.backend = backend self.n_jobs = n_jobs @@ -38,13 +39,13 @@ def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwar # appends X to the self.train_features in order to determine # outliers, so we avoid that duplication by ensuring that # fit_transform simply uses the primary train_features only. 
-        inliers = np.where(self.labels_ == -1, 0, 1)
+        inliers = np.where(self._skl.labels_ == -1, 0, 1)
 
         X, y, sample_weight = remove_outliers(X, y, sample_weight, inliers)
 
         logger.info(
             f"DBSCAN tossed {len(inliers) - X.shape[0]}"
-            f" train points from {len(self.labels_)} in fit_transform()"
+            f" train points from {len(self._skl.labels_)} in fit_transform()"
         )
 
         return X, y, sample_weight, feature_list
@@ -54,11 +55,11 @@ def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
         """
-        Given a set of training features, find the best epsilond
+        Given a set of training features, find the best epsilon
         and min_samples
         """
-        self.eps, self.min_samples = self.compute_epsilon_and_minpts(X)
-        logger.info(f"Found eps {self.eps} and min_samples {self.min_samples} in fit")
+        self._skl.eps, self._skl.min_samples = self.compute_epsilon_and_minpts(X)
+        logger.info(f"Found eps {self._skl.eps} and min_samples {self._skl.min_samples} in fit")
 
         with parallel_backend(self.backend, n_jobs=self.n_jobs):
-            super().fit(X)
+            self._skl.fit(X)
 
         self.train_features = X
 
@@ -75,8 +76,8 @@ def transform(self, X, y=None, sample_weight=None, feature_list=None,
         fullX = np.concatenate([self.train_features, X], axis=0)
 
         with parallel_backend(self.backend, n_jobs=self.n_jobs):
-            logger.info(f"Using eps {self.eps} and min_samples {self.min_samples} to transform")
-            clustering = super().fit(fullX)
+            logger.info(f"Using eps {self._skl.eps} and min_samples {self._skl.min_samples} to transform")
+            clustering = self._skl.fit(fullX)
 
         inliers = np.where(clustering.labels_[-num_X:] == -1, 0, 1)
 
@@ -84,7 +85,7 @@ def transform(self, X, y=None, sample_weight=None, feature_list=None,
             X, y, sample_weight = remove_outliers(X, y, sample_weight, inliers=inliers)
             logger.info(
                 f"DBSCAN tossed {len(inliers) - X.shape[0]}"
-                f" train points from {len(self.labels_)} in transform()"
+                f" train points from {len(self._skl.labels_)} in transform()"
             )
         else:
             y += inliers
@@ -92,11 +93,5 @@ def transform(self, X, y=None, sample_weight=None, feature_list=None,
 
         return X, y, sample_weight, feature_list
 
-    def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
-        """
-        Unused
-        """
-        return X, y, sample_weight, feature_list
-
     def compute_epsilon_and_minpts(self, X):
         """
diff --git a/datasieve/transforms/dissimilarity_index.py b/datasieve/transforms/dissimilarity_index.py
index d7730b3..a527b21 100644
--- a/datasieve/transforms/dissimilarity_index.py
+++ b/datasieve/transforms/dissimilarity_index.py
@@ -1,5 +1,6 @@
 import logging
 from sklearn.metrics.pairwise import pairwise_distances
+from datasieve.transforms.base_transform import BaseTransform
 import numpy as np
 import numpy.typing as npt
 from joblib import parallel_backend
@@ -8,7 +9,7 @@
 logger = logging.getLogger('datasieve.pipeline')
 
 
-class DissimilarityIndex:
+class DissimilarityIndex(BaseTransform):
     """
     Object designed for computing the dissimilarity index for a set of
     training data and prediction points.
fit() computes the avg_mean distance for the training data and diff --git a/datasieve/transforms/minmax_scaler.py b/datasieve/transforms/minmax_scaler.py deleted file mode 100644 index f20c20b..0000000 --- a/datasieve/transforms/minmax_scaler.py +++ /dev/null @@ -1,32 +0,0 @@ -from sklearn.preprocessing import MinMaxScaler -import logging - -logger = logging.getLogger('datasieve.pipeline') - - -class DataSieveMinMaxScaler(MinMaxScaler): - """ - A subclass of the SKLearn MinMaxScaler that ensures fit, transform, fit_transform and - inverse_transform all take the full set of params X, y, sample_weight (even if they - are unused) to follow the FlowdaptPipeline API. - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): - super().fit(X) - X = super().transform(X) - return X, y, sample_weight, feature_list - - def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): - super().fit(X) - return X, y, sample_weight, feature_list - - def transform(self, X, y=None, sample_weight=None, - feature_list=None, outlier_check=False, **kwargs): - X = super().transform(X) - return X, y, sample_weight, feature_list - - def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): - return super().inverse_transform(X), y, sample_weight, feature_list diff --git a/datasieve/transforms/pca.py b/datasieve/transforms/pca.py index 1b4e204..90dc1a4 100644 --- a/datasieve/transforms/pca.py +++ b/datasieve/transforms/pca.py @@ -1,11 +1,12 @@ from sklearn.decomposition import PCA +from datasieve.transforms.base_transform import BaseTransform import logging import numpy as np logger = logging.getLogger('datasieve.pipeline') -class DataSievePCA(PCA): +class DataSievePCA(BaseTransform): """ A subclass of the SKLearn PCA that ensures fit, transform, fit_transform and inverse_transform all take the full set of params X, y, sample_weight (even if they @@ -13,7 +14,7 @@ class DataSievePCA(PCA): """ def __init__(self, n_components=0.9999, **kwargs): - super().__init__(n_components=n_components, **kwargs) + self._skl = PCA(n_components=n_components, **kwargs) def fit_transform(self, X, y=None, sample_weight=None, feature_list=None): X, y, sample_weight, feature_list = self.fit(X, y, sample_weight, feature_list) @@ -21,18 +22,18 @@ def fit_transform(self, X, y=None, sample_weight=None, feature_list=None): def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): n_components = X.shape[1] - super().fit(X) + self._skl.fit(X) - n_keep_components = self.n_components_ + n_keep_components = self._skl.n_components_ self.feature_list = [f"PC{i}" for i in range(0, n_keep_components)] logger.info(f"reduced feature dimension by {n_components - n_keep_components}") - logger.info(f"explained variance {np.sum(self.explained_variance_ratio_)}") + logger.info(f"explained variance {np.sum(self._skl.explained_variance_ratio_)}") return X, y, sample_weight, self.feature_list def transform(self, X, y=None, sample_weight=None, outlier_check=False, feature_list=None, **kwargs): - X = super().transform(X) + X = self._skl.transform(X) return X, y, sample_weight, self.feature_list def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): - return super().inverse_transform(X), y, sample_weight, feature_list + return self._skl.inverse_transform(X), y, sample_weight, feature_list diff --git a/datasieve/transforms/sklearn_wrapper.py 
b/datasieve/transforms/sklearn_wrapper.py
new file mode 100644
index 0000000..d0e8ad5
--- /dev/null
+++ b/datasieve/transforms/sklearn_wrapper.py
@@ -0,0 +1,33 @@
+from datasieve.transforms.base_transform import BaseTransform
+from sklearn.base import BaseEstimator
+from joblib import parallel_backend
+
+class SKLearnWrapper(BaseTransform):
+    """
+    Wrapper that takes *most* SKLearn transforms and allows them to
+    work with the datasieve pipeline
+    """
+    def __init__(self, sklearninstance: BaseEstimator, n_jobs=-1, backend="loky", **kwargs):
+        self.backend = backend
+        self.n_jobs = n_jobs
+        self._skl = sklearninstance
+
+    def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
+        with parallel_backend(self.backend, n_jobs=self.n_jobs):
+            self._skl = self._skl.fit(X, y=y)
+        return X, y, sample_weight, feature_list
+
+    def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
+        with parallel_backend(self.backend, n_jobs=self.n_jobs):
+            X = self._skl.transform(X)
+        return X, y, sample_weight, feature_list
+
+    def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
+        self.fit(X, y=y, sample_weight=sample_weight, feature_list=feature_list)
+        X, y, sample_weight, feature_list = self.transform(X, y=y, sample_weight=sample_weight,
+                                                           feature_list=feature_list)
+        return X, y, sample_weight, feature_list
+
+    def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
+        X = self._skl.inverse_transform(X)
+        return X, y, sample_weight, feature_list
diff --git a/datasieve/transforms/svm_outlier_extractor.py b/datasieve/transforms/svm_outlier_extractor.py
index 92809aa..f965be6 100644
--- a/datasieve/transforms/svm_outlier_extractor.py
+++ b/datasieve/transforms/svm_outlier_extractor.py
@@ -1,4 +1,5 @@
 from sklearn.linear_model import SGDOneClassSVM
+from datasieve.transforms.base_transform import BaseTransform
 from datasieve.utils import remove_outliers
 import logging
 import numpy as np
@@ -6,7 +7,7 @@
 logger = logging.getLogger('datasieve.pipeline')
 
 
-class SVMOutlierExtractor(SGDOneClassSVM):
+class SVMOutlierExtractor(BaseTransform):
     """
-    A subclass of the SKLearn SGDOneClassSVM that adds a transform() method
+    A transform that wraps the SKLearn SGDOneClassSVM and adds a transform() method
     for removing detected outliers from X (as well as the associated y and
@@ -14,19 +15,19 @@
     """
 
     def __init__(self, **kwargs):
-        super().__init__(**kwargs)
+        self._skl = SGDOneClassSVM(**kwargs)
 
     def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
         self.fit(X, y, sample_weight=sample_weight)
         return self.transform(X, y, sample_weight, feature_list)
 
     def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
-        super().fit(X, y=y, sample_weight=sample_weight)
+        self._skl.fit(X, y=y, sample_weight=sample_weight)
         return X, y, sample_weight, feature_list
 
     def transform(self, X, y=None, sample_weight=None, feature_list=None,
                   outlier_check=False, **kwargs):
-        y_pred = self.predict(X)
+        y_pred = self._skl.predict(X)
         y_pred = np.where(y_pred == -1, 0, y_pred)
         if not outlier_check:
             X, y, sample_weight = remove_outliers(X, y, sample_weight, y_pred)
diff --git a/datasieve/transforms/variance_threshold.py b/datasieve/transforms/variance_threshold.py
index 3ee490b..59cfa2f 100644
--- a/datasieve/transforms/variance_threshold.py
+++ b/datasieve/transforms/variance_threshold.py
@@ -1,14 +1,15 @@
 import logging
 import numpy as np
 from sklearn.feature_selection import VarianceThreshold
+from datasieve.transforms.base_transform import
BaseTransform logger = logging.getLogger('datasieve.pipeline') -class DataSieveVarianceThreshold(VarianceThreshold): +class DataSieveVarianceThreshold(BaseTransform): def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) + self._skl: VarianceThreshold = VarianceThreshold(**kwargs) self.feature_list: list = [] self.mask = None @@ -17,8 +18,8 @@ def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwar return self.transform(X, y, sample_weight, feature_list) def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): - super().fit(X) - self.mask = self.get_support() + self._skl.fit(X) + self.mask = self._skl.get_support() if feature_list is not None: self.feature_list = np.array(feature_list)[self.mask] logger.info("Variance will remove features " diff --git a/pyproject.toml b/pyproject.toml index cf92847..1238073 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "datasieve" -version = "0.1.0" +version = "0.1.1" description = "This package implements a flexible data pipeline to help organize row removal (e.g. outlier removal) and feature modification (e.g. PCA)" authors = ['Robert Caulk'] readme = "README.md" @@ -12,7 +12,7 @@ scikit-learn = ">=1.1.3" pandas = ">=1.3.3" [tool.poetry.dev-dependencies] -pytest = "^6.2.1" +pytest = "^7.3.1" autopep8 = "1.6.0" flake8 = "^6.0.0" diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index cfd617e..7955b6c 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,7 +1,9 @@ from datasieve.pipeline import Pipeline -import datasieve.transforms as transforms +import datasieve.transforms as ts from conftest import extract_features_and_labels, set_weights_higher_recent import numpy as np +from sklearn.preprocessing import MinMaxScaler +from datasieve.transforms.sklearn_wrapper import SKLearnWrapper def test_pipeline_df_different_features_in_out(dummy_df_without_nans): @@ -10,12 +12,12 @@ def test_pipeline_df_different_features_in_out(dummy_df_without_nans): """ pipeline = Pipeline([ - ("detect_constants", transforms.DataSieveVarianceThreshold(threshold=0)), - ("pre_svm_scaler", transforms.DataSieveMinMaxScaler(feature_range=(-1, 1))), - ("svm", transforms.SVMOutlierExtractor()), - ("pre_pca_scaler", transforms.DataSieveMinMaxScaler(feature_range=(-1, 1))), - ("pca", transforms.DataSievePCA(n_components=0.95)), - ("post_pca_scaler", transforms.DataSieveMinMaxScaler(feature_range=(-1, 1))) + ("detect_constants", ts.DataSieveVarianceThreshold(threshold=0)), + ("pre_svm_scaler", SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))), + ("svm", ts.SVMOutlierExtractor()), + ("pre_pca_scaler", SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))), + ("pca", ts.DataSievePCA(n_components=0.95)), + ("post_pca_scaler", SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))) ]) df = dummy_df_without_nans.copy() @@ -32,8 +34,8 @@ def test_pipeline_df_different_features_in_out(dummy_df_without_nans): def test_pipeline_df_same_features_in_out(dummy_df_without_nans): pipeline = Pipeline([ - ("pre_svm_scaler", transforms.DataSieveMinMaxScaler()), - ("svm", transforms.SVMOutlierExtractor()) + ("pre_svm_scaler", SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))), + ("svm", ts.SVMOutlierExtractor()) ]) df = dummy_df_without_nans.copy() @@ -49,8 +51,8 @@ def test_pipeline_df_same_features_in_out(dummy_df_without_nans): def test_pipeline_array_in_out(dummy_array_without_nans): pipeline = Pipeline([ - ("pre_svm_scaler", transforms.DataSieveMinMaxScaler()), - ("svm", 
transforms.SVMOutlierExtractor())
+        ("pre_svm_scaler", SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))),
+        ("svm", ts.SVMOutlierExtractor())
     ])
 
     X = dummy_array_without_nans.copy()
@@ -69,8 +71,8 @@ def test_check_outliers(dummy_array_without_nans):
     """
 
     pipeline = Pipeline([
-        ("pre_svm_scaler", transforms.DataSieveMinMaxScaler(feature_range=(-1, 1))),
-        ("svm", transforms.SVMOutlierExtractor(nu=0.01, shuffle=True, random_state=42))
+        ("pre_svm_scaler", SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))),
+        ("svm", ts.SVMOutlierExtractor(nu=0.01, shuffle=True, random_state=42))
     ])
 
     X = dummy_array_without_nans.copy()
@@ -87,8 +89,8 @@ def test_getitem(dummy_array_without_nans, dummy_array2_without_nans):
 
     pipeline = Pipeline([
-        ("pre_svm_scaler", transforms.DataSieveMinMaxScaler(feature_range=(-1, 1))),
-        ("di", transforms.DissimilarityIndex(di_threshold=0.9))
+        ("pre_svm_scaler", SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))),
+        ("di", ts.DissimilarityIndex(di_threshold=0.9))
     ])
 
     X = dummy_array_without_nans.copy()
diff --git a/tests/test_transforms.py b/tests/test_transforms.py
index 63dbaa5..4837def 100644
--- a/tests/test_transforms.py
+++ b/tests/test_transforms.py
@@ -1,4 +1,7 @@
 import datasieve.transforms as transforms
+from datasieve.transforms.sklearn_wrapper import SKLearnWrapper
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.feature_selection import VarianceThreshold
 
 
 def test_min_max_scaler(dummy_array_without_nans):
@@ -6,7 +9,7 @@ def test_min_max_scaler(dummy_array_without_nans):
     Test the min max scaler
     """
     X = dummy_array_without_nans.copy()
-    scaler = transforms.DataSieveMinMaxScaler(feature_range=(-1, 1))
+    scaler = SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))
     X, _, _, _ = scaler.fit_transform(X)
     Y = dummy_array_without_nans.copy()
     Y, _, _, _ = scaler.transform(Y)

From 943bfc3faf604fe600aa68dabdb9ebd6400e99ef Mon Sep 17 00:00:00 2001
From: robcaulk
Date: Tue, 6 Jun 2023 21:01:10 +0200
Subject: [PATCH 2/2] fix flake8

---
 datasieve/transforms/base_transform.py | 2 +-
 datasieve/transforms/dbscan.py         | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/datasieve/transforms/base_transform.py b/datasieve/transforms/base_transform.py
index 319ab2f..63edc1e 100644
--- a/datasieve/transforms/base_transform.py
+++ b/datasieve/transforms/base_transform.py
@@ -1,4 +1,4 @@
-from abc import ABC, abstractmethod
+from abc import ABC
 
 
 class BaseTransform(ABC):
diff --git a/datasieve/transforms/dbscan.py b/datasieve/transforms/dbscan.py
index 1ef7353..56b32ed 100644
--- a/datasieve/transforms/dbscan.py
+++ b/datasieve/transforms/dbscan.py
@@ -76,7 +76,8 @@ def transform(self, X, y=None, sample_weight=None, feature_list=None,
         fullX = np.concatenate([self.train_features, X], axis=0)
 
         with parallel_backend(self.backend, n_jobs=self.n_jobs):
-            logger.info(f"Using eps {self._skl.eps} and min_samples {self._skl.min_samples} to transform")
+            logger.info(f"Using eps {self._skl.eps} and min_samples "
+                        f"{self._skl.min_samples} to transform")
             clustering = self._skl.fit(fullX)
 
         inliers = np.where(clustering.labels_[-num_X:] == -1, 0, 1)