Merge pull request #15 from emergentmethods/improve-api
improve api, bump version
robcaulk authored Jun 6, 2023
2 parents dcaf145 + 943bfc3 commit ae8ef2a
Showing 14 changed files with 190 additions and 136 deletions.
112 changes: 59 additions & 53 deletions README.md
@@ -6,7 +6,7 @@ DataSieve is very similar to the SKlearn Pipeline in that it:
- transforms subsequent arrays of the same dimension according to the fit from the original X
- inverse transforms arrays by inverting the series of transformations

This means that it follows the SKLearn API very closely, and in fact most of the methods inherit directly from SKLearn methods.
This means that it follows the SKLearn API very closely, and in fact users can use SKLearn transforms directly without making any modifications.

The main **difference** is that DataSieve allows for the manipulation of the y and sample_weight arrays in addition to the X array. This is useful if you find yourself wishing to use the SKLearn pipeline for:

@@ -20,66 +20,21 @@ The main **difference** is that DataSieve allows for the manipulation of the y a

These improved flexibilities allow for more customized/creative transformations. For example, the included `DataSieveDBSCAN` has automated parameter fitting and outlier removal based on clustering.
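
For instance, here is a minimal sketch (with hypothetical arrays `X`, `y`, and `sample_weight`, and default clustering parameters) of letting `DataSieveDBSCAN` drop outliers from all three arrays at once:

```python
import numpy as np
from datasieve.pipeline import Pipeline
from datasieve.transforms import DataSieveDBSCAN

X = np.random.rand(500, 8)    # hypothetical feature matrix
y = np.random.rand(500)       # hypothetical labels
sample_weight = np.ones(500)  # hypothetical weights

outlier_pipeline = Pipeline([
    ("dbscan", DataSieveDBSCAN(n_jobs=-1)),
])

# Rows flagged as outliers by the fitted clustering are removed from
# X, y, and sample_weight together, keeping the three arrays aligned.
X, y, sample_weight = outlier_pipeline.fit_transform(X, y, sample_weight)
```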

An example would be someone who wants to use `SGDOneClassSVM` to detect and remove outliers from their data set before training:

```python
class SVMOutlierExtractor(SGDOneClassSVM):
    """
    A subclass of the SKLearn SGDOneClassSVM that adds a transform() method
    for removing detected outliers from X (as well as the associated y and
    sample_weight if they are also furnished.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        self.fit(X, y, sample_weight=sample_weight)
        return self.transform(X, y, sample_weight=sample_weight)

    def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        super().fit(X, y=y, sample_weight=sample_weight)
        return X, y, sample_weight, feature_list

    def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        y_pred = self.predict(X)

        X, y, sample_weight = remove_outliers(X, y, sample_weight, y_pred)

        num_tossed = len(y_pred) - len(X)
        if num_tossed > 0:
            logger.info(
                f"SVM detected {num_tossed} data points "
                "as outliers."
            )

        return X, y, sample_weight, feature_list

    def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        """
        Unused, pass through X, y, sample_weight, and feature_list
        """
        return X, y, sample_weight, feature_list
```


As shown here, the `fit()` method is actually identical to the SKLearn `fit()` method, but the `transform()` removes data points from X, y, and sample_weight for any outliers detected in the `X` array.


# Usage
The user builds the pipeline similarly to SKLearn:
## Usage
The user builds the pipeline similarly to SKLearn, and can even use SKLearn transforms directly with the `SKLearnWrapper`:

```python
from datasieve.pipeline import Pipeline
from datasieve.transforms import DataSieveMinMaxScaler, DataSievePCA, DataSieveVarianceThreshold, SVMOutlierExtractor
from datasieve.transforms import (SKLearnWrapper, DataSieveVarianceThreshold,
                                  SVMOutlierExtractor, DataSievePCA)
from sklearn.preprocessing import MinMaxScaler

feature_pipeline = Pipeline([
("detect_constants", DataSieveVarianceThreshold(threshold=0)),
("pre_svm_scaler", DataSieveMinMaxScaler(feature_range=(-1, 1))),
("pre_svm_scaler", SKlearnWrapper(MinMaxScaler(feature_range=(-1, 1)))),
("svm", SVMOutlierExtractor()),
("pre_pca_scaler", DataSieveMinMaxScaler(feature_range=(-1, 1))),
("pca", DataSievePCA(n_components=0.95),
("post_pca_scaler", DataSieveMinMaxScaler(feature_range=(-1, 1)))
("pca", DataSievePCA(n_components=0.95)),
("post_pca_scaler", SKlearnWrapper(MinMaxScaler(feature_range=(-1, 1))))
])

```
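
Fitting works the same way (a sketch, assuming `X`, `y`, and `sample_weight` are numpy arrays prepared by the user; the pipeline returns the transformed versions of all three):

```python
X, y, sample_weight = feature_pipeline.fit_transform(X, y, sample_weight)
```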
@@ -96,7 +51,6 @@ Next, the `feature_pipeline` can then be used to transform other datasets with t

```python
X2, _, _ = feature_pipeline.transform(X2)

```

Finally, similar to SKLearn's pipeline, the `feature_pipeline` can be used to inverse_transform an array `X3` that has the same dimensions as the `X` array returned by the pipeline:
@@ -105,6 +59,58 @@ Finally, similar to SKLearn's pipeline, the `feature_pipeline` can be used to in
Xinv, _, _ = feature_pipeline.inverse_transform(X3)
```


## Creating a custom transform

An example would be someone who wants to use `SGDOneClassSVM` to detect and remove outliers from their data set before training:

```python
import logging

import numpy as np
from sklearn.linear_model import SGDOneClassSVM

from datasieve.transforms.base_transform import BaseTransform
from datasieve.utils import remove_outliers

logger = logging.getLogger('datasieve.pipeline')


class SVMOutlierExtractor(BaseTransform):
    """
    A transform that wraps the SKLearn SGDOneClassSVM and adds a transform()
    method for removing detected outliers from X (as well as the associated y
    and sample_weight, if they are also furnished).
    """

    def __init__(self, **kwargs):
        self._skl = SGDOneClassSVM(**kwargs)

    def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        self.fit(X, y, sample_weight=sample_weight)
        return self.transform(X, y, sample_weight, feature_list)

    def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        self._skl.fit(X, y=y, sample_weight=sample_weight)
        return X, y, sample_weight, feature_list

    def transform(self, X, y=None, sample_weight=None, feature_list=None,
                  outlier_check=False, **kwargs):
        y_pred = self._skl.predict(X)
        y_pred = np.where(y_pred == -1, 0, y_pred)
        if not outlier_check:
            X, y, sample_weight = remove_outliers(X, y, sample_weight, y_pred)
            num_tossed = len(y_pred) - len(X)
            if num_tossed > 0:
                logger.info(
                    f"SVM detected {num_tossed} data points "
                    "as outliers."
                )
        else:
            y += y_pred
            y -= 1

        return X, y, sample_weight, feature_list

    def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        """
        Unused; passes X, y, sample_weight, and feature_list straight through.
        """
        return X, y, sample_weight, feature_list
```


As shown here, the `fit()` method simply delegates to the wrapped SKLearn `fit()`, while the `transform()` method removes data points from X, y, and sample_weight for any outliers detected in the `X` array.
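
Once defined, the custom transform drops into a pipeline like any built-in step (a sketch; `nu` is just an illustrative `SGDOneClassSVM` parameter, and `X`, `y`, and `sample_weight` are assumed to be numpy arrays):

```python
from datasieve.pipeline import Pipeline

pipeline = Pipeline([
    ("svm", SVMOutlierExtractor(nu=0.1)),
])

X, y, sample_weight = pipeline.fit_transform(X, y, sample_weight)
```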

## Data removal

The command `feature_pipeline.fit_transform(X, y, sample_weight)` fits each pipeline step to `X`, and transforms `X` according to each step's `transform()` method. In some cases, this will not affect `y` or `sample_weight`. For example, a wrapped `MinMaxScaler` simply scales `X` and saves the normalization information. Meanwhile, in the `SVMOutlierExtractor`, `.fit()` will fit an SVM to `X` and `.transform()` will remove any detected outliers from `X`. Typical `Scikit-Learn` pipelines do not remove those data points from `y` and `sample_weight`. Luckily, the DataSieve `Pipeline` takes care of the "associated removal" of the same outlier data points from `y` and `sample_weight`.
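
To illustrate with hypothetical numbers: if the SVM step flags 12 of 1000 rows as outliers, all three arrays shrink together:

```python
# Hypothetical shapes, for illustration only
print(X.shape[0], y.shape[0], sample_weight.shape[0])  # 1000 1000 1000
X, y, sample_weight = feature_pipeline.fit_transform(X, y, sample_weight)
print(X.shape[0], y.shape[0], sample_weight.shape[0])  # e.g. 988 988 988
```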
14 changes: 14 additions & 0 deletions datasieve/pipeline.py
@@ -23,6 +23,7 @@ def __init__(self, steps: List[Tuple] = [],
        self.pandas_types: bool = False
        self.feature_list: list = []
        self.label_list: list = []
        self.step_strings: list = []

    def _validate_fitparams(self, fitparams: Dict[str, dict], steps: List[Tuple]):
        for _, (name, _) in enumerate(steps):
Expand All @@ -38,6 +39,19 @@ def __getitem__(self, name: str):

logger.warning(f"Could not find step {name} in pipeline, returning None")
return None

    def append(self, step: Tuple[str, object], fitparams: dict = {}):
        """
        Append a step to the pipeline
        :param step: tuple of (str, transform())
        :param fitparams: dictionary of parameters to pass to fit
        """
        if step[0] in self.step_strings:
            raise ValueError(f"Step name {step[0]} already exists in pipeline. "
                             "Ensure each step has a unique name.")
        self.step_strings.append(step[0])
        self.steps += [step]
        self.fitparams[step[0]] = fitparams
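
    # Usage sketch (illustrative only, not part of this diff): steps can be
    # added one at a time, and a duplicate step name raises a ValueError:
    #
    #     pipeline = Pipeline()
    #     pipeline.append(("scaler", SKLearnWrapper(MinMaxScaler())))
    #     pipeline.append(("scaler", SKLearnWrapper(MinMaxScaler())))  # ValueError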

    def fit_transform(self, X, y=None, sample_weight=None) -> Tuple[npt.ArrayLike,
                                                                    npt.ArrayLike,
4 changes: 2 additions & 2 deletions datasieve/transforms/__init__.py
@@ -2,14 +2,14 @@
from datasieve.transforms.svm_outlier_extractor import SVMOutlierExtractor
from datasieve.transforms.pca import DataSievePCA
from datasieve.transforms.dbscan import DataSieveDBSCAN
from datasieve.transforms.minmax_scaler import DataSieveMinMaxScaler
from datasieve.transforms.variance_threshold import DataSieveVarianceThreshold
from datasieve.transforms.sklearn_wrapper import SKLearnWrapper

__all__ = (
"DissimilarityIndex",
"SVMOutlierExtractor",
"DataSievePCA",
"DataSieveDBSCAN",
"DataSieveMinMaxScaler",
"DataSieveVarianceThreshold",
"SKLearnWrapper",
)
22 changes: 22 additions & 0 deletions datasieve/transforms/base_transform.py
@@ -0,0 +1,22 @@
from abc import ABC


class BaseTransform(ABC):
"""
Base class for all transforms.
"""

def __init__(self, name: str):
self.name = name

def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
return X, y, sample_weight, feature_list

def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
return X, y, sample_weight, feature_list

def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
return X, y, sample_weight, feature_list

def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
return X, y, sample_weight, feature_list
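
# Sketch of a custom transform (illustrative only, not part of this diff):
# subclasses override only the methods they need, and must keep X, y, and
# sample_weight row-aligned whenever rows are dropped:
#
#     class DropNaNRows(BaseTransform):
#         def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
#             mask = ~np.isnan(X).any(axis=1)
#             if y is not None:
#                 y = y[mask]
#             if sample_weight is not None:
#                 sample_weight = sample_weight[mask]
#             return X[mask], y, sample_weight, feature_list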
32 changes: 17 additions & 15 deletions datasieve/transforms/dbscan.py
@@ -5,11 +5,12 @@
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from datasieve.utils import remove_outliers
from datasieve.transforms.base_transform import BaseTransform

logger = logging.getLogger('datasieve.pipeline')


class DataSieveDBSCAN(DBSCAN):
class DataSieveDBSCAN(BaseTransform):
"""
A subclass of the SKLearn DBSCAN that ensures fit, transform, fit_transform and
inverse_transform all take the full set of params X, y, sample_weight (even if they
Expand All @@ -22,7 +23,7 @@ class DataSieveDBSCAN(DBSCAN):
"""

    def __init__(self, backend="loky", n_jobs=-1, **kwargs) -> None:
        super().__init__(**kwargs)
        self._skl: DBSCAN = DBSCAN(**kwargs)
        self.train_features: npt.ArrayLike = np.array([])
        self.backend = backend
        self.n_jobs = n_jobs
@@ -38,13 +39,13 @@ def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwar
        # appends X to the self.train_features in order to determine
        # outliers, so we avoid that duplication by ensuring that
        # fit_transform simply uses the primary train_features only.
        inliers = np.where(self.labels_ == -1, 0, 1)
        inliers = np.where(self._skl.labels_ == -1, 0, 1)

        X, y, sample_weight = remove_outliers(X, y, sample_weight, inliers)

        logger.info(
            f"DBSCAN tossed {len(inliers) - X.shape[0]}"
            f" train points from {len(self.labels_)} in fit_transform()"
            f" train points from {len(self._skl.labels_)} in fit_transform()"
        )

        return X, y, sample_weight, feature_list
@@ -54,11 +55,11 @@ def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        Given a set of training features, find the best
        epsilon and min_samples
        """
        self.eps, self.min_samples = self.compute_epsilon_and_minpts(X)
        logger.info(f"Found eps {self.eps} and min_samples {self.min_samples} in fit")
        self._skl.eps, self._skl.min_samples = self.compute_epsilon_and_minpts(X)
        logger.info(f"Found eps {self._skl.eps} and min_samples {self._skl.min_samples} in fit")

        with parallel_backend(self.backend, n_jobs=self.n_jobs):
            super().fit(X)
            self._skl.fit(X)

        self.train_features = X

@@ -75,28 +76,29 @@ def transform(self, X, y=None, sample_weight=None, feature_list=None,
        fullX = np.concatenate([self.train_features, X], axis=0)

        with parallel_backend(self.backend, n_jobs=self.n_jobs):
            logger.info(f"Using eps {self.eps} and min_samples {self.min_samples} to transform")
            clustering = super().fit(fullX)
            logger.info(f"Using eps {self._skl.eps} and min_samples "
                        f"{self._skl.min_samples} to transform")
            clustering = self._skl.fit(fullX)

        inliers = np.where(clustering.labels_[-num_X:] == -1, 0, 1)

        if not outlier_check:
            X, y, sample_weight = remove_outliers(X, y, sample_weight, inliers=inliers)
            logger.info(
                f"DBSCAN tossed {len(inliers) - X.shape[0]}"
                f" train points from {len(self.labels_)} in transform()"
                f" train points from {len(self._skl.labels_)} in transform()"
            )
        else:
            y += inliers
            y -= 1

        return X, y, sample_weight, feature_list

    def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        """
        Unused
        """
        return X, y, sample_weight, feature_list
    # def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
    #     """
    #     Unused
    #     """
    #     return X, y, sample_weight, feature_list

    def compute_epsilon_and_minpts(self, X):
        """
3 changes: 2 additions & 1 deletion datasieve/transforms/dissimilarity_index.py
@@ -1,5 +1,6 @@
import logging
from sklearn.metrics.pairwise import pairwise_distances
from datasieve.transforms.base_transform import BaseTransform
import numpy as np
import numpy.typing as npt
from joblib import parallel_backend
@@ -8,7 +9,7 @@
logger = logging.getLogger('datasieve.pipeline')


class DissimilarityIndex:
class DissimilarityIndex(BaseTransform):
"""
Object designed for computing the dissimilarity index for a set of training data and
prediction points. fit() computes the avg_mean distance for the training data and
Expand Down
32 changes: 0 additions & 32 deletions datasieve/transforms/minmax_scaler.py

This file was deleted.

(Diffs for the remaining 7 changed files are not shown.)
