Merge pull request #15 from emergentmethods/improve-api
improve api, bump version
robcaulk authored Jun 6, 2023
2 parents dcaf145 + 943bfc3 commit ae8ef2a
Showing 14 changed files with 190 additions and 136 deletions.
112 changes: 59 additions & 53 deletions README.md
@@ -6,7 +6,7 @@ DataSieve is very similar to the SKlearn Pipeline in that it:
- transforms subsequent arrays of the same dimension according to the fit from the original X
- inverse transforms arrays by inverting the series of transformations

This means that it follows the SKLearn API very closely, and in fact most of the methods inherit directly from SKLearn methods.
This means that it follows the SKLearn API very closely, and in fact users can use SKLearn transforms directly without making any modifications.

The main **difference** is that DataSieve allows for the manipulation of the y and sample_weight arrays in addition to the X array. This is useful if you find yourself wishing to use the SKLearn pipeline for:

@@ -20,66 +20,21 @@ The main **difference** is that DataSieve allows for the manipulation of the y a

These improved flexibilities allow for more customized/creative transformations. For example, the included `DataSieveDBSCAN` has automated parameter fitting and outlier removal based on clustering.
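
For instance, here is a minimal sketch (with hypothetical arrays `X`, `y`, and `sample_weight`, and default clustering parameters) of letting `DataSieveDBSCAN` drop outliers from all three arrays at once:

```python
import numpy as np
from datasieve.pipeline import Pipeline
from datasieve.transforms import DataSieveDBSCAN

X = np.random.rand(500, 8)    # hypothetical feature matrix
y = np.random.rand(500)       # hypothetical labels
sample_weight = np.ones(500)  # hypothetical weights

outlier_pipeline = Pipeline([
    ("dbscan", DataSieveDBSCAN(n_jobs=-1)),
])

# Rows flagged as outliers by the fitted clustering are removed from
# X, y, and sample_weight together, keeping the three arrays aligned.
X, y, sample_weight = outlier_pipeline.fit_transform(X, y, sample_weight)
```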

An example would be someone who wants to use `SGDOneClassSVM` to detect and remove outliers from their data set before training:

```python
class SVMOutlierExtractor(SGDOneClassSVM):
    """
    A subclass of the SKLearn SGDOneClassSVM that adds a transform() method
    for removing detected outliers from X (as well as the associated y and
    sample_weight if they are also furnished.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        self.fit(X, y, sample_weight=sample_weight)
        return self.transform(X, y, sample_weight=sample_weight)

    def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        super().fit(X, y=y, sample_weight=sample_weight)
        return X, y, sample_weight, feature_list

    def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        y_pred = self.predict(X)

        X, y, sample_weight = remove_outliers(X, y, sample_weight, y_pred)

        num_tossed = len(y_pred) - len(X)
        if num_tossed > 0:
            logger.info(
                f"SVM detected {num_tossed} data points "
                "as outliers."
            )

        return X, y, sample_weight, feature_list

    def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        """
        Unused, pass through X, y, sample_weight, and feature_list
        """
        return X, y, sample_weight, feature_list
```


As shown here, the `fit()` method is actually identical to the SKLearn `fit()` method, but the `transform()` removes data points from X, y, and sample_weight for any outliers detected in the `X` array.


# Usage
The user builds the pipeline similarly to SKLearn:
## Usage
The user builds the pipeline similarly to SKLearn, and can even use SKLearn transforms directly with the `SKLearnWrapper`:

```python
from datasieve.pipeline import Pipeline
from datasieve.transforms import DataSieveMinMaxScaler, DataSievePCA, DataSieveVarianceThreshold, SVMOutlierExtractor
from datasieve.transforms import (SKLearnWrapper, DataSieveVarianceThreshold,
                                  SVMOutlierExtractor, DataSievePCA)
from sklearn.preprocessing import MinMaxScaler

feature_pipeline = Pipeline([
("detect_constants", DataSieveVarianceThreshold(threshold=0)),
("pre_svm_scaler", DataSieveMinMaxScaler(feature_range=(-1, 1))),
("pre_svm_scaler", SKlearnWrapper(MinMaxScaler(feature_range=(-1, 1)))),
("svm", SVMOutlierExtractor()),
("pre_pca_scaler", DataSieveMinMaxScaler(feature_range=(-1, 1))),
("pca", DataSievePCA(n_components=0.95),
("post_pca_scaler", DataSieveMinMaxScaler(feature_range=(-1, 1)))
("pca", DataSievePCA(n_components=0.95)),
("post_pca_scaler", SKlearnWrapper(MinMaxScaler(feature_range=(-1, 1))))
])

```
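
Fitting works the same way (a sketch, assuming `X`, `y`, and `sample_weight` are numpy arrays prepared by the user; the pipeline returns the transformed versions of all three):

```python
X, y, sample_weight = feature_pipeline.fit_transform(X, y, sample_weight)
```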
@@ -96,7 +51,6 @@ Next, the `feature_pipeline` can then be used to transform other datasets with t

```python
X2, _, _ = feature_pipeline.transform(X2)

```

Finally, similar to SKLearn's pipeline, the `feature_pipeline` can be used to inverse_transform an array `X3` that has the same dimensions as the `X` array returned by the pipeline:
@@ -105,6 +59,58 @@ Finally, similar to SKLearn's pipeline, the `feature_pipeline` can be used to in
Xinv, _, _ = feature_pipeline.inverse_transform(X3)
```


## Creating a custom transform

An example would be someone who wants to use `SGDOneClassSVM` to detect and remove outliers from their data set before training:

```python
import logging

import numpy as np
from sklearn.linear_model import SGDOneClassSVM

from datasieve.transforms.base_transform import BaseTransform
from datasieve.utils import remove_outliers

logger = logging.getLogger('datasieve.pipeline')


class SVMOutlierExtractor(BaseTransform):
    """
    A transform that wraps the SKLearn SGDOneClassSVM and adds a transform()
    method for removing detected outliers from X (as well as the associated y
    and sample_weight, if they are also furnished).
    """

    def __init__(self, **kwargs):
        self._skl = SGDOneClassSVM(**kwargs)

    def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        self.fit(X, y, sample_weight=sample_weight)
        return self.transform(X, y, sample_weight, feature_list)

    def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        self._skl.fit(X, y=y, sample_weight=sample_weight)
        return X, y, sample_weight, feature_list

    def transform(self, X, y=None, sample_weight=None, feature_list=None,
                  outlier_check=False, **kwargs):
        y_pred = self._skl.predict(X)
        y_pred = np.where(y_pred == -1, 0, y_pred)
        if not outlier_check:
            X, y, sample_weight = remove_outliers(X, y, sample_weight, y_pred)
            num_tossed = len(y_pred) - len(X)
            if num_tossed > 0:
                logger.info(
                    f"SVM detected {num_tossed} data points "
                    "as outliers."
                )
        else:
            y += y_pred
            y -= 1

        return X, y, sample_weight, feature_list

    def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        """
        Unused; passes X, y, sample_weight, and feature_list straight through.
        """
        return X, y, sample_weight, feature_list
```


As shown here, the `fit()` method simply delegates to the wrapped SKLearn `fit()`, while the `transform()` method removes data points from X, y, and sample_weight for any outliers detected in the `X` array.
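
Once defined, the custom transform drops into a pipeline like any built-in step (a sketch; `nu` is just an illustrative `SGDOneClassSVM` parameter, and `X`, `y`, and `sample_weight` are assumed to be numpy arrays):

```python
from datasieve.pipeline import Pipeline

pipeline = Pipeline([
    ("svm", SVMOutlierExtractor(nu=0.1)),
])

X, y, sample_weight = pipeline.fit_transform(X, y, sample_weight)
```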

## Data removal

The command `feature_pipeline.fit_transform(X, y, sample_weight)` fits each pipeline step to `X`, and transforms `X` according to each step's `transform()` method. In some cases, this will not affect `y` or `sample_weight`. For example, a wrapped `MinMaxScaler` simply scales `X` and saves the normalization information. Meanwhile, in the `SVMOutlierExtractor`, `.fit()` will fit an SVM to `X` and `.transform()` will remove any detected outliers from `X`. Typical `Scikit-Learn` pipelines do not remove those data points from `y` and `sample_weight`. Luckily, the DataSieve `Pipeline` takes care of the "associated removal" of the same outlier data points from `y` and `sample_weight`.
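
To illustrate with hypothetical numbers: if the SVM step flags 12 of 1000 rows as outliers, all three arrays shrink together:

```python
# Hypothetical shapes, for illustration only
print(X.shape[0], y.shape[0], sample_weight.shape[0])  # 1000 1000 1000
X, y, sample_weight = feature_pipeline.fit_transform(X, y, sample_weight)
print(X.shape[0], y.shape[0], sample_weight.shape[0])  # e.g. 988 988 988
```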
14 changes: 14 additions & 0 deletions datasieve/pipeline.py
@@ -23,6 +23,7 @@ def __init__(self, steps: List[Tuple] = [],
        self.pandas_types: bool = False
        self.feature_list: list = []
        self.label_list: list = []
        self.step_strings: list = []

    def _validate_fitparams(self, fitparams: Dict[str, dict], steps: List[Tuple]):
        for _, (name, _) in enumerate(steps):
Expand All @@ -38,6 +39,19 @@ def __getitem__(self, name: str):

logger.warning(f"Could not find step {name} in pipeline, returning None")
return None

    def append(self, step: Tuple[str, object], fitparams: dict = {}):
        """
        Append a step to the pipeline
        :param step: tuple of (str, transform())
        :param fitparams: dictionary of parameters to pass to fit
        """
        if step[0] in self.step_strings:
            raise ValueError(f"Step name {step[0]} already exists in pipeline. "
                             "Ensure each step has a unique name.")
        self.step_strings.append(step[0])
        self.steps += [step]
        self.fitparams[step[0]] = fitparams
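
    # Usage sketch (illustrative only, not part of this diff): steps can be
    # added one at a time, and a duplicate step name raises a ValueError:
    #
    #     pipeline = Pipeline()
    #     pipeline.append(("scaler", SKLearnWrapper(MinMaxScaler())))
    #     pipeline.append(("scaler", SKLearnWrapper(MinMaxScaler())))  # ValueError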

    def fit_transform(self, X, y=None, sample_weight=None) -> Tuple[npt.ArrayLike,
                                                                    npt.ArrayLike,
4 changes: 2 additions & 2 deletions datasieve/transforms/__init__.py
@@ -2,14 +2,14 @@
from datasieve.transforms.svm_outlier_extractor import SVMOutlierExtractor
from datasieve.transforms.pca import DataSievePCA
from datasieve.transforms.dbscan import DataSieveDBSCAN
from datasieve.transforms.minmax_scaler import DataSieveMinMaxScaler
from datasieve.transforms.variance_threshold import DataSieveVarianceThreshold
from datasieve.transforms.sklearn_wrapper import SKLearnWrapper

__all__ = (
"DissimilarityIndex",
"SVMOutlierExtractor",
"DataSievePCA",
"DataSieveDBSCAN",
"DataSieveMinMaxScaler",
"DataSieveVarianceThreshold",
"SKLearnWrapper",
)
22 changes: 22 additions & 0 deletions datasieve/transforms/base_transform.py
@@ -0,0 +1,22 @@
from abc import ABC


class BaseTransform(ABC):
"""
Base class for all transforms.
"""

def __init__(self, name: str):
self.name = name

def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
return X, y, sample_weight, feature_list

def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
return X, y, sample_weight, feature_list

def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
return X, y, sample_weight, feature_list

def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
return X, y, sample_weight, feature_list
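
# Sketch of a custom transform (illustrative only, not part of this diff):
# subclasses override only the methods they need, and must keep X, y, and
# sample_weight row-aligned whenever rows are dropped:
#
#     class DropNaNRows(BaseTransform):
#         def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
#             mask = ~np.isnan(X).any(axis=1)
#             if y is not None:
#                 y = y[mask]
#             if sample_weight is not None:
#                 sample_weight = sample_weight[mask]
#             return X[mask], y, sample_weight, feature_list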
32 changes: 17 additions & 15 deletions datasieve/transforms/dbscan.py
@@ -5,11 +5,12 @@
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from datasieve.utils import remove_outliers
from datasieve.transforms.base_transform import BaseTransform

logger = logging.getLogger('datasieve.pipeline')


class DataSieveDBSCAN(DBSCAN):
class DataSieveDBSCAN(BaseTransform):
"""
A subclass of the SKLearn DBSCAN that ensures fit, transform, fit_transform and
inverse_transform all take the full set of params X, y, sample_weight (even if they
Expand All @@ -22,7 +23,7 @@ class DataSieveDBSCAN(DBSCAN):
"""

    def __init__(self, backend="loky", n_jobs=-1, **kwargs) -> None:
        super().__init__(**kwargs)
        self._skl: DBSCAN = DBSCAN(**kwargs)
        self.train_features: npt.ArrayLike = np.array([])
        self.backend = backend
        self.n_jobs = n_jobs
@@ -38,13 +39,13 @@ def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwar
        # appends X to the self.train_features in order to determine
        # outliers, so we avoid that duplication by ensuring that
        # fit_transform simply uses the primary train_features only.
        inliers = np.where(self.labels_ == -1, 0, 1)
        inliers = np.where(self._skl.labels_ == -1, 0, 1)

        X, y, sample_weight = remove_outliers(X, y, sample_weight, inliers)

        logger.info(
            f"DBSCAN tossed {len(inliers) - X.shape[0]}"
            f" train points from {len(self.labels_)} in fit_transform()"
            f" train points from {len(self._skl.labels_)} in fit_transform()"
        )

        return X, y, sample_weight, feature_list
@@ -54,11 +55,11 @@ def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        Given a set of training features, find the best
        epsilon and min_samples
        """
        self.eps, self.min_samples = self.compute_epsilon_and_minpts(X)
        logger.info(f"Found eps {self.eps} and min_samples {self.min_samples} in fit")
        self._skl.eps, self._skl.min_samples = self.compute_epsilon_and_minpts(X)
        logger.info(f"Found eps {self._skl.eps} and min_samples {self._skl.min_samples} in fit")

        with parallel_backend(self.backend, n_jobs=self.n_jobs):
            super().fit(X)
            self._skl.fit(X)

        self.train_features = X

@@ -75,28 +76,29 @@ def transform(self, X, y=None, sample_weight=None, feature_list=None,
        fullX = np.concatenate([self.train_features, X], axis=0)

        with parallel_backend(self.backend, n_jobs=self.n_jobs):
            logger.info(f"Using eps {self.eps} and min_samples {self.min_samples} to transform")
            clustering = super().fit(fullX)
            logger.info(f"Using eps {self._skl.eps} and min_samples "
                        f"{self._skl.min_samples} to transform")
            clustering = self._skl.fit(fullX)

        inliers = np.where(clustering.labels_[-num_X:] == -1, 0, 1)

        if not outlier_check:
            X, y, sample_weight = remove_outliers(X, y, sample_weight, inliers=inliers)
            logger.info(
                f"DBSCAN tossed {len(inliers) - X.shape[0]}"
                f" train points from {len(self.labels_)} in transform()"
                f" train points from {len(self._skl.labels_)} in transform()"
            )
        else:
            y += inliers
            y -= 1

        return X, y, sample_weight, feature_list

    def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        """
        Unused
        """
        return X, y, sample_weight, feature_list
    # def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
    #     """
    #     Unused
    #     """
    #     return X, y, sample_weight, feature_list

    def compute_epsilon_and_minpts(self, X):
        """
3 changes: 2 additions & 1 deletion datasieve/transforms/dissimilarity_index.py
@@ -1,5 +1,6 @@
import logging
from sklearn.metrics.pairwise import pairwise_distances
from datasieve.transforms.base_transform import BaseTransform
import numpy as np
import numpy.typing as npt
from joblib import parallel_backend
@@ -8,7 +9,7 @@
logger = logging.getLogger('datasieve.pipeline')


class DissimilarityIndex:
class DissimilarityIndex(BaseTransform):
"""
Object designed for computing the dissimilarity index for a set of training data and
prediction points. fit() computes the avg_mean distance for the training data and
Expand Down
32 changes: 0 additions & 32 deletions datasieve/transforms/minmax_scaler.py

This file was deleted.

(Diffs for the remaining 7 changed files are not shown.)
