ShapRFECV speedup for bigger use-cases and some simple refactoring (#252)

This PR depends on #248 being accepted first.

______

This cleanup removes more unused code and simplifies parts of our
implementations. It should also give a modest performance boost for bigger
use-cases (see the sketch below).
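
As a rough illustration of the targeted scenario, here is a wide dataset run through `ShapRFECV`. This is a minimal sketch only: the synthetic dataset, model choice, and parameter values are placeholders, not part of this PR.

```python
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification

from probatus.feature_elimination import ShapRFECV

# A wide synthetic dataset stands in for the "bigger use-cases" this PR targets.
X, y = make_classification(n_samples=5_000, n_features=100, n_informative=20, random_state=42)

# step=0.2 drops 20% of the remaining features per elimination round.
shap_elimination = ShapRFECV(LGBMClassifier(n_estimators=50), step=0.2, cv=5, scoring="roc_auc", n_jobs=4)
report = shap_elimination.fit_compute(X, y)
print(report.head())
```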

Also fixes:

- [x] Comments on #242
- [x] #255
- [x] #245
Reinier Koops authored Apr 23, 2024
1 parent ab672d4 commit c1a6889
Showing 19 changed files with 179,606 additions and 5,608 deletions.
2 changes: 1 addition & 1 deletion LICENCE
@@ -1,4 +1,4 @@
-Copyright (c) 2020 ING Bank N.V.
+Copyright (c) ING Bank N.V.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

182,798 changes: 178,569 additions & 4,229 deletions docs/tutorials/nb_shap_feature_elimination.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion probatus/feature_elimination/__init__.py
@@ -1,3 +1,4 @@
-from .feature_elimination import ShapRFECV, EarlyStoppingShapRFECV
+from .feature_elimination import ShapRFECV
+from .early_stopping_feature_elimination import EarlyStoppingShapRFECV

__all__ = ["ShapRFECV", "EarlyStoppingShapRFECV"]
543 changes: 543 additions & 0 deletions probatus/feature_elimination/early_stopping_feature_elimination.py

Large diffs are not rendered by default.

1,398 changes: 416 additions & 982 deletions probatus/feature_elimination/feature_elimination.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion probatus/interpret/model_interpret.py
@@ -9,9 +9,9 @@
    BaseFitComputePlotClass,
    assure_list_of_strings,
    calculate_shap_importance,
-    get_single_scorer,
    preprocess_data,
    preprocess_labels,
+    get_single_scorer,
    shap_calc,
)

6 changes: 1 addition & 5 deletions probatus/sample_similarity/resemblance_model.py
@@ -8,7 +8,7 @@
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

-from probatus.utils import BaseFitComputePlotClass, get_single_scorer, preprocess_data, preprocess_labels
+from probatus.utils import BaseFitComputePlotClass, preprocess_data, preprocess_labels, get_single_scorer
from probatus.utils.shap_helpers import calculate_shap_importance, shap_calc


@@ -108,10 +108,6 @@ def fit(self, X1, X2, column_names=None, class_names=None):
        (BaseResemblanceModel):
            Fitted object
        """
-        # Set seed for results reproducibility
-        if self.random_state is not None:
-            np.random.seed(self.random_state)
-
        # Set class names
        self.class_names = class_names
        if self.class_names is None:
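
The deleted block above reseeded NumPy's *global* RNG on every `fit` call, which can silently affect unrelated code in the same process. A sketch of scoped alternatives (illustrative only, not the literal probatus internals):

```python
import numpy as np
from sklearn.model_selection import train_test_split

X, y = np.arange(20).reshape(10, 2), np.arange(10)

# Instead of np.random.seed(random_state), thread the seed through
# the components that accept it:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Or keep randomness local with a dedicated Generator:
rng = np.random.default_rng(42)
subsample = rng.choice(len(X), size=5, replace=False)
```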
18 changes: 8 additions & 10 deletions probatus/utils/__init__.py
@@ -1,29 +1,27 @@
-from .exceptions import NotFittedError, UnsupportedModelError
-from .scoring import Scorer, get_scorers, get_single_scorer
+from .exceptions import NotFittedError
from .arrayfuncs import (
    assure_pandas_df,
    assure_pandas_series,
    preprocess_data,
    preprocess_labels,
)
+from .scoring import Scorer, get_single_scorer
from .shap_helpers import shap_calc, shap_to_df, calculate_shap_importance
from ._utils import assure_list_of_strings
from .base_class_interface import BaseFitComputeClass, BaseFitComputePlotClass

__all__ = [
-    "NotFittedError",
-    "UnsupportedModelError",
-    "Scorer",
-    "assure_pandas_df",
-    "get_scorers",
    "assure_list_of_strings",
-    "shap_calc",
-    "shap_to_df",
-    "calculate_shap_importance",
+    "assure_pandas_df",
    "assure_pandas_series",
    "preprocess_data",
    "preprocess_labels",
    "BaseFitComputeClass",
    "BaseFitComputePlotClass",
+    "NotFittedError",
    "get_single_scorer",
+    "Scorer",
+    "shap_calc",
+    "shap_to_df",
+    "calculate_shap_importance",
]
65 changes: 27 additions & 38 deletions probatus/utils/arrayfuncs.py
@@ -15,21 +15,15 @@ def assure_pandas_df(x, column_names=None):
        pandas DataFrame
    """
    if isinstance(x, pd.DataFrame):
-        # Check if column_names are passed correctly
        if column_names is not None:
            x.columns = column_names
-        return x
-    elif any(
-        [
-            isinstance(x, np.ndarray),
-            isinstance(x, pd.core.series.Series),
-            isinstance(x, list),
-        ]
-    ):
-        return pd.DataFrame(x, columns=column_names)
+    elif isinstance(x, (np.ndarray, pd.Series, list)):
+        x = pd.DataFrame(x, columns=column_names)
    else:
        raise TypeError("Please supply a list, numpy array, pandas Series or pandas DataFrame")
+
+    return x


def assure_pandas_series(x, index=None):
    """
@@ -42,7 +36,7 @@ def assure_pandas_series(x, index=None):
        pandas Series
    """
    if isinstance(x, pd.Series):
-        if isinstance(index, list) or isinstance(index, np.ndarray):
+        if isinstance(index, (list, np.ndarray)):
            index = pd.Index(index)
        current_x_index = pd.Index(x.index.values)
        if current_x_index.equals(index):
@@ -55,7 +49,7 @@ def assure_pandas_series(x, index=None):
            # If indexes have different values, overwrite
            x.index = index
        return x
-    elif any([isinstance(x, np.ndarray), isinstance(x, list)]):
+    elif any([isinstance(x, (np.ndarray, list))]):
        return pd.Series(x, index=index)
    else:
        raise TypeError("Please supply a list, numpy array, pandas Series")
@@ -92,40 +86,36 @@ def preprocess_data(X, X_name=None, column_names=None, verbose=0):
        (pd.DataFrame):
            Preprocessed dataset.
    """
-    if X_name is None:
-        X_name = "X"
+    X_name = "X" if X_name is None else X_name

    # Make sure that X is a pd.DataFrame with correct column names
    X = assure_pandas_df(X, column_names=column_names)

-    # Warn if missing
-    columns_with_missing = [column for column in X.columns if X[column].isnull().values.any()]
-    if len(columns_with_missing) > 0:
-        if verbose > 0:
+    if verbose > 0:
+        # Warn if missing
+        columns_with_missing = X.columns[X.isnull().any()].tolist()
+        if columns_with_missing:
            warnings.warn(
                f"The following variables in {X_name} contains missing values {columns_with_missing}. "
                f"Make sure to impute missing or apply a model that handles them automatically."
            )

-    # Warn if categorical features and change to category
-    indices_categorical_features = [
-        column[0] for column in enumerate(X.dtypes) if column[1].name in ["category", "object"]
-    ]
-    categorical_features = list(X.columns[indices_categorical_features])
-
-    # Set categorical features type to category
-    if len(categorical_features) > 0:
-        if verbose > 0:
-            warnings.warn(
-                f"The following variables in {X_name} contains categorical variables: "
-                f"{categorical_features}. Make sure to use a model that handles them automatically or "
-                f"encode them into numerical variables."
-            )
+        # Warn if categorical features and change to category
+        categorical_features = X.select_dtypes(include=["category", "object"]).columns.tolist()
+        # Set categorical features type to category
+        if categorical_features:
+            if verbose > 0:
+                warnings.warn(
+                    f"The following variables in {X_name} contains categorical variables: "
+                    f"{categorical_features}. Make sure to use a model that handles them automatically or "
+                    f"encode them into numerical variables."
+                )
+
+    # Ensure category dtype, to enable models e.g. LighGBM, handle them automatically
+    object_columns = X.select_dtypes(include=["object"]).columns
+    if not object_columns.empty:
+        X[object_columns] = X[object_columns].astype("category")

-    # Ensure category dtype, to enable models e.g. LighGBM, handle them automatically
-    for categorical_feature in categorical_features:
-        if X[categorical_feature].dtype.name == "object":
-            X[categorical_feature] = X[categorical_feature].astype("category")
    return X, X.columns.tolist()


@@ -157,8 +147,7 @@ def preprocess_labels(y, y_name=None, index=None, verbose=0):
        (pd.Series):
            Labels in the form of pd.Series.
    """
-    if y_name is None:
-        y_name = "y"
+    y_name = "y" if y_name is None else y_name

    # Make sure that y is a series with correct index
    y = assure_pandas_series(y, index=index)
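
The bulk of the `arrayfuncs` speedup comes from replacing per-column Python loops with single vectorized pandas calls. A standalone sketch of the equivalence (variable names are illustrative):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100_000, 50)))
X.iloc[0, 3] = np.nan

# Old: one isnull() scan per column, driven from a Python list comprehension.
cols_missing_old = [column for column in X.columns if X[column].isnull().values.any()]

# New: a single vectorized scan over the whole frame.
cols_missing_new = X.columns[X.isnull().any()].tolist()
assert cols_missing_old == cols_missing_new

# Categorical detection similarly moves from enumerate(X.dtypes) to select_dtypes:
categorical_features = X.select_dtypes(include=["category", "object"]).columns.tolist()
assert categorical_features == []  # no object/category columns in this frame
```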
13 changes: 0 additions & 13 deletions probatus/utils/exceptions.py
@@ -8,16 +8,3 @@ def __init__(self, message):
        Init error.
        """
        self.message = message
-
-
-class UnsupportedModelError(Exception):
-    """
-    Error.
-    """
-
-    def __init__(self, message):
-        # TODO: Add this check for unsupported models to our implementations.
-        """
-        Init error.
-        """
-        self.message = message
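
`UnsupportedModelError` was never raised anywhere, so downstream code should only ever see `NotFittedError`. A sketch of the surviving exception (the message text here is made up):

```python
from probatus.utils import NotFittedError

try:
    raise NotFittedError("Run fit() before compute().")
except NotFittedError as error:
    print(error.message)
```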
30 changes: 3 additions & 27 deletions probatus/utils/scoring.py
@@ -1,33 +1,9 @@
from sklearn.metrics import get_scorer


-def get_scorers(scoring):
-    """
-    Returns Scorers list based on the provided scoring.
-
-    Args:
-        scoring (string, list of strings, probatus.utils.Scorer or list of probatus.utils.Scorers):
-            Metrics for which the score is calculated. It can be either a name or list of names metric names and
-            needs to be aligned with predefined classification scorers names in sklearn
-            ([link](https://scikit-learn.org/stable/modules/model_evaluation.html)).
-            Another option is using probatus.utils.Scorer to define a custom metric.
-
-    Returns:
-        (list of probatus.utils.Scorer):
-            List of scorers that can be used for scoring models
-    """
-    scorers = []
-    if isinstance(scoring, list):
-        for scorer in scoring:
-            scorers.append(get_single_scorer(scorer))
-    else:
-        scorers.append(get_single_scorer(scoring))
-    return scorers
-
-
def get_single_scorer(scoring):
    """
-    Returns single Scorer, based on provided input in scoring argument.
+    Returns Scorer, based on provided input in scoring argument.

    Args:
        scoring (string or probatus.utils.Scorer, optional):
@@ -67,7 +43,7 @@ class Scorer:
    # Make custom scorer with following function:
    def custom_metric(y_true, y_pred):
-    return (y_true == y_pred).sum()
+        return (y_true == y_pred).sum()

    scorer2 = Scorer('custom_metric', custom_scorer=make_scorer(custom_metric))

    # Prepare two samples
@@ -110,7 +86,7 @@ def score(self, model, X, y):
        """
        Scores the samples model based on the provided metric name.

-        Args:
+        Args
            model (model object):
                Model to be scored.
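
With `get_scorers` gone, `get_single_scorer` is the single entry point and lists of metrics are no longer expanded here. A sketch based on the docstring example shown above:

```python
from sklearn.metrics import make_scorer

from probatus.utils import Scorer, get_single_scorer

# Built-in sklearn metric, looked up by name:
scorer1 = get_single_scorer("roc_auc")

# Custom metric wrapped in a probatus Scorer, as in the docstring above:
def custom_metric(y_true, y_pred):
    return (y_true == y_pred).sum()

scorer2 = Scorer("custom_metric", custom_scorer=make_scorer(custom_metric))
```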