ShapRFECV speedup for bigger use-cases and some simple refactoring (#252)

This PR depends on #248 being accepted first.

______

This cleanup removes more unused code and simplifies parts of our
implementations. It should also give a modest performance boost for bigger
use-cases (see the sketch below).
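
As a rough illustration of the targeted scenario, here is a wide dataset run through `ShapRFECV`. This is a minimal sketch only: the synthetic dataset, model choice, and parameter values are placeholders, not part of this PR.

```python
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification

from probatus.feature_elimination import ShapRFECV

# A wide synthetic dataset stands in for the "bigger use-cases" this PR targets.
X, y = make_classification(n_samples=5_000, n_features=100, n_informative=20, random_state=42)

# step=0.2 drops 20% of the remaining features per elimination round.
shap_elimination = ShapRFECV(LGBMClassifier(n_estimators=50), step=0.2, cv=5, scoring="roc_auc", n_jobs=4)
report = shap_elimination.fit_compute(X, y)
print(report.head())
```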

Also fixes:

- [x] Comments on #242
- [x] #255
- [x] #245
Reinier Koops authored Apr 23, 2024
1 parent ab672d4 commit c1a6889
Showing 19 changed files with 179,606 additions and 5,608 deletions.
2 changes: 1 addition & 1 deletion LICENCE
@@ -1,4 +1,4 @@
-Copyright (c) 2020 ING Bank N.V.
+Copyright (c) ING Bank N.V.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

182,798 changes: 178,569 additions & 4,229 deletions docs/tutorials/nb_shap_feature_elimination.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion probatus/feature_elimination/__init__.py
@@ -1,3 +1,4 @@
-from .feature_elimination import ShapRFECV, EarlyStoppingShapRFECV
+from .feature_elimination import ShapRFECV
+from .early_stopping_feature_elimination import EarlyStoppingShapRFECV

__all__ = ["ShapRFECV", "EarlyStoppingShapRFECV"]
543 changes: 543 additions & 0 deletions probatus/feature_elimination/early_stopping_feature_elimination.py

Large diffs are not rendered by default.

1,398 changes: 416 additions & 982 deletions probatus/feature_elimination/feature_elimination.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion probatus/interpret/model_interpret.py
@@ -9,9 +9,9 @@
    BaseFitComputePlotClass,
    assure_list_of_strings,
    calculate_shap_importance,
-    get_single_scorer,
    preprocess_data,
    preprocess_labels,
+    get_single_scorer,
    shap_calc,
)

6 changes: 1 addition & 5 deletions probatus/sample_similarity/resemblance_model.py
@@ -8,7 +8,7 @@
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

-from probatus.utils import BaseFitComputePlotClass, get_single_scorer, preprocess_data, preprocess_labels
+from probatus.utils import BaseFitComputePlotClass, preprocess_data, preprocess_labels, get_single_scorer
from probatus.utils.shap_helpers import calculate_shap_importance, shap_calc


@@ -108,10 +108,6 @@ def fit(self, X1, X2, column_names=None, class_names=None):
        (BaseResemblanceModel):
            Fitted object
        """
-        # Set seed for results reproducibility
-        if self.random_state is not None:
-            np.random.seed(self.random_state)
-
        # Set class names
        self.class_names = class_names
        if self.class_names is None:
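
The deleted block above reseeded NumPy's *global* RNG on every `fit` call, which can silently affect unrelated code in the same process. A sketch of scoped alternatives (illustrative only, not the literal probatus internals):

```python
import numpy as np
from sklearn.model_selection import train_test_split

X, y = np.arange(20).reshape(10, 2), np.arange(10)

# Instead of np.random.seed(random_state), thread the seed through
# the components that accept it:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Or keep randomness local with a dedicated Generator:
rng = np.random.default_rng(42)
subsample = rng.choice(len(X), size=5, replace=False)
```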
18 changes: 8 additions & 10 deletions probatus/utils/__init__.py
@@ -1,29 +1,27 @@
-from .exceptions import NotFittedError, UnsupportedModelError
-from .scoring import Scorer, get_scorers, get_single_scorer
+from .exceptions import NotFittedError
from .arrayfuncs import (
    assure_pandas_df,
    assure_pandas_series,
    preprocess_data,
    preprocess_labels,
)
+from .scoring import Scorer, get_single_scorer
from .shap_helpers import shap_calc, shap_to_df, calculate_shap_importance
from ._utils import assure_list_of_strings
from .base_class_interface import BaseFitComputeClass, BaseFitComputePlotClass

__all__ = [
-    "NotFittedError",
-    "UnsupportedModelError",
-    "Scorer",
-    "assure_pandas_df",
-    "get_scorers",
    "assure_list_of_strings",
-    "shap_calc",
-    "shap_to_df",
-    "calculate_shap_importance",
+    "assure_pandas_df",
    "assure_pandas_series",
    "preprocess_data",
    "preprocess_labels",
    "BaseFitComputeClass",
    "BaseFitComputePlotClass",
+    "NotFittedError",
    "get_single_scorer",
+    "Scorer",
+    "shap_calc",
+    "shap_to_df",
+    "calculate_shap_importance",
]
65 changes: 27 additions & 38 deletions probatus/utils/arrayfuncs.py
@@ -15,21 +15,15 @@ def assure_pandas_df(x, column_names=None):
        pandas DataFrame
    """
    if isinstance(x, pd.DataFrame):
-        # Check if column_names are passed correctly
        if column_names is not None:
            x.columns = column_names
-        return x
-    elif any(
-        [
-            isinstance(x, np.ndarray),
-            isinstance(x, pd.core.series.Series),
-            isinstance(x, list),
-        ]
-    ):
-        return pd.DataFrame(x, columns=column_names)
+    elif isinstance(x, (np.ndarray, pd.Series, list)):
+        x = pd.DataFrame(x, columns=column_names)
    else:
        raise TypeError("Please supply a list, numpy array, pandas Series or pandas DataFrame")
+
+    return x


def assure_pandas_series(x, index=None):
    """
@@ -42,7 +36,7 @@ def assure_pandas_series(x, index=None):
        pandas Series
    """
    if isinstance(x, pd.Series):
-        if isinstance(index, list) or isinstance(index, np.ndarray):
+        if isinstance(index, (list, np.ndarray)):
            index = pd.Index(index)
        current_x_index = pd.Index(x.index.values)
        if current_x_index.equals(index):
@@ -55,7 +49,7 @@ def assure_pandas_series(x, index=None):
            # If indexes have different values, overwrite
            x.index = index
        return x
-    elif any([isinstance(x, np.ndarray), isinstance(x, list)]):
+    elif any([isinstance(x, (np.ndarray, list))]):
        return pd.Series(x, index=index)
    else:
        raise TypeError("Please supply a list, numpy array, pandas Series")
@@ -92,40 +86,36 @@ def preprocess_data(X, X_name=None, column_names=None, verbose=0):
        (pd.DataFrame):
            Preprocessed dataset.
    """
-    if X_name is None:
-        X_name = "X"
+    X_name = "X" if X_name is None else X_name

    # Make sure that X is a pd.DataFrame with correct column names
    X = assure_pandas_df(X, column_names=column_names)

-    # Warn if missing
-    columns_with_missing = [column for column in X.columns if X[column].isnull().values.any()]
-    if len(columns_with_missing) > 0:
-        if verbose > 0:
+    if verbose > 0:
+        # Warn if missing
+        columns_with_missing = X.columns[X.isnull().any()].tolist()
+        if columns_with_missing:
            warnings.warn(
                f"The following variables in {X_name} contains missing values {columns_with_missing}. "
                f"Make sure to impute missing or apply a model that handles them automatically."
            )

-    # Warn if categorical features and change to category
-    indices_categorical_features = [
-        column[0] for column in enumerate(X.dtypes) if column[1].name in ["category", "object"]
-    ]
-    categorical_features = list(X.columns[indices_categorical_features])
-
-    # Set categorical features type to category
-    if len(categorical_features) > 0:
-        if verbose > 0:
-            warnings.warn(
-                f"The following variables in {X_name} contains categorical variables: "
-                f"{categorical_features}. Make sure to use a model that handles them automatically or "
-                f"encode them into numerical variables."
-            )
+        # Warn if categorical features and change to category
+        categorical_features = X.select_dtypes(include=["category", "object"]).columns.tolist()
+        # Set categorical features type to category
+        if categorical_features:
+            if verbose > 0:
+                warnings.warn(
+                    f"The following variables in {X_name} contains categorical variables: "
+                    f"{categorical_features}. Make sure to use a model that handles them automatically or "
+                    f"encode them into numerical variables."
+                )
+
+    # Ensure category dtype, to enable models e.g. LighGBM, handle them automatically
+    object_columns = X.select_dtypes(include=["object"]).columns
+    if not object_columns.empty:
+        X[object_columns] = X[object_columns].astype("category")

-    # Ensure category dtype, to enable models e.g. LighGBM, handle them automatically
-    for categorical_feature in categorical_features:
-        if X[categorical_feature].dtype.name == "object":
-            X[categorical_feature] = X[categorical_feature].astype("category")
    return X, X.columns.tolist()


@@ -157,8 +147,7 @@ def preprocess_labels(y, y_name=None, index=None, verbose=0):
        (pd.Series):
            Labels in the form of pd.Series.
    """
-    if y_name is None:
-        y_name = "y"
+    y_name = "y" if y_name is None else y_name

    # Make sure that y is a series with correct index
    y = assure_pandas_series(y, index=index)
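
The bulk of the `arrayfuncs` speedup comes from replacing per-column Python loops with single vectorized pandas calls. A standalone sketch of the equivalence (variable names are illustrative):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100_000, 50)))
X.iloc[0, 3] = np.nan

# Old: one isnull() scan per column, driven from a Python list comprehension.
cols_missing_old = [column for column in X.columns if X[column].isnull().values.any()]

# New: a single vectorized scan over the whole frame.
cols_missing_new = X.columns[X.isnull().any()].tolist()
assert cols_missing_old == cols_missing_new

# Categorical detection similarly moves from enumerate(X.dtypes) to select_dtypes:
categorical_features = X.select_dtypes(include=["category", "object"]).columns.tolist()
assert categorical_features == []  # no object/category columns in this frame
```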
13 changes: 0 additions & 13 deletions probatus/utils/exceptions.py
@@ -8,16 +8,3 @@ def __init__(self, message):
        Init error.
        """
        self.message = message
-
-
-class UnsupportedModelError(Exception):
-    """
-    Error.
-    """
-
-    def __init__(self, message):
-        # TODO: Add this check for unsupported models to our implementations.
-        """
-        Init error.
-        """
-        self.message = message
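
`UnsupportedModelError` was never raised anywhere, so downstream code should only ever see `NotFittedError`. A sketch of the surviving exception (the message text here is made up):

```python
from probatus.utils import NotFittedError

try:
    raise NotFittedError("Run fit() before compute().")
except NotFittedError as error:
    print(error.message)
```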
30 changes: 3 additions & 27 deletions probatus/utils/scoring.py
@@ -1,33 +1,9 @@
from sklearn.metrics import get_scorer


-def get_scorers(scoring):
-    """
-    Returns Scorers list based on the provided scoring.
-
-    Args:
-        scoring (string, list of strings, probatus.utils.Scorer or list of probatus.utils.Scorers):
-            Metrics for which the score is calculated. It can be either a name or list of names metric names and
-            needs to be aligned with predefined classification scorers names in sklearn
-            ([link](https://scikit-learn.org/stable/modules/model_evaluation.html)).
-            Another option is using probatus.utils.Scorer to define a custom metric.
-
-    Returns:
-        (list of probatus.utils.Scorer):
-            List of scorers that can be used for scoring models
-    """
-    scorers = []
-    if isinstance(scoring, list):
-        for scorer in scoring:
-            scorers.append(get_single_scorer(scorer))
-    else:
-        scorers.append(get_single_scorer(scoring))
-    return scorers
-
-
def get_single_scorer(scoring):
    """
-    Returns single Scorer, based on provided input in scoring argument.
+    Returns Scorer, based on provided input in scoring argument.

    Args:
        scoring (string or probatus.utils.Scorer, optional):
@@ -67,7 +43,7 @@ class Scorer:
    # Make custom scorer with following function:
    def custom_metric(y_true, y_pred):
-    return (y_true == y_pred).sum()
+        return (y_true == y_pred).sum()

    scorer2 = Scorer('custom_metric', custom_scorer=make_scorer(custom_metric))

    # Prepare two samples
@@ -110,7 +86,7 @@ def score(self, model, X, y):
        """
        Scores the samples model based on the provided metric name.

-        Args:
+        Args
            model (model object):
                Model to be scored.
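
With `get_scorers` gone, `get_single_scorer` is the single entry point and lists of metrics are no longer expanded here. A sketch based on the docstring example shown above:

```python
from sklearn.metrics import make_scorer

from probatus.utils import Scorer, get_single_scorer

# Built-in sklearn metric, looked up by name:
scorer1 = get_single_scorer("roc_auc")

# Custom metric wrapped in a probatus Scorer, as in the docstring above:
def custom_metric(y_true, y_pred):
    return (y_true == y_pred).sum()

scorer2 = Scorer("custom_metric", custom_scorer=make_scorer(custom_metric))
```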