Probatus 2.0.0: Bump version + add pre-commit hooks (#206)

ing-bank · Jun 8, 2023 · bd1698c · bd1698c
1 parent 5645d2e
commit bd1698c
Show file tree

Hide file tree

Showing 45 changed files with 480 additions and 456 deletions.
diff --git a/.github/workflows/cronjob_unit_tests.yml b/.github/workflows/cronjob_unit_tests.yml
@@ -24,7 +24,7 @@ jobs:
           - build: windows
             os: windows-latest
             SKIP_LIGHTGBM: False
-        python-version: [3.8, 3.9, "3.10"]
+        python-version: [3.8, 3.9, "3.10", "3.11"]
     steps:
     - uses: actions/checkout@master
 
@@ -46,10 +46,16 @@ jobs:
         pip3 install --upgrade setuptools pip
         pip3 install ".[all]"
 
-    - name: Run unit tests and linters
+    - name: Run linters
+      env:
+        SKIP_LIGHTGBM: ${{ matrix.SKIP_LIGHTGBM }}
+      run: |
+        pre-commit install
+        pre-commit run --all-files
+
+    - name: Run (unit) tests
       env:
         SKIP_LIGHTGBM: ${{ matrix.SKIP_LIGHTGBM }}
       run: |
         pytest --cov=probatus/binning --cov=probatus/metric_volatility --cov=probatus/missing_values --cov=probatus/sample_similarity --cov=probatus/stat_tests --cov=probatus/utils --cov=probatus/interpret/ --ignore==tests/interpret/test_inspector.py --cov-report=xml
         pyflakes probatus
-        mypy probatus --ignore-missing-imports
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -2,7 +2,6 @@ name: Development
 on:
   # Trigger the workflow on push or pull request,
   # but only for the main branch
-
   push:
     branches:
       - main
@@ -24,7 +23,7 @@ jobs:
           - build: windows
             os: windows-latest
             SKIP_LIGHTGBM: False
-        python-version: [3.8, 3.9, "3.10"]
+        python-version: [3.8, 3.9, "3.10", "3.11"]
     steps:
     - uses: actions/checkout@master
 
@@ -46,13 +45,19 @@ jobs:
         pip3 install --upgrade setuptools pip
         pip3 install ".[all]"
 
-    - name: Run unit tests and linters
+    - name: Run linters
+      env:
+        SKIP_LIGHTGBM: ${{ matrix.SKIP_LIGHTGBM }}
+      run: |
+        pre-commit install
+        pre-commit run --all-files
+
+    - name: Run (unit) tests
       env:
         SKIP_LIGHTGBM: ${{ matrix.SKIP_LIGHTGBM }}
       run: |
         pytest --cov=probatus/binning --cov=probatus/metric_volatility --cov=probatus/missing_values --cov=probatus/sample_similarity --cov=probatus/stat_tests --cov=probatus/utils --cov=probatus/interpret/ --ignore==tests/interpret/test_inspector.py --cov-report=xml
         pyflakes probatus
-        mypy probatus --ignore-missing-imports
 
     - name: Upload coverage to Codecov
       if: github.ref == 'refs/heads/main'

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,29 +1,32 @@
+files: ^probatus/|^tests/
 repos:
-  - repo: local
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.2.0
     hooks:
-    - id: black
-      name: black
-      entry: black
-      language: python
-      types: [python]
-      language_version: python3
-      args: [--line-length=120]
+    - id: check-case-conflict # Different OSes
+      name: 'Check case conflict: Naming of files is compatible with all OSes'
+    - id: check-docstring-first
+      name: 'Check docstring first: Ensures Docstring present and first'
+    - id: detect-private-key
+      name: 'Detect private key: Prevent commit of env related keys'
+    - id: trailing-whitespace
+      name: 'Trailing whitespace: Remove empty spaces'
   - repo: local
     hooks:
     - id: mypy
-      name: mypy
+      name: 'mypy: Static type checking'
       entry: mypy
       language: system
       types: [python]
       args: [--ignore-missing-imports, --namespace-packages, --show-error-codes, --pretty]
   - repo: local
     hooks:
     - id: flake8
-      name: flake8
+      name: 'flake8: Check for errors, styling issues and complexity'
       entry: flake8
       language: system
       types: [python]
-      args: [--max-line-length=120, --docstring-convention=google, "--ignore=D100,D104,D202,D212,D200,E203,E731,W293,D412,D417,W503,D411"]
+      args: [--max-line-length, &line_length "120", --docstring-convention=google, "--ignore=D100,D104,D202,D212,D200,E203,E731,W293,D412,D417,W503,D411"]
 # D100 requires all Python files (modules) to have a "public" docstring even if all functions within have a docstring.
 # D104 requires __init__ files to have a docstring
 # D202 No blank lines allowed after function docstring
@@ -35,4 +38,36 @@ repos:
 # E203
 # E731 do not assign a lambda expression, use a def
 # W293 blank line contains whitespace
-# W503 line break before binary operator (for compatibility with black)
+# W503 line break before binary operator (for compatibility with black)
+  - repo: local
+    hooks:
+    - id: isort
+      name: 'isort: Sort file imports'
+      entry: isort
+      language: system
+      types: [python]
+      args: ["--skip", "__init__.py", "--filter-files", "--profile=black", "--line-length", *line_length]
+  - repo: local
+    hooks:
+    - id: codespell
+      name: 'codespell: Check for grammar'
+      entry: codespell
+      language: system
+      types: [python]
+      # Skip the word "mot"
+      args: [-L mot, --skip="**.egg-info*"]
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v3.4.0
+    hooks:
+    - id: pyupgrade
+      name: 'pyupgrade: Updates code to Python 3.8+ code convention'
+      args: ["--py37-plus"]
+  - repo: local
+    hooks:
+    - id: black
+      name: 'black: PEP8 compliant code formatter'
+      entry: black
+      language: python
+      types: [python]
+      language_version: python3
+      args: [--line-length, *line_length]
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [2.0.0] - 2023-06
+Improvements in this release:
+- Drop explicit support for python 3.7, add support for 3.11 #206, #203, #185
+- Activate and add pre-commit hooks (isort, codespell) #205, #206
+- Add support for groups in SHAP RFECV #182
+- Bug fix: SHAP RFECV now produces reproducible results every time (this breaks backwards compatibility) #197
+- Bug fix: Updated GitHub actions, fixed deprecations #199
+- Bug fix: Remove most of the unreliable warning assertion checks #207
+
 ## [1.8.9] - 2022-04-08
 Improvements in this release:
 - Drop explicit support for python 3.6, add 3.10 #177

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -1,14 +1,14 @@
 # Contributing guide
 
-`probatus` aims to provide a set of tools that can speed up common workflows around validating binary classifiers and the data used to train them.
+`Probatus` aims to provide a set of tools that can speed up common workflows around validating binary classifiers and the data used to train them.
 We're very much open to contributions but there are some things to keep in mind:
 
 - Discuss the feature and implementation you want to add on Github before you write a PR for it. On disagreements, maintainer(s) will have the final word.
-- Features need a somewhat general usecase. If the usecase is very niche it will be hard for us to consider maintaining it.
+- Features need a somewhat general use case. If the use case is very niche it will be hard for us to consider maintaining it.
 - If you’re going to add a feature, consider if you could help out in the maintenance of it.
 - When issues or pull requests are not going to be resolved or merged, they should be closed as soon as possible. This is kinder than deciding this after a long period. Our issue tracker should reflect work to be done.
 
-That said, there are many ways to contribute to probatus, including:
+That said, there are many ways to contribute to Probatus, including:
 
 - Contribution to code
 - Improving the documentation
@@ -38,9 +38,17 @@ We use [pre-commit](https://pre-commit.com/) hooks to ensure code styling. Insta
 pre-commit install
 ```
 
+Once the hooks are installed (which we encourage), run the following command before committing your work:
+
+```shell
+pre-commit run --all-files
+```
+
+This lets you quickly see whether your work still needs any adaptations before a pull request can be accepted.
+
 ## Standards
 
-- Python 3.6+
+- Python 3.8+
 - Follow [PEP8](http://pep8.org/) as closely as possible (except line length)
 - [google docstring format](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/)
 - Git: Include a short description of *what* and *why* was done, *how* can be seen in the code. Use present tense, imperative mood
@@ -51,14 +59,13 @@ pre-commit install
 
 * Model validation modules assume that trained models passed for validation are developed in a scikit-learn framework (i.e. have predict_proba and other standard functions), or follow a scikit-learn API e.g. XGBoost.
 * Every python file used for model validation needs to be in `/probatus/`
-* Class structure for a given module should have a base class and specific functionality classes that inherit from base. If a given module implements only a single way of computing the output, the base class is not required. 
-* Functions should not be as short as possible in terms of lines of code. If a lot of code is needed, try to put together snippets of code into 
-other functions. This make the code more readable, and easier to test.
+* Class structure for a given module should have a base class and specific functionality classes that inherit from base. If a given module implements only a single way of computing the output, the base class is not required.
+* Functions should not be as short as possible in terms of lines of code. If a lot of code is needed, try to put together snippets of code into other functions. This makes the code more readable, and easier to test.
 * Classes follow the probatus API structure:
     * Each class implements `fit()`, `compute()` and `fit_compute()` methods. `fit()` is used to fit an object with provided data (unless no fit is required), and `compute()` calculates the output e.g. DataFrame with a report for the user. Lastly, `fit_compute()` applies one after the other.
     * If applicable, the `plot()` method presents the user with the appropriate graphs.
     * For `compute()` and `plot()`, check if the object is fitted first.
-        
+
 
 ### Documentation
 

diff --git a/README.md b/README.md
@@ -6,13 +6,13 @@
 [![PyPI - Downloads](https://img.shields.io/pypi/dm/probatus)](#)
 ![GitHub contributors](https://img.shields.io/github/contributors/ing-bank/probatus)
 
-# probatus
+# Probatus
 
 ## Overview
 
 **Probatus** is a python package that helps validate binary classification models and the data used to develop them. Main features:
 
-- [probatus.interpret](https://ing-bank.github.io/probatus/api/model_interpret.html) provides shap-based model interpretation tools 
+- [probatus.interpret](https://ing-bank.github.io/probatus/api/model_interpret.html) provides shap-based model interpretation tools
 - [probatus.metric_volatility](https://ing-bank.github.io/probatus/api/metric_volatility.html) provides tools using bootstrapping and/or different random seeds to assess metric volatility/stability.
 - [probatus.sample_similarity](https://ing-bank.github.io/probatus/api/sample_similarity.html) to compare two datasets using resemblance modelling, f.e. `train` with out-of-time `test`.
 - [probatus.feature_elimination.ShapRFECV](https://ing-bank.github.io/probatus/api/feature_elimination.html) provides cross-validated Recursive Feature Elimination using shap feature importance.
@@ -28,11 +28,11 @@ pip install probatus
 
 Documentation at [ing-bank.github.io/probatus/](https://ing-bank.github.io/probatus/).
 
-You can also check out blog posts about Probatus: 
+You can also check out blog posts about Probatus:
 
 -  [Open-sourcing ShapRFECV — Improved feature selection powered by SHAP.](https://medium.com/ing-blog/open-sourcing-shaprfecv-improved-feature-selection-powered-by-shap-994fe7861560)
 -  [Model Explainability — How to choose the right tool?](https://medium.com/ing-blog/model-explainability-how-to-choose-the-right-tool-6c5eabd1a46a)
 
 ## Contributing
 
-To learn more about making a contribution to probatus, please see [`CONTRIBUTING.md`](CONTRIBUTING.md).
+To learn more about making a contribution to Probatus, please see [`CONTRIBUTING.md`](CONTRIBUTING.md).
diff --git a/VISION.md b/VISION.md
@@ -1,32 +1,32 @@
 # The Vision
 
-This page describes the main principles that drive the development of `probatus` as well as the general directions, in which the development of the package will be heading.
+This page describes the main principles that drive the development of `Probatus`, as well as the general direction in which the development of the package is heading.
 
 ## The Purpose
 
-`probatus` has started as a side project of Data Scientists at ING Bank. 
+`Probatus` started as a side project of Data Scientists at ING Bank.
 Later, we decided to open-source it, in order to share the tools and enable collaboration with the Data Science community.
 
-We mainly focus on analysing the following aspects of building classification models:
+We mainly focus on analyzing the following aspects of building classification models:
 - Model input: the quality of the dataset and how to prepare it for modelling,
-- Model performance: the quality of the model and stability of the results. 
+- Model performance: the quality of the model and stability of the results.
 - Model interpretation: understanding the model decision making,
 
 Our main goals are:
 - Continue maintaining the tools that we have built, and make sure that they are well documented and tested
 - Continuously extend functionality available in the package
-- Build a community of users, which use the package in day-to-day work and learn from each other, while contributing to probatus
+- Build a community of users who use the package in their day-to-day work and learn from each other, while contributing to Probatus
 
 ## The Principles
 
-The main principles that drive development of `probatus` are the following
+The main principles that drive development of `Probatus` are the following
 
 - Usefulness - any tool that we build should be useful for a broad range of users,
-- Simplicity - simple to understand and analyse steps over state-of-the-art,
+- Simplicity - simple to understand and analyze steps over state-of-the-art,
 - Usability - the developed functionality must have good documentation, a consistent API and work flawlessly with scikit-learn compatible models,
 - Reliability - the code that is available for the users should be well tested and reliable, and bugs should be fixed as soon as they are detected.
 
 ## The Roadmap
 
-The following [issue](https://github.com/ing-bank/probatus/issues/93) keeps track of the features coming to probatus.
-We are open to new ideas, so if you can think of a feature that fits the vision, make an [issue](https://github.com/ing-bank/probatus/issues) and help us further develop this package.
+The following [issue](https://github.com/ing-bank/Probatus/issues/93) keeps track of the features coming to Probatus.
+We are open to new ideas, so if you can think of a feature that fits the vision, make an [issue](https://github.com/ing-bank/Probatus/issues) and help us further develop this package.
diff --git a/docs/img/imputation_comparision.png → docs/img/imputation_comparison.png b/docs/img/imputation_comparision.png → docs/img/imputation_comparison.png
diff --git a/probatus/binning/binning.py b/probatus/binning/binning.py
@@ -18,23 +18,21 @@
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 
-import pandas as pd
+import warnings
+from abc import abstractmethod
+
 import numpy as np
+import pandas as pd
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.tree import DecisionTreeClassifier, _tree
 from sklearn.utils.validation import check_is_fitted
-from probatus.utils import (
-    assure_numpy_array,
-    ApproximationWarning,
-    BaseFitComputeClass,
-)
-import warnings
-from abc import abstractmethod
+
+from probatus.utils import ApproximationWarning, BaseFitComputeClass, assure_numpy_array
 
 
 class Bucketer(BaseFitComputeClass):
     """
-    Bucket (bin) some datea.
+    Bucket (bin) some data.
     """
 
     def __repr__(self):
@@ -286,7 +284,7 @@ def quantile_bins(x, bin_count, inf_edges=True):
         try:
             out, boundaries = pd.qcut(x, q=bin_count, retbins=True, duplicates="raise")
         except ValueError:
-            # If there are too many duplicate values (assume a lot of filled missings)
+            # If there are too many duplicate values (assume a lot of filled missing values)
             # this crashes - the exception drops them.
             # This means that it will return approximate quantile bins
             out, boundaries = pd.qcut(x, q=bin_count, retbins=True, duplicates="drop")
@@ -350,13 +348,13 @@ class TreeBucketer(Bucketer):
         If false, the edges will be set to the minimum and maximum value of the fitted
 
         tree (sklearn.tree.DecisionTreeClassifier): decision tree object defined by the user. By default is None, and
-        it will be constructed using tkhe provided **kwargs
+        it will be constructed using the provided **kwargs
 
         **tree_kwargs: kwargs related to the decision tree.
-            For and extensive list of parameteres, please check the sklearn Decision Tree Classifier documentation
+            For an extensive list of parameters, please check the sklearn Decision Tree Classifier documentation
             https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
 
-            The most relevant parameteres useful for the bucketing, are listed below:
+            The most relevant parameters for bucketing are listed below:
 
 
                 - criterion : {"gini", "entropy"}, default="gini"