diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..de2bc85 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include *.rst +recursive-include sklift/datasets/ *.rst +include MANIFEST.in \ No newline at end of file diff --git a/Readme.rst b/Readme.rst index e53623d..aab5992 100644 --- a/Readme.rst +++ b/Readme.rst @@ -9,7 +9,7 @@ .. _PyPi: https://badge.fury.io/py/scikit-uplift .. |Docs| image:: https://readthedocs.org/projects/scikit-uplift/badge/?version=latest -.. _Docs: https://scikit-uplift.readthedocs.io/en/latest/ +.. _Docs: https://www.uplift-modeling.com/en/latest/ .. |License| image:: https://img.shields.io/badge/license-MIT-green .. _License: https://github.com/maks-sh/scikit-uplift/blob/master/LICENSE @@ -26,7 +26,7 @@ .. |Open In Colab4| image:: https://colab.research.google.com/assets/colab-badge.svg .. _Open In Colab4: https://colab.research.google.com/github/maks-sh/scikit-uplift/blob/master/notebooks/pipeline_usage_RU.ipynb -.. _scikit-uplift.readthedocs.io: https://scikit-uplift.readthedocs.io/en/latest/ +.. _uplift-modeling.com: https://www.uplift-modeling.com/en/latest/ .. image:: https://raw.githubusercontent.com/maks-sh/scikit-uplift/dev/docs/_static/sklift-github-logo.png :align: center @@ -48,7 +48,7 @@ Uplift modeling estimates a causal effect of treatment and uses it to effectivel * Select a tiny group of customers in the campaign where a price per customer is high. -Read more about uplift modeling problem in `User Guide `__, +Read more about uplift modeling problem in `User Guide `__. Articles in russian on habr.com: `Part 1 `__ and `Part 2 `__. @@ -87,7 +87,7 @@ Or install from source: Documentation -------------- -The full documentation is available at `scikit-uplift.readthedocs.io`_. +The full documentation is available at `uplift-modeling.com`_. Or you can build the documentation locally using `Sphinx `_ 1.4 or later: @@ -106,6 +106,8 @@ See the **RetailHero tutorial notebook** (`EN `__. + .. 
code-block:: python # import approaches @@ -130,6 +132,8 @@ See the **RetailHero tutorial notebook** (`EN `__. + .. code-block:: python # import metrics to evaluate your model @@ -153,6 +157,8 @@ See the **RetailHero tutorial notebook** (`EN `__. + .. code-block:: python # import vizualisation tools @@ -170,55 +176,20 @@ Development We welcome new contributors of all experience levels. -- Please see our `Contributing Guide `_ for more details. +- Please see our `Contributing Guide `_ for more details. - By participating in this project, you agree to abide by its `Code of Conduct `__. If you have any questions, please contact us at team@uplift-modeling.com -Contributing -~~~~~~~~~~~~~~~ - -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/0 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/0 - :alt: Top contributor 1 - -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/1 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/1 - :alt: Top contributor 2 - -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/2 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/2 - :alt: Top contributor 3 - -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/3 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/3 - :alt: Top contributor 4 - -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/4 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/4 - :alt: Top contributor 5 - -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/5 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/5 - :alt: Top contributor 6 - -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/6 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/6 - :alt: Top contributor 7 - -.. 
image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/7 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/7 - :alt: Legend - Important links ~~~~~~~~~~~~~~~ - Official source code repo: https://github.com/maks-sh/scikit-uplift/ - Issue tracker: https://github.com/maks-sh/scikit-uplift/issues -- Documentation: https://scikit-uplift.readthedocs.io/en/latest/ -- User Guide: https://scikit-uplift.readthedocs.io/en/latest/user_guide/index.html -- Contributing guide: https://scikit-uplift.readthedocs.io/en/latest/contributing.html -- Release History: https://scikit-uplift.readthedocs.io/en/latest/changelog.html +- Documentation: https://www.uplift-modeling.com/en/latest/ +- User Guide: https://www.uplift-modeling.com/en/latest/user_guide/index.html +- Contributing guide: https://www.uplift-modeling.com/en/latest/contributing.html +- Release History: https://www.uplift-modeling.com/en/latest/changelog.html =============== diff --git a/docs/changelog.md b/docs/changelog.md index 4eee210..5d9a6a8 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -8,90 +8,104 @@ * 🔨 something that previously didn’t work as documentated – or according to reasonable expectations – should now work. * ❗️ you will need to change your code to have the same effect in the future; or a feature will be removed in the future. +## Version 0.3.1 + +### [sklift.datasets](https://www.uplift-modeling.com/en/v0.3.1/api/datasets/index.html) + +* 🔨 Fix bugs in [sklift.datasets](https://www.uplift-modeling.com/en/v0.3.1/api/datasets/index.html) + +### [sklift.metrics](https://www.uplift-modeling.com/en/v0.3.1/api/metrics/index.html) + +* 📝 Improve [uplift_by_percentile](https://www.uplift-modeling.com/en/v0.3.1/api/metrics/uplift_by_percentile.html) function by [@ElisovaIra](https://github.com/ElisovaIra). 
+ +### Miscellaneous + +* πŸ’₯ Add tutorial ["Uplift modeling metrics"](https://nbviewer.jupyter.org/github/maks-sh/scikit-uplift/blob/master/notebooks/uplift_metrics_tutorial.ipynb) by [@ElisovaIra](https://github.com/ElisovaIra). + ## Version 0.3.0 -### [sklift.datasets](https://www.uplift-modeling.com/en/latest/en/latest/api/datasets/index.html) +### [sklift.datasets](https://www.uplift-modeling.com/en/v0.3.0/api/datasets/index.html) -* πŸ”₯ Add [sklift.datasets](https://www.uplift-modeling.com/en/latest/en/latest/user_guide/index.html) by [@ElisovaIra](https://github.com/ElisovaIra), [@RobbStarkk](https://github.com/RobbStarkk), [@acssar](https://github.com/acssar), [@tankudo](https://github.com/tankudo), [@flashlight101](https://github.com/flashlight101), [@semenova-pd](https://github.com/semenova-pd), [@timfex](https://github.com/timfex) +* πŸ”₯ Add [sklift.datasets](https://www.uplift-modeling.com/en/v0.3.0/api/datasets/index.html) by [@ElisovaIra](https://github.com/ElisovaIra), [@RobbStarkk](https://github.com/RobbStarkk), [@acssar](https://github.com/acssar), [@tankudo](https://github.com/tankudo), [@flashlight101](https://github.com/flashlight101), [@semenova-pd](https://github.com/semenova-pd), [@timfex](https://github.com/timfex) -### [sklift.models](https://www.uplift-modeling.com/en/latest/en/latest/api/models.html) +### [sklift.models](https://www.uplift-modeling.com/en/v0.3.0/api/models/index.html) * πŸ“ Add different checkers by [@ElisovaIra](https://github.com/ElisovaIra) -### [sklift.metrics](https://www.uplift-modeling.com/en/latest/en/latest/api/metrics.html) +### [sklift.metrics](https://www.uplift-modeling.com/en/v0.3.0/api/metrics/index.html) * πŸ“ Add different checkers by [@ElisovaIra](https://github.com/ElisovaIra) -### [sklift.viz](https://www.uplift-modeling.com/en/latest/en/latest/api/viz.html) +### [sklift.viz](https://www.uplift-modeling.com/en/v0.3.0/api/viz/index.html) * πŸ“ Fix conflicting and duplicating default values by 
[@denniskorablev](https://github.com/denniskorablev) -### [User Guide](https://www.uplift-modeling.com/en/latest/en/latest/user_guide/index.html) +### [User Guide](https://www.uplift-modeling.com/en/v0.3.0/user_guide/index.html) * 📝 Fix typos ## Version 0.2.0 -### [User Guide](https://www.uplift-modeling.com/en/latest/en/latest/user_guide/index.html) +### [User Guide](https://www.uplift-modeling.com/en/v0.2.0/user_guide/index.html) -* 🔥 Add [User Guide](https://www.uplift-modeling.com/en/latest/en/latest/user_guide/index.html) +* 🔥 Add [User Guide](https://www.uplift-modeling.com/en/v0.2.0/user_guide/index.html) -### [sklift.models](https://www.uplift-modeling.com/en/latest/en/latest/api/models.html) +### [sklift.models](https://www.uplift-modeling.com/en/v0.2.0/api/models/index.html) -* 💥 Add `treatment interaction` method to [SoloModel](https://www.uplift-modeling.com/en/latest/en/latest/api/models/SoloModel.html) approach by [@AdiVarma27](https://github.com/AdiVarma27). +* 💥 Add `treatment interaction` method to [SoloModel](https://www.uplift-modeling.com/en/v0.2.0/api/models/SoloModel.html) approach by [@AdiVarma27](https://github.com/AdiVarma27). -### [sklift.metrics](https://www.uplift-modeling.com/en/latest/en/latest/api/metrics.html) +### [sklift.metrics](https://www.uplift-modeling.com/en/v0.2.0/api/metrics/index.html) -* 💥 Add [uplift_by_percentile](https://www.uplift-modeling.com/en/latest/en/latest/api/metrics/uplift_by_percentile.html) function by [@ElisovaIra](https://github.com/ElisovaIra). -* 💥 Add [weighted_average_uplift](https://www.uplift-modeling.com/en/latest/en/latest/api/metrics/weighted_average_uplift.html) function by [@ElisovaIra](https://github.com/ElisovaIra). -* 💥 Add [perfect_uplift_curve](https://www.uplift-modeling.com/en/latest/en/latest/api/metrics/perfect_uplift_curve.html) function. 
-* πŸ’₯ Add [perfect_qini_curve](https://www.uplift-modeling.com/en/latest/en/latest/api/metrics/perfect_qini_curve.html) function. -* πŸ”¨ Add normalization in [uplift_auc_score](https://www.uplift-modeling.com/en/latest/en/latest/api/metrics/uplift_auc_score.html) and [qini_auc_score](https://www.uplift-modeling.com/en/latest/en/latest/api/metrics/qini_auc_score.html) functions. -* ❗ Remove metrics `auuc` and `auqc`. In exchange for them use respectively [uplift_auc_score](https://www.uplift-modeling.com/en/latest/en/latest/api/metrics/uplift_auc_score.html) and [qini_auc_score](https://www.uplift-modeling.com/en/latest/en/latest/api/metrics/qini_auc_score.html) +* πŸ’₯ Add [uplift_by_percentile](https://www.uplift-modeling.com/en/v0.2.0/api/metrics/uplift_by_percentile.html) function by [@ElisovaIra](https://github.com/ElisovaIra). +* πŸ’₯ Add [weighted_average_uplift](https://www.uplift-modeling.com/en/v0.2.0/api/metrics/weighted_average_uplift.html) function by [@ElisovaIra](https://github.com/ElisovaIra). +* πŸ’₯ Add [perfect_uplift_curve](https://www.uplift-modeling.com/en/v0.2.0/api/metrics/perfect_uplift_curve.html) function. +* πŸ’₯ Add [perfect_qini_curve](https://www.uplift-modeling.com/en/v0.2.0/api/metrics/perfect_qini_curve.html) function. +* πŸ”¨ Add normalization in [uplift_auc_score](https://www.uplift-modeling.com/en/v0.2.0/api/metrics/uplift_auc_score.html) and [qini_auc_score](https://www.uplift-modeling.com/en/v0.2.0/api/metrics/qini_auc_score.html) functions. +* ❗ Remove metrics `auuc` and `auqc`. 
In exchange for them use respectively [uplift_auc_score](https://www.uplift-modeling.com/en/v0.2.0/api/metrics/uplift_auc_score.html) and [qini_auc_score](https://www.uplift-modeling.com/en/v0.2.0/api/metrics/qini_auc_score.html) -### [sklift.viz](https://www.uplift-modeling.com/en/latest/en/latest/api/viz.html) +### [sklift.viz](https://www.uplift-modeling.com/en/v0.2.0/api/viz/index.html) -* πŸ’₯ Add [plot_uplift_curve](https://www.uplift-modeling.com/en/latest/en/latest/api/viz/plot_uplift_curve.html) function. -* πŸ’₯ Add [plot_qini_curve](https://www.uplift-modeling.com/en/latest/en/latest/api/viz/plot_qini_curve.html) function. +* πŸ’₯ Add [plot_uplift_curve](https://www.uplift-modeling.com/en/v0.2.0/api/viz/plot_uplift_curve.html) function. +* πŸ’₯ Add [plot_qini_curve](https://www.uplift-modeling.com/en/v0.2.0/api/viz/plot_qini_curve.html) function. * ❗ Remove `plot_uplift_qini_curves`. ### Miscellaneous * πŸ’₯ Add contributors in main Readme and in main page of docs. -* πŸ’₯ Add [contributing guide](https://www.uplift-modeling.com/en/latest/en/latest/contributing.html). +* πŸ’₯ Add [contributing guide](https://www.uplift-modeling.com/en/v0.2.0/contributing.html). * πŸ’₯ Add [code of conduct](https://github.com/maks-sh/scikit-uplift/blob/master/.github/CODE_OF_CONDUCT.md). -* πŸ“ Reformat [Tutorials](https://www.uplift-modeling.com/en/latest/en/latest/tutorials.html) page. +* πŸ“ Reformat [Tutorials](https://www.uplift-modeling.com/en/v0.2.0/tutorials.html) page. * πŸ“ Add github buttons in docs. * πŸ“ Add logo compatibility with pypi. ## Version 0.1.2 -### [sklift.models](https://www.uplift-modeling.com/en/latest/en/v0.1.2/api/models.html) +### [sklift.models](https://www.uplift-modeling.com/en/v0.1.2/api/models.html) -* πŸ”¨ Fix bugs in [TwoModels](https://www.uplift-modeling.com/en/latest/en/v0.1.2/api/models.html#sklift.models.models.TwoModels) for regression problem. 
+* πŸ”¨ Fix bugs in [TwoModels](https://www.uplift-modeling.com/en/v0.1.2/api/models.html#sklift.models.models.TwoModels) for regression problem. * πŸ“ Minor code refactoring. -### [sklift.metrics](https://www.uplift-modeling.com/en/latest/en/v0.1.2/api/metrics.html) +### [sklift.metrics](https://www.uplift-modeling.com/en/v0.1.2/api/metrics.html) * πŸ“ Minor code refactoring. -### [sklift.viz](https://www.uplift-modeling.com/en/latest/en/v0.1.2/api/viz.html) +### [sklift.viz](https://www.uplift-modeling.com/en/v0.1.2/api/viz.html) -* πŸ’₯ Add bar plot in [plot_uplift_by_percentile](https://www.uplift-modeling.com/en/latest/en/v0.1.2/api/viz.html#sklift.viz.base.plot_uplift_by_percentile) by [@ElisovaIra](https://github.com/ElisovaIra). -* πŸ”¨ Fix bug in [plot_uplift_by_percentile](https://www.uplift-modeling.com/en/latest/en/v0.1.2/api/viz.html#sklift.viz.base.plot_uplift_by_percentile). +* πŸ’₯ Add bar plot in [plot_uplift_by_percentile](https://www.uplift-modeling.com/en/v0.1.2/api/viz.html#sklift.viz.base.plot_uplift_by_percentile) by [@ElisovaIra](https://github.com/ElisovaIra). +* πŸ”¨ Fix bug in [plot_uplift_by_percentile](https://www.uplift-modeling.com/en/v0.1.2/api/viz.html#sklift.viz.base.plot_uplift_by_percentile). * πŸ“ Minor code refactoring. ## Version 0.1.1 -### [sklift.viz](https://www.uplift-modeling.com/en/latest/en/v0.1.1/api/viz.html) +### [sklift.viz](https://www.uplift-modeling.com/en/v0.1.1/api/viz.html) -* πŸ’₯ Add [plot_uplift_by_percentile](https://www.uplift-modeling.com/en/latest/en/v0.1.1/api/viz.html#sklift.viz.base.plot_uplift_by_percentile) by [@ElisovaIra](https://github.com/ElisovaIra). -* πŸ”¨ Fix bug with import [plot_treatment_balance_curve](https://www.uplift-modeling.com/en/latest/en/v0.1.1/api/viz.html#sklift.viz.base.plot_treatment_balance_curve). 
+* πŸ’₯ Add [plot_uplift_by_percentile](https://www.uplift-modeling.com/en/v0.1.1/api/viz.html#sklift.viz.base.plot_uplift_by_percentile) by [@ElisovaIra](https://github.com/ElisovaIra). +* πŸ”¨ Fix bug with import [plot_treatment_balance_curve](https://www.uplift-modeling.com/en/v0.1.1/api/viz.html#sklift.viz.base.plot_treatment_balance_curve). -### [sklift.metrics](https://www.uplift-modeling.com/en/latest/en/v0.1.1/api/metrics.html) +### [sklift.metrics](https://www.uplift-modeling.com/en/v0.1.1/api/metrics.html) -* πŸ’₯ Add [response_rate_by_percentile](https://www.uplift-modeling.com/en/latest/en/v0.1.1/api/viz.html#sklift.metrics.metrics.response_rate_by_percentile) by [@ElisovaIra](https://github.com/ElisovaIra). -* πŸ”¨ Fix bug with import [uplift_auc_score](https://www.uplift-modeling.com/en/latest/en/v0.1.1/api/metrics.html#sklift.metrics.metrics.uplift_auc_score) and [qini_auc_score](https://www.uplift-modeling.com/en/latest/en/v0.1.1/metrics.html#sklift.metrics.metrics.qini_auc_score). +* πŸ’₯ Add [response_rate_by_percentile](https://www.uplift-modeling.com/en/v0.1.1/api/viz.html#sklift.metrics.metrics.response_rate_by_percentile) by [@ElisovaIra](https://github.com/ElisovaIra). +* πŸ”¨ Fix bug with import [uplift_auc_score](https://www.uplift-modeling.com/en/v0.1.1/api/metrics.html#sklift.metrics.metrics.uplift_auc_score) and [qini_auc_score](https://www.uplift-modeling.com/en/v0.1.1/metrics.html#sklift.metrics.metrics.qini_auc_score). * πŸ“ Fix typos in docstrings. ### Miscellaneous @@ -101,25 +115,25 @@ ## Version 0.1.0 -### [sklift.models](https://www.uplift-modeling.com/en/latest/en/v0.1.0/api/models.html) +### [sklift.models](https://www.uplift-modeling.com/en/v0.1.0/api/models.html) -* πŸ“ Fix typo in [TwoModels](https://www.uplift-modeling.com/en/latest/en/v0.1.0/api/models.html#sklift.models.models.TwoModels) docstring by [@spiaz](https://github.com/spiaz). 
+* πŸ“ Fix typo in [TwoModels](https://www.uplift-modeling.com/en/v0.1.0/api/models.html#sklift.models.models.TwoModels) docstring by [@spiaz](https://github.com/spiaz). * πŸ“ Improve docstrings and add references to all approaches. -### [sklift.metrics](https://www.uplift-modeling.com/en/latest/en/v0.1.0/api/metrics.html) +### [sklift.metrics](https://www.uplift-modeling.com/en/v0.1.0/api/metrics.html) -* πŸ’₯ Add [treatment_balance_curve](https://www.uplift-modeling.com/en/latest/en/v0.1.0/api/metrics.html#sklift.metrics.metrics.treatment_balance_curve) by [@spiaz](https://github.com/spiaz). -* ❗️ The metrics `auuc` and `auqc` are now respectively renamed to [uplift_auc_score](https://www.uplift-modeling.com/en/latest/en/v0.1.0/api/metrics.html#sklift.metrics.metrics.uplift_auc_score) and [qini_auc_score](https://www.uplift-modeling.com/en/latest/en/v0.1.0/metrics.html#sklift.metrics.metrics.qini_auc_score). So, `auuc` and `auqc` will be removed in 0.2.0. -* ❗️ Add a new parameter `startegy` in [uplift_at_k](https://www.uplift-modeling.com/en/latest/en/v0.1.0/metrics.html#sklift.metrics.metrics.uplift_at_k). +* πŸ’₯ Add [treatment_balance_curve](https://www.uplift-modeling.com/en/v0.1.0/api/metrics.html#sklift.metrics.metrics.treatment_balance_curve) by [@spiaz](https://github.com/spiaz). +* ❗️ The metrics `auuc` and `auqc` are now respectively renamed to [uplift_auc_score](https://www.uplift-modeling.com/en/v0.1.0/api/metrics.html#sklift.metrics.metrics.uplift_auc_score) and [qini_auc_score](https://www.uplift-modeling.com/en/v0.1.0/metrics.html#sklift.metrics.metrics.qini_auc_score). So, `auuc` and `auqc` will be removed in 0.2.0. +* ❗️ Add a new parameter `startegy` in [uplift_at_k](https://www.uplift-modeling.com/en/v0.1.0/metrics.html#sklift.metrics.metrics.uplift_at_k). 
-### [sklift.viz](https://www.uplift-modeling.com/en/latest/en/v0.1.0/api/viz.html) +### [sklift.viz](https://www.uplift-modeling.com/en/v0.1.0/api/viz.html) -* πŸ’₯ Add [plot_treatment_balance_curve](https://www.uplift-modeling.com/en/latest/en/v0.1.0/api/viz.html#sklift.viz.base.plot_treatment_balance_curve) by [@spiaz](https://github.com/spiaz). -* πŸ“ fix typo in [plot_uplift_qini_curves](https://www.uplift-modeling.com/en/latest/en/v0.1.0/api/viz.html#sklift.viz.base.plot_uplift_qini_curves) by [@spiaz](https://github.com/spiaz). +* πŸ’₯ Add [plot_treatment_balance_curve](https://www.uplift-modeling.com/en/v0.1.0/api/viz.html#sklift.viz.base.plot_treatment_balance_curve) by [@spiaz](https://github.com/spiaz). +* πŸ“ fix typo in [plot_uplift_qini_curves](https://www.uplift-modeling.com/en/v0.1.0/api/viz.html#sklift.viz.base.plot_uplift_qini_curves) by [@spiaz](https://github.com/spiaz). ### Miscellaneous * ❗️ Remove sklift.preprocess submodule. * πŸ’₯ Add compatibility of tutorials with colab and add colab buttons by [@ElMaxuno](https://github.com/ElMaxuno). * πŸ’₯ Add Changelog. -* πŸ“ Change the documentation structure. Add next pages: [Tutorials](https://www.uplift-modeling.com/en/latest/en/v0.1.0/tutorials.html), [Release History](https://www.uplift-modeling.com/en/latest/en/v0.1.0/changelog.html) and [Hall of fame](https://www.uplift-modeling.com/en/latest/en/v0.1.0/hall_of_fame.html). \ No newline at end of file +* πŸ“ Change the documentation structure. Add next pages: [Tutorials](https://www.uplift-modeling.com/en/v0.1.0/tutorials.html), [Release History](https://www.uplift-modeling.com/en/v0.1.0/changelog.html) and [Hall of fame](https://www.uplift-modeling.com/en/v0.1.0/hall_of_fame.html). 
\ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index dfe04f6..e728b1f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,7 +22,7 @@ The main idea is to provide easy-to-use and fast python package for uplift model * Select a tiny group of customers in the campaign where a price per customer is high. -Read more about *uplift modeling* problem in `User Guide `__, +Read more about *uplift modeling* problem in `User Guide `__, Articles in russian on habr.com: `Part 1 `__ and `Part 2 `__. @@ -75,38 +75,6 @@ Sklift is being actively maintained and welcomes new contributors of all experie If you have any questions, please contact us at team@uplift-modeling.com -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/0 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/0 - :alt: Top contributor 1 - -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/1 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/1 - :alt: Top contributor 2 - -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/2 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/2 - :alt: Top contributor 3 - -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/3 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/3 - :alt: Top contributor 4 - -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/4 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/4 - :alt: Top contributor 5 - -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/5 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/5 - :alt: Top contributor 6 - -.. image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/6 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/6 - :alt: Top contributor 7 - -.. 
image:: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/images/7 - :target: https://sourcerer.io/fame/maks-sh/maks-sh/scikit-uplift/links/7 - :alt: Legend - .. toctree:: :hidden: diff --git a/docs/quick_start.rst b/docs/quick_start.rst index 92794e2..77c70ca 100644 --- a/docs/quick_start.rst +++ b/docs/quick_start.rst @@ -16,7 +16,7 @@ See the **RetailHero tutorial notebook** (`EN`_ |Open In Colab1|_, `RU`_ |Open I Train and predict your uplift model ==================================== -Use the intuitive python API to train uplift models. +Use the intuitive python API to train uplift models with `sklift.models `__. .. code-block:: python :linenos: @@ -44,6 +44,8 @@ Use the intuitive python API to train uplift models. Evaluate your uplift model =========================== +Uplift model evaluation metrics are available in `sklift.metrics `__. + .. code-block:: python :linenos: @@ -73,6 +75,8 @@ Evaluate your uplift model Vizualize the results ====================== +Visualize performance metrics with `sklift.viz `__. + .. code-block:: python :linenos: diff --git a/docs/tutorials.rst b/docs/tutorials.rst index adbd775..61ee43e 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -35,6 +35,19 @@ It is better to start scikit-uplift from the basic tutorials. - `nbviewer `__ - `github `__ +`Uplift modeling metrics`_ +---------------------------------------------------------------------------------- + +.. list-table:: + :align: center + :widths: 12 15 10 8 + + * - In English πŸ‡¬πŸ‡§ + - |Open In Colab1| + - `nbviewer `__ + - `github `__ + + `Example of usage model from sklift.models in sklearn.pipeline`_ ---------------------------------------------------------------------------------- diff --git a/notebooks/Readme.rst b/notebooks/Readme.rst index adbd775..b2090ab 100644 --- a/notebooks/Readme.rst +++ b/notebooks/Readme.rst @@ -35,6 +35,18 @@ It is better to start scikit-uplift from the basic tutorials. 
- `nbviewer `__ - `github `__ +`Uplift modeling metrics`_ +---------------------------------------------------------------------------------- + +.. list-table:: + :align: center + :widths: 12 15 10 8 + + * - In English πŸ‡¬πŸ‡§ + - |Open In Colab1| + - `nbviewer `__ + - `github `__ + `Example of usage model from sklift.models in sklearn.pipeline`_ ---------------------------------------------------------------------------------- diff --git a/notebooks/pipeline_usage_RU.ipynb b/notebooks/pipeline_usage_RU.ipynb index 16892f5..01552cf 100644 --- a/notebooks/pipeline_usage_RU.ipynb +++ b/notebooks/pipeline_usage_RU.ipynb @@ -51,7 +51,7 @@ }, "outputs": [], "source": [ - "# !pip install scikit-uplift xgboost==1.0.2 category_encoders==2.1.0 -U" + "# pip install scikit-uplift xgboost==1.0.2 category_encoders==2.1.0 -U" ] }, { @@ -395,8 +395,17 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/notebooks/uplift_metrics_tutorial.ipynb b/notebooks/uplift_metrics_tutorial.ipynb new file mode 100644 index 0000000..3b78ef2 --- /dev/null +++ b/notebooks/uplift_metrics_tutorial.ipynb @@ -0,0 +1,1536 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 🎯 Uplift modeling `metrics`\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " SCIKIT-UPLIFT REPO | \n", + " SCIKIT-UPLIFT DOCS | \n", + " USER GUIDE\n", + "
\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-19T20:20:26.539268Z", + "start_time": "2021-02-19T20:20:26.526760Z" + } + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "# install uplift library scikit-uplift and other libraries \n", + "!{sys.executable} -m pip install scikit-uplift dill catboost" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# πŸ“ Load data\n", + "\n", + "We are going to use a `Lenta dataset` from the BigTarget Hackathon hosted in summer 2020 by Lenta and Microsoft.\n", + "\n", + "Lenta is a Russian food retailer. \n", + "\n", + "### Data description\n", + "\n", + "✏️ Dataset can be loaded from `sklift.datasets` module using `fetch_lenta` function.\n", + "\n", + "Read more about dataset in the api docs. \n", + "\n", + "This is an uplift modeling dataset containing data about Lenta customers' grocery shopping, marketing campaign communications as `treatment` and store visits as `target`.\n", + "\n", + "#### ✏️ Major columns:\n", + "\n", + "- `group` - treatment / control flag\n", + "- `response_att` - binary target\n", + "- `CardHolder` - customer id\n", + "- `gender` - customer gender \n", + "- `age` - customer age" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-19T20:20:53.041410Z", + "start_time": "2021-02-19T20:20:31.218625Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "from sklift.datasets import fetch_lenta\n", + "\n", + "# returns sklearn Bunch object\n", + "# with data, target, treatment keys\n", + "# data features (pd.DataFrame), target (pd.Series), treatment (pd.Series) values \n", + "dataset = fetch_lenta()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-19T20:20:53.086524Z", + "start_time": "2021-02-19T20:20:53.044019Z" + } + }, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "Dataset type: \n", + "\n", + "Dataset features shape: (687029, 193)\n", + "Dataset target shape: (687029,)\n", + "Dataset treatment shape: (687029,)\n" + ] + } + ], + "source": [ + "print(f\"Dataset type: {type(dataset)}\\n\")\n", + "print(f\"Dataset features shape: {dataset.data.shape}\")\n", + "print(f\"Dataset target shape: {dataset.target.shape}\")\n", + "print(f\"Dataset treatment shape: {dataset.treatment.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# πŸ“ EDA" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-19T20:20:53.181597Z", + "start_time": "2021-02-19T20:20:53.100485Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agecheque_count_12m_g20cheque_count_12m_g21cheque_count_12m_g25cheque_count_12m_g32cheque_count_12m_g33cheque_count_12m_g38cheque_count_12m_g39cheque_count_12m_g41cheque_count_12m_g42...sale_sum_6m_g24sale_sum_6m_g25sale_sum_6m_g26sale_sum_6m_g32sale_sum_6m_g33sale_sum_6m_g44sale_sum_6m_g54stdev_days_between_visits_15dstdev_discount_depth_15dstdev_discount_depth_1m
047.03.022.019.03.028.08.07.06.01.0...3141.25356.67237.25283.843648.231195.37535.421.70780.27980.3008
157.01.00.02.01.01.01.00.01.00.0...113.3962.6958.7187.01179.830.00122.980.00000.00000.0000
238.07.00.015.04.09.05.09.014.07.0...1239.19533.4683.37593.131217.431336.833709.820.0000NaN0.0803
365.06.03.025.02.010.014.011.08.01.0...139.681849.91360.40175.73496.73172.581246.210.00000.00000.0000
461.00.01.02.00.02.01.00.03.02.0...226.98168.05461.370.00237.93225.51995.271.41420.34950.3495
68702435.00.00.04.00.02.00.01.00.03.0...550.09669.33111.870.00330.961173.84119.992.64580.36460.3282
68702533.00.00.00.00.00.00.00.00.00.0...0.000.000.000.000.000.0028.010.00000.00000.0000
68702636.00.00.03.00.00.00.00.01.00.0...0.000.000.000.000.00449.010.000.0000NaNNaN
68702737.00.01.02.00.00.00.00.00.01.0...0.0046.720.000.000.000.000.000.0000NaNNaN
68702840.00.01.00.00.02.00.00.02.02.0...290.010.000.000.00228.47752.32596.860.00000.00000.0000
\n", + "

10 rows Γ— 193 columns

\n", + "
" + ], + "text/plain": [ + " age cheque_count_12m_g20 cheque_count_12m_g21 \\\n", + "0 47.0 3.0 22.0 \n", + "1 57.0 1.0 0.0 \n", + "2 38.0 7.0 0.0 \n", + "3 65.0 6.0 3.0 \n", + "4 61.0 0.0 1.0 \n", + "687024 35.0 0.0 0.0 \n", + "687025 33.0 0.0 0.0 \n", + "687026 36.0 0.0 0.0 \n", + "687027 37.0 0.0 1.0 \n", + "687028 40.0 0.0 1.0 \n", + "\n", + " cheque_count_12m_g25 cheque_count_12m_g32 cheque_count_12m_g33 \\\n", + "0 19.0 3.0 28.0 \n", + "1 2.0 1.0 1.0 \n", + "2 15.0 4.0 9.0 \n", + "3 25.0 2.0 10.0 \n", + "4 2.0 0.0 2.0 \n", + "687024 4.0 0.0 2.0 \n", + "687025 0.0 0.0 0.0 \n", + "687026 3.0 0.0 0.0 \n", + "687027 2.0 0.0 0.0 \n", + "687028 0.0 0.0 2.0 \n", + "\n", + " cheque_count_12m_g38 cheque_count_12m_g39 cheque_count_12m_g41 \\\n", + "0 8.0 7.0 6.0 \n", + "1 1.0 0.0 1.0 \n", + "2 5.0 9.0 14.0 \n", + "3 14.0 11.0 8.0 \n", + "4 1.0 0.0 3.0 \n", + "687024 0.0 1.0 0.0 \n", + "687025 0.0 0.0 0.0 \n", + "687026 0.0 0.0 1.0 \n", + "687027 0.0 0.0 0.0 \n", + "687028 0.0 0.0 2.0 \n", + "\n", + " cheque_count_12m_g42 ... sale_sum_6m_g24 sale_sum_6m_g25 \\\n", + "0 1.0 ... 3141.25 356.67 \n", + "1 0.0 ... 113.39 62.69 \n", + "2 7.0 ... 1239.19 533.46 \n", + "3 1.0 ... 139.68 1849.91 \n", + "4 2.0 ... 226.98 168.05 \n", + "687024 3.0 ... 550.09 669.33 \n", + "687025 0.0 ... 0.00 0.00 \n", + "687026 0.0 ... 0.00 0.00 \n", + "687027 1.0 ... 0.00 46.72 \n", + "687028 2.0 ... 
290.01 0.00 \n", + "\n", + " sale_sum_6m_g26 sale_sum_6m_g32 sale_sum_6m_g33 sale_sum_6m_g44 \\\n", + "0 237.25 283.84 3648.23 1195.37 \n", + "1 58.71 87.01 179.83 0.00 \n", + "2 83.37 593.13 1217.43 1336.83 \n", + "3 360.40 175.73 496.73 172.58 \n", + "4 461.37 0.00 237.93 225.51 \n", + "687024 111.87 0.00 330.96 1173.84 \n", + "687025 0.00 0.00 0.00 0.00 \n", + "687026 0.00 0.00 0.00 449.01 \n", + "687027 0.00 0.00 0.00 0.00 \n", + "687028 0.00 0.00 228.47 752.32 \n", + "\n", + " sale_sum_6m_g54 stdev_days_between_visits_15d \\\n", + "0 535.42 1.7078 \n", + "1 122.98 0.0000 \n", + "2 3709.82 0.0000 \n", + "3 1246.21 0.0000 \n", + "4 995.27 1.4142 \n", + "687024 119.99 2.6458 \n", + "687025 28.01 0.0000 \n", + "687026 0.00 0.0000 \n", + "687027 0.00 0.0000 \n", + "687028 596.86 0.0000 \n", + "\n", + " stdev_discount_depth_15d stdev_discount_depth_1m \n", + "0 0.2798 0.3008 \n", + "1 0.0000 0.0000 \n", + "2 NaN 0.0803 \n", + "3 0.0000 0.0000 \n", + "4 0.3495 0.3495 \n", + "687024 0.3646 0.3282 \n", + "687025 0.0000 0.0000 \n", + "687026 NaN NaN \n", + "687027 NaN NaN \n", + "687028 0.0000 0.0000 \n", + "\n", + "[10 rows x 193 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.head().append(dataset.data.tail())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### πŸ€” target share for `treatment / control` " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-19T20:20:53.356948Z", + "start_time": "2021-02-19T20:20:53.193956Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
response_att01
group
control0.8974210.102579
test0.8898740.110126
\n", + "
" + ], + "text/plain": [ + "response_att 0 1\n", + "group \n", + "control 0.897421 0.102579\n", + "test 0.889874 0.110126" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd \n", + "\n", + "pd.crosstab(dataset.treatment, dataset.target, normalize='index')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-19T20:20:53.427973Z", + "start_time": "2021-02-19T20:20:53.361728Z" + } + }, + "outputs": [], + "source": [ + "# make treatment binary\n", + "treat_dict = {\n", + " 'test': 1,\n", + " 'control': 0\n", + "}\n", + "\n", + "dataset.treatment = dataset.treatment.map(treat_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-19T20:20:53.545895Z", + "start_time": "2021-02-19T20:20:53.430192Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Π– 433448\n", + "М 243910\n", + "НС ΠΎΠΏΡ€Π΅Π΄Π΅Π»Π΅Π½ 9671\n", + "Name: gender, dtype: int64\n" + ] + } + ], + "source": [ + "# fill NaNs in the categorical feature `gender` \n", + "# for CatBoostClassifier\n", + "dataset.data['gender'] = dataset.data['gender'].fillna(value='НС ΠΎΠΏΡ€Π΅Π΄Π΅Π»Π΅Π½')\n", + "\n", + "print(dataset.data['gender'].value_counts(dropna=False))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### βœ‚οΈ train test split\n", + "\n", + "- stratify by two columns: treatment and target. \n", + "\n", + "`Intuition:` In a binary classification problem definition we stratify train set by splitting target `0/1` column. In uplift modeling we have two columns instead of one. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-19T20:21:03.159534Z", + "start_time": "2021-02-19T20:20:53.548597Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train shape: (480920, 193)\n", + "Validation shape: (206109, 193)\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "stratify_cols = pd.concat([dataset.treatment, dataset.target], axis=1)\n", + "\n", + "X_train, X_val, trmnt_train, trmnt_val, y_train, y_val = train_test_split(\n", + " dataset.data,\n", + " dataset.treatment,\n", + " dataset.target,\n", + " stratify=stratify_cols,\n", + " test_size=0.3,\n", + " random_state=42\n", + ")\n", + "\n", + "print(f\"Train shape: {X_train.shape}\")\n", + "print(f\"Validation shape: {X_val.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-06-07T17:47:46.311346Z", + "start_time": "2020-06-07T17:47:46.293688Z" + } + }, + "source": [ + "# 👾 Class Transformation uplift model\n", + "\n", + "`Class transformation` method is described here \n", + "\n", + "Class transformation method `may` be used in case of treatment unbalanced data. In this case one will get not an uplift score but some *ranking* score still useful for ranking objects."
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-19T20:21:03.204975Z", + "start_time": "2021-02-19T20:21:03.165305Z" + } + }, + "outputs": [], + "source": [ + "from sklift.models import ClassTransformation\n", + "from catboost import CatBoostClassifier\n", + "\n", + "estimator = CatBoostClassifier(verbose=100, \n", + " cat_features=['gender'],\n", + " random_state=42,\n", + " thread_count=1)\n", + "\n", + "ct_model = ClassTransformation(estimator=estimator)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-19T20:37:38.852200Z", + "start_time": "2021-02-19T20:21:03.208181Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/macdrive/GoogleDrive/ΠŸΡ€ΠΎΠ΅ΠΊΡ‚Ρ‹/Uplift/sklift-env/lib/python3.7/site-packages/ipykernel_launcher.py:4: UserWarning: It is recommended to use this approach on treatment balanced data. 
Current sample size is unbalanced.\n", + " after removing the cwd from sys.path.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Learning rate set to 0.143939\n", + "0:\tlearn: 0.6685632\ttotal: 1.03s\tremaining: 17m 6s\n", + "100:\tlearn: 0.5948982\ttotal: 1m 34s\tremaining: 14m\n", + "200:\tlearn: 0.5907078\ttotal: 3m 16s\tremaining: 13m 3s\n", + "300:\tlearn: 0.5869612\ttotal: 4m 51s\tremaining: 11m 16s\n", + "400:\tlearn: 0.5835421\ttotal: 6m 35s\tremaining: 9m 51s\n", + "500:\tlearn: 0.5801981\ttotal: 8m 31s\tremaining: 8m 29s\n", + "600:\tlearn: 0.5769677\ttotal: 10m 13s\tremaining: 6m 47s\n", + "700:\tlearn: 0.5737862\ttotal: 11m 44s\tremaining: 5m\n", + "800:\tlearn: 0.5706947\ttotal: 13m 37s\tremaining: 3m 23s\n", + "900:\tlearn: 0.5677125\ttotal: 15m 7s\tremaining: 1m 39s\n", + "999:\tlearn: 0.5648426\ttotal: 16m 29s\tremaining: 0us\n" + ] + }, + { + "data": { + "text/plain": [ + "ClassTransformation(estimator=)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ct_model.fit(\n", + " X=X_train, \n", + " y=y_train, \n", + " treatment=trmnt_train\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save model" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-19T20:37:38.925679Z", + "start_time": "2021-02-19T20:37:38.857315Z" + } + }, + "outputs": [], + "source": [ + "import dill\n", + "\n", + "with open(\"model.dill\", 'wb') as f:\n", + " dill.dump(ct_model, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Uplift prediction" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-19T20:37:39.642947Z", + "start_time": "2021-02-19T20:37:38.928915Z" + } + }, + "outputs": [], + "source": [ + "uplift_ct = ct_model.predict(X_val)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "# 🚀🚀🚀 Uplift metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🚀 `uplift@k`\n", + "\n", + "- uplift at first k%\n", + "- usually falls between [0; 1] depending on k, model quality and data\n", + "\n", + "\n", + "### `uplift@k` = `target mean at k% in the treatment group` - `target mean at k% in the control group`\n", + "\n", + "___\n", + "\n", + "How to count `uplift@k`:\n", + "\n", + "1. sort by predicted uplift\n", + "2. select first k%\n", + "3. count target mean in the treatment group\n", + "4. count target mean in the control group\n", + "5. subtract the mean in the control group from the mean in the treatment group\n", + "\n", + "---\n", + "\n", + "Code parameter options:\n", + "\n", + "- `strategy='overall'` - sort by uplift treatment and control together\n", + "- `strategy='by_group'` - sort by uplift treatment and control separately" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-18T19:36:32.767618Z", + "start_time": "2021-02-18T19:36:32.618887Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "uplift@10%: 0.1467 (sort groups by uplift together)\n", + "uplift@10%: 0.1503 (sort groups by uplift separately)\n" + ] + } + ], + "source": [ + "from sklift.metrics import uplift_at_k\n", + "\n", + "# k = 10%\n", + "k = 0.1 \n", + "\n", + "# strategy='overall' sort by uplift treatment and control together\n", + "uplift_overall = uplift_at_k(y_val, uplift_ct, trmnt_val, strategy='overall', k=k)\n", + "\n", + "# strategy='by_group' sort by uplift treatment and control separately\n", + "uplift_bygroup = uplift_at_k(y_val, uplift_ct, trmnt_val, strategy='by_group', k=k)\n", + "\n", + "\n", + "print(f\"uplift@{k * 100:.0f}%: {uplift_overall:.4f} (sort groups by uplift together)\")\n", + "print(f\"uplift@{k * 100:.0f}%: {uplift_bygroup:.4f} (sort groups by uplift separately)\")" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🚀 `uplift_by_percentile` table\n", + "\n", + "Count metrics for each percentile in data in descending order by uplift prediction (by rows):\n", + "\n", + "- `n_treatment` - treatment group size in the one percentile\n", + "- `n_control` - control group size in the one percentile\n", + "- `response_rate_treatment` - target mean in the treatment group in the one percentile\n", + "- `response_rate_control` - target mean in the control group in the one percentile\n", + "- `uplift = response_rate_treatment - response_rate_control` in the one percentile\n", + "\n", + "___\n", + "\n", + "Code parameter options are:\n", + "\n", + "- `strategy='overall'` - sort by uplift treatment and control groups together\n", + "- `strategy='by_group'` - sort by uplift treatment and control groups separately\n", + "- `total=True` - show total metric on full data\n", + "- `std=True` - show metrics std by row " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-18T19:36:49.986630Z", + "start_time": "2021-02-18T19:36:49.766425Z" + }, + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
n_treatmentn_controlresponse_rate_treatmentresponse_rate_controlupliftstd_treatmentstd_controlstd_uplift
percentile
0-101271578960.3663390.2196050.1467340.0042730.0046590.006321
10-201556050510.2142670.1977830.0164850.0032890.0056050.006499
20-301568349280.1495250.1306820.0188430.0028480.0048010.005582
30-401567549360.1113880.0986630.0127250.0025130.0042450.004933
40-501579848130.0827950.0779140.0048810.0021920.0038640.004442
50-601577648350.0625000.0579110.0045890.0019270.0033590.003873
60-701576848430.0515600.0501760.0013850.0017610.0031370.003597
70-801579348180.0421710.0348690.0073010.0015990.0026430.003089
80-901588447270.0351930.0315210.0036720.0014620.0025410.002932
90-1001611644940.0390300.041611-0.0025820.0015260.0029790.003347
total154768513410.1101260.1025690.0075570.0233900.0378320.044615
\n", + "
" + ], + "text/plain": [ + " n_treatment n_control response_rate_treatment \\\n", + "percentile \n", + "0-10 12715 7896 0.366339 \n", + "10-20 15560 5051 0.214267 \n", + "20-30 15683 4928 0.149525 \n", + "30-40 15675 4936 0.111388 \n", + "40-50 15798 4813 0.082795 \n", + "50-60 15776 4835 0.062500 \n", + "60-70 15768 4843 0.051560 \n", + "70-80 15793 4818 0.042171 \n", + "80-90 15884 4727 0.035193 \n", + "90-100 16116 4494 0.039030 \n", + "total 154768 51341 0.110126 \n", + "\n", + " response_rate_control uplift std_treatment std_control \\\n", + "percentile \n", + "0-10 0.219605 0.146734 0.004273 0.004659 \n", + "10-20 0.197783 0.016485 0.003289 0.005605 \n", + "20-30 0.130682 0.018843 0.002848 0.004801 \n", + "30-40 0.098663 0.012725 0.002513 0.004245 \n", + "40-50 0.077914 0.004881 0.002192 0.003864 \n", + "50-60 0.057911 0.004589 0.001927 0.003359 \n", + "60-70 0.050176 0.001385 0.001761 0.003137 \n", + "70-80 0.034869 0.007301 0.001599 0.002643 \n", + "80-90 0.031521 0.003672 0.001462 0.002541 \n", + "90-100 0.041611 -0.002582 0.001526 0.002979 \n", + "total 0.102569 0.007557 0.023390 0.037832 \n", + "\n", + " std_uplift \n", + "percentile \n", + "0-10 0.006321 \n", + "10-20 0.006499 \n", + "20-30 0.005582 \n", + "30-40 0.004933 \n", + "40-50 0.004442 \n", + "50-60 0.003873 \n", + "60-70 0.003597 \n", + "70-80 0.003089 \n", + "80-90 0.002932 \n", + "90-100 0.003347 \n", + "total 0.044615 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklift.metrics import uplift_by_percentile\n", + "\n", + "uplift_by_percentile(y_val, uplift_ct, trmnt_val, \n", + " strategy='overall', \n", + " total=True, std=True, bins=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## πŸš€ `weighted average uplift `\n", + "\n", + "- counts uplift on full data\n", + "- uses results from `uplift_by_percentile` table\n", + "- result depends on number of bins\n", + "\n", + "### `weighted 
average uplift` = `sum of uplift by percentile weighted on the treatment group size`\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-18T19:36:55.281993Z", + "start_time": "2021-02-18T19:36:55.170863Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "average uplift on full data: 0.0189\n" + ] + } + ], + "source": [ + "from sklift.metrics import weighted_average_uplift\n", + "\n", + "uplift_full_data = weighted_average_uplift(y_val, uplift_ct, trmnt_val, bins=10) \n", + "print(f\"average uplift on full data: {uplift_full_data:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## πŸš€ `uplift_by_percentile` plot\n", + "\n", + "- visualize results of `uplift_by_percentile` table\n", + "\n", + "Two ways to plot:\n", + "\n", + "- line plot `kind='line'`\n", + "- bar plot `kind='bar'`\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-18T19:37:01.771296Z", + "start_time": "2021-02-18T19:37:00.777556Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from sklift.viz import plot_uplift_by_percentile\n", + "\n", + "# line plot\n", + "plot_uplift_by_percentile(y_val, uplift_ct, trmnt_val, strategy='overall', kind='line');" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-18T19:37:05.278805Z", + "start_time": "2021-02-18T19:37:04.673485Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# bar plot\n", + "plot_uplift_by_percentile(y_val, uplift_ct, trmnt_val, strategy='overall', kind='bar');" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## πŸš€ `Qini curve` \n", + "\n", + "The curve plots the absolute incremental outcome of the treated group compared to group with no treatment. \n", + "\n", + "\n", + "plot Qini curve: \n", + "- `blue line` is a `real Qini curve` based on data.\n", + "- `red line` is an `ideal Qini curve` based on data. Code: `perfect=True`\n", + "- `grey line` is a `random Qini curve` based on data\n", + " \n", + "\n", + "## πŸš€ `AUQC` (`area under Qini curve` or `Qini coefficient`)\n", + "\n", + "`Qini coefficient` = `light blue area between the real Qini curve and the random Qini curve normalized on area between the random and the ideal line`\n", + "\n", + "\"qini_curve\"\n", + "\n", + "\n", + "- metric is printed at the title of the Qini curve plot\n", + "- can be called as a separate function" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-18T19:37:13.035790Z", + "start_time": "2021-02-18T19:37:12.272926Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from sklift.viz import plot_qini_curve\n", + "\n", + "# with ideal Qini curve (red line)\n", + "# perfect=True\n", + "plot_qini_curve(y_val, uplift_ct, trmnt_val, perfect=True);" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-18T19:37:17.212317Z", + "start_time": "2021-02-18T19:37:16.455878Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# no ideal Qini curve\n", + "# only real Qini curve\n", + "# perfect=False\n", + "plot_qini_curve(y_val, uplift_ct, trmnt_val, perfect=False);" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-18T19:37:20.543940Z", + "start_time": "2021-02-18T19:37:20.415593Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Qini coefficient on full data: 0.0695\n" + ] + } + ], + "source": [ + "from sklift.metrics import qini_auc_score\n", + "\n", + "# AUQC = area under Qini curve = Qini coefficient\n", + "auqc = qini_auc_score(y_val, uplift_ct, trmnt_val) \n", + "print(f\"Qini coefficient on full data: {auqc:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🚀 `Uplift curve` \n", + "\n", + "The Uplift curve plots incremental uplift.\n", + "\n", + "\n", + " - `blue line` is a `real Uplift curve` based on data. \n", + " - `red line` is an `ideal Uplift curve` based on data. Code: `perfect=True`\n", + " - `grey line` is a `random Uplift curve` based on data.\n", + " \n", + "\n", + "## 🚀 `AUUC` (`area under uplift curve`)\n", + "\n", + "- `Area under uplift curve` = blue area between the real Uplift curve and the random Uplift curve \n", + " - appears at the title of the Uplift curve plot\n", + " - can be called as a separate function\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-18T19:37:23.685370Z", + "start_time": "2021-02-18T19:37:22.944633Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from sklift.viz import plot_uplift_curve\n", + "\n", + "# with ideal curve\n", + "# perfect=True\n", + "plot_uplift_curve(y_val, uplift_ct, trmnt_val, perfect=True);" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-18T19:37:26.670587Z", + "start_time": "2021-02-18T19:37:25.951757Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# only real\n", + "# perfect=False\n", + "plot_uplift_curve(y_val, uplift_ct, trmnt_val, perfect=False);" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-18T19:37:29.004818Z", + "start_time": "2021-02-18T19:37:28.871390Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Uplift auc score on full data: 0.0422\n" + ] + } + ], + "source": [ + "from sklift.metrics import uplift_auc_score\n", + "\n", + "# AUUC = area under uplift curve\n", + "auuc = uplift_auc_score(y_val, uplift_ct, trmnt_val) \n", + "print(f\"Uplift auc score on full data: {auuc:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "sklift-env", + "language": "python", + "name": "sklift-env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/setup.py b/setup.py index efadef8..e8b2933 100644 --- a/setup.py +++ b/setup.py @@ -77,6 +77,7 @@ def run(self): maintainer=MAINTAINER, url=URL, packages=find_packages(exclude=["tests", "docs", "images"]), + include_package_data=True, install_requires=REQUIRED, extras_require=EXTRAS, classifiers=[ diff --git a/sklift/__init__.py b/sklift/__init__.py index 0404d81..e1424ed 100644 --- a/sklift/__init__.py +++ b/sklift/__init__.py @@ -1 +1 @@ -__version__ = '0.3.0' +__version__ = '0.3.1' diff --git 
a/sklift/datasets/datasets.py b/sklift/datasets/datasets.py index 0451482..4af27f0 100644 --- a/sklift/datasets/datasets.py +++ b/sklift/datasets/datasets.py @@ -101,7 +101,7 @@ def clear_data_dir(path=None): shutil.rmtree(path, ignore_errors=True) -def fetch_lenta(data_home=None, dest_subdir=None, download_if_missing=True, return_X_y_t=False, as_frame=True): +def fetch_lenta(data_home=None, dest_subdir=None, download_if_missing=True, return_X_y_t=False): """Load and return the Lenta dataset (classification). An uplift modeling dataset containing data about Lenta's customers grociery shopping and @@ -122,8 +122,6 @@ def fetch_lenta(data_home=None, dest_subdir=None, download_if_missing=True, retu dest_subdir (str): The name of the folder in which the dataset is stored. download_if_missing (bool): Download the data if not present. Raises an IOError if False and data is missing. return_X_y_t (bool): If True, returns (data, target, treatment) instead of a Bunch object. - as_frame (bool): If True, returns a pandas Dataframe or Series for the data, target and treatment objects - in the Bunch returned object; Bunch return object will also have a frame member. Returns: Bunch or tuple: dataset. @@ -131,7 +129,7 @@ def fetch_lenta(data_home=None, dest_subdir=None, download_if_missing=True, retu Bunch: By default dictionary-like object, with the following attributes: - * ``data`` (ndarray or DataFrame object): Dataset without target and treatment. + * ``data`` (DataFrame object): Dataset without target and treatment. * ``target`` (Series object): Column target by values. * ``treatment`` (Series object): Column treatment by values. * ``DESCR`` (str): Description of the Lenta dataset. 
@@ -144,56 +142,49 @@ def fetch_lenta(data_home=None, dest_subdir=None, download_if_missing=True, retu """ - url = 'https:/winterschool123.s3.eu-north-1.amazonaws.com/lentadataset.csv.gz' - filename = 'lentadataset.csv.gz' - + url = 'https://winterschool123.s3.eu-north-1.amazonaws.com/lentadataset.csv.gz' + filename = url.split('/')[-1] csv_path = _get_data(data_home=data_home, url=url, dest_subdir=dest_subdir, dest_filename=filename, download_if_missing=download_if_missing) + target_col = 'response_att' + treatment_col = 'group' + data = pd.read_csv(csv_path) - if as_frame: - target = data['response_att'] - treatment = data['group'] - data = data.drop(['response_att', 'group'], axis=1) - feature_names = list(data.columns) - else: - target = data[['response_att']].to_numpy() - treatment = data[['group']].to_numpy() - data = data.drop(['response_att', 'group'], axis=1) - feature_names = list(data.columns) - data = data.to_numpy() + treatment, target = data[treatment_col], data[target_col] + + data = data.drop([target_col, treatment_col], axis=1) + feature_names = list(data.columns) + + if return_X_y_t: + return data, target, treatment module_path = os.path.dirname(__file__) with open(os.path.join(module_path, 'descr', 'lenta.rst')) as rst_file: fdescr = rst_file.read() - - if return_X_y_t: - return data, target, treatment - + return Bunch(data=data, target=target, treatment=treatment, DESCR=fdescr, - feature_names=feature_names, target_name='response_att', treatment_name='group') + feature_names=feature_names, target_name=target_col, treatment_name=treatment_col) -def fetch_x5(data_home=None, dest_subdir=None, download_if_missing=True, as_frame=True): +def fetch_x5(data_home=None, dest_subdir=None, download_if_missing=True): """Load and return the X5 RetailHero dataset (classification). - The dataset contains raw retail customer purchaces, raw information about products and general info about customers. 
+ The dataset contains raw retail customer purchases, raw information about products and general info about customers. Major columns: - ``treatment_flg`` (binary): treatment/control group flag - ``target`` (binary): target - - ``customer_id`` (str): customer id aka primary key for joining + - ``customer_id`` (str): customer id - primary key for joining Read more in the :ref:`docs `. Args: data_home (str, unicode): The path to the folder where datasets are stored. dest_subdir (str, unicode): The name of the folder in which the dataset is stored. - download_if_missing (bool): Download the data if not present. Raises an IOError if False and data is missing. - as_frame (bool): If True, returns a pandas Dataframe or Series for the data, target and treatment objects - in the Bunch returned object; Bunch return object will also have a frame member. + download_if_missing (bool): Download the data if not present. Raises an IOError if False and data is missing Returns: Bunch: dataset. @@ -214,56 +205,53 @@ def fetch_x5(data_home=None, dest_subdir=None, download_if_missing=True, as_fram References: https://ods.ai/competitions/x5-retailhero-uplift-modeling/data + """ + url_train = 'https://timds.s3.eu-central-1.amazonaws.com/uplift_train.csv.gz' + file_train = url_train.split('/')[-1] + csv_train_path = _get_data(data_home=data_home, url=url_train, dest_subdir=dest_subdir, + dest_filename=file_train, + download_if_missing=download_if_missing) + train = pd.read_csv(csv_train_path) + train_features = list(train.columns) + + target_col = 'target' + treatment_col = 'treatment_flg' + + treatment, target = train[treatment_col], train[target_col] + + train = train.drop([target_col, treatment_col], axis=1) url_clients = 'https://timds.s3.eu-central-1.amazonaws.com/clients.csv.gz' - file_clients = 'clients.csv.gz' + file_clients = url_clients.split('/')[-1] csv_clients_path = _get_data(data_home=data_home, url=url_clients, dest_subdir=dest_subdir, dest_filename=file_clients, 
download_if_missing=download_if_missing) clients = pd.read_csv(csv_clients_path) - clients_names = list(clients.column) - - url_train = 'https://timds.s3.eu-central-1.amazonaws.com/uplift_train.csv.gz' - file_train = 'uplift_train.csv.gz' - csv_train_path = _get_data(data_home=data_home, url=url_train, dest_subdir=dest_subdir, - dest_filename=file_train, - download_if_missing=download_if_missing) - train = pd.read_csv(csv_train_path) - train_names = list(train.columns) + clients_features = list(clients.columns) url_purchases = 'https://timds.s3.eu-central-1.amazonaws.com/purchases.csv.gz' - file_purchases = 'purchases.csv.gz' + file_purchases = url_purchases.split('/')[-1] csv_purchases_path = _get_data(data_home=data_home, url=url_purchases, dest_subdir=dest_subdir, dest_filename=file_purchases, download_if_missing=download_if_missing) purchases = pd.read_csv(csv_purchases_path) - purchases_names = list(purchases.columns) - - if as_frame: - target = train['target'] - treatment = train['treatment_flg'] - else: - target = train[['target']].to_numpy() - treatment = train[['treatment_flg']].to_numpy() - train = train.to_numpy() - clients = clients.to_numpy() - purchases = purchases.to_numpy() + purchases_features = list(purchases.columns) data = Bunch(clients=clients, train=train, purchases=purchases) - data_names = Bunch(clients_names=clients_names, train_names=train_names, - purchases_names=purchases_names) + feature_names = Bunch(train_features=train_features, clients_features=clients_features, + purchases_features=purchases_features) module_path = os.path.dirname(__file__) with open(os.path.join(module_path, 'descr', 'x5.rst')) as rst_file: fdescr = rst_file.read() - return Bunch(data=data, target=target, treatment=treatment, DESCR=fdescr, - data_names=data_names, target_name='target', treatment_name='treatment_flg') + return Bunch(data=data, target=target, treatment=treatment, DESCR=fdescr, + feature_names=feature_names, target_name='target', 
treatment_name='treatment_flg') def fetch_criteo(target_col='visit', treatment_col='treatment', data_home=None, dest_subdir=None, - download_if_missing=True, percent10=True, return_X_y_t=False, as_frame=True): + download_if_missing=True, percent10=False, return_X_y_t=False): """Load and return the Criteo Uplift Prediction Dataset (classification). This dataset is constructed by assembling data resulting from several incrementality tests, a particular randomized @@ -280,18 +268,16 @@ def fetch_criteo(target_col='visit', treatment_col='treatment', data_home=None, Read more in the :ref:`docs `. Args: - target_col (string, 'visit' or 'conversion', default='visit'): Selects which column from dataset - will be target. - treatment_col (string,'treatment' or 'exposure' default='treatment'): Selects which column from dataset - will be treatment. + target_col (string, 'visit', 'conversion' or 'all', default='visit'): Selects which column from dataset + will be target. If 'all', return a DataFrame with all targets cols. + treatment_col (string,'treatment', 'exposure' or 'all', default='treatment'): Selects which column from dataset + will be treatment. If 'all', return a DataFrame with all treatment cols. data_home (string): Specify a download and cache folder for the datasets. dest_subdir (string): The name of the folder in which the dataset is stored. download_if_missing (bool, default=True): If False, raise an IOError if the data is not locally available instead of trying to download the data from the source site. - percent10 (bool, default=True): Whether to load only 10 percent of the data. + percent10 (bool, default=False): Whether to load only 10 percent of the data. return_X_y_t (bool, default=False): If True, returns (data, target, treatment) instead of a Bunch object. - as_frame (bool): If True, returns a pandas Dataframe or Series for the data, target and treatment objects - in the Bunch returned object; Bunch return object will also have a frame member. 
Returns: Bunch or tuple: dataset. @@ -299,13 +285,13 @@ def fetch_criteo(target_col='visit', treatment_col='treatment', data_home=None, Bunch: By default dictionary-like object, with the following attributes: - * ``data`` (ndarray or DataFrame object): Dataset without target and treatment. - * ``target`` (Series object): Column target by values. - * ``treatment`` (Series object): Column treatment by values. + * ``data`` (DataFrame object): Dataset without target and treatment. + * ``target`` (Series or DataFrame object): Column target by values. + * ``treatment`` (Series or DataFrame object): Column treatment by values. * ``DESCR`` (str): Description of the Lenta dataset. * ``feature_names`` (list): Names of the features. - * ``target_name`` (str): Name of the target. - * ``treatment_name`` (str): Name of the treatment. + * ``target_name`` (str or list): Name of the target. + * ``treatment_name`` (str or list): Name of the treatment. Tuple: tuple (data, target, treatment) if `return_X_y` is True @@ -314,67 +300,56 @@ def fetch_criteo(target_col='visit', treatment_col='treatment', data_home=None, “A Large Scale Benchmark for Uplift Modeling” Eustache Diemert, Artem Betlei, Christophe Renaudin; (Criteo AI Lab), Massih-Reza Amini (LIG, Grenoble INP) """ + treatment_cols = ['exposure', 'treatment'] + if treatment_col == 'all': + treatment_col = treatment_cols + elif treatment_col not in treatment_cols: + raise ValueError(f"treatment_col value must be in {treatment_cols + ['all']}. " + f"Got value {treatment_col}.") + + target_cols = ['visit', 'conversion'] + if target_col == 'all': + target_col = target_cols + elif target_col not in target_cols: + raise ValueError(f"target_col value must be from {target_cols + ['all']}. 
" + f"Got value {target_col}.") + if percent10: url = 'https://criteo-bucket.s3.eu-central-1.amazonaws.com/criteo10.csv.gz' - csv_path = _get_data(data_home=data_home, url=url, dest_subdir=dest_subdir, - dest_filename='criteo10.csv.gz', - download_if_missing=download_if_missing) else: url = "https://criteo-bucket.s3.eu-central-1.amazonaws.com/criteo.csv.gz" - csv_path = _get_data(data_home=data_home, url=url, dest_subdir=dest_subdir, - dest_filename='criteo.csv.gz', - download_if_missing=download_if_missing) - - if treatment_col == 'exposure': - data = pd.read_csv(csv_path, usecols=[i for i in range(12)]) - treatment = pd.read_csv(csv_path, usecols=['exposure'], dtype={'exposure': 'Int8'}) - if as_frame: - treatment = treatment['exposure'] - elif treatment_col == 'treatment': - data = pd.read_csv(csv_path, usecols=[i for i in range(12)]) - treatment = pd.read_csv(csv_path, usecols=['treatment'], dtype={'treatment': 'Int8'}) - if as_frame: - treatment = treatment['treatment'] - else: - raise ValueError(f"treatment_col value must be from {['treatment', 'exposure']}. " - f"Got value {treatment_col}.") - feature_names = list(data.columns) - if target_col == 'conversion': - target = pd.read_csv(csv_path, usecols=['conversion'], dtype={'conversion': 'Int8'}) - if as_frame: - target = target['conversion'] - elif target_col == 'visit': - target = pd.read_csv(csv_path, usecols=['visit'], dtype={'visit': 'Int8'}) - if as_frame: - target = target['visit'] - else: - raise ValueError(f"target_col value must be from {['visit', 'conversion']}. 
" - f"Got value {target_col}.") + filename = url.split('/')[-1] + csv_path = _get_data(data_home=data_home, url=url, dest_subdir=dest_subdir, + dest_filename=filename, + download_if_missing=download_if_missing) + + dtypes = { + 'exposure': 'Int8', + 'treatment': 'Int8', + 'conversion': 'Int8', + 'visit': 'Int8' + } + data = pd.read_csv(csv_path, dtype=dtypes) + treatment, target = data[treatment_col], data[target_col] + + data = data.drop(target_cols + treatment_cols, axis=1) if return_X_y_t: - if as_frame: - return data, target, treatment - else: - return data.to_numpy(), target.to_numpy(), treatment.to_numpy() - else: - target_name = target_col - treatment_name = treatment_col + return data, target, treatment + + feature_names = list(data.columns) module_path = os.path.dirname(__file__) with open(os.path.join(module_path, 'descr', 'criteo.rst')) as rst_file: fdescr = rst_file.read() - if as_frame: - return Bunch(data=data, target=target, treatment=treatment, DESCR=fdescr, feature_names=feature_names, - target_name=target_name, treatment_name=treatment_name) - else: - return Bunch(data=data.to_numpy(), target=target.to_numpy(), treatment=treatment.to_numpy(), DESCR=fdescr, - feature_names=feature_names, target_name=target_name, treatment_name=treatment_name) + return Bunch(data=data, target=target, treatment=treatment, DESCR=fdescr, feature_names=feature_names, + target_name=target_col, treatment_name=treatment_col) def fetch_hillstrom(target_col='visit', data_home=None, dest_subdir=None, download_if_missing=True, - return_X_y_t=False, as_frame=True): + return_X_y_t=False): """Load and return Kevin Hillstrom Dataset MineThatData (classification or regression). This dataset contains 64,000 customers who last purchased within twelve months. @@ -382,22 +357,20 @@ def fetch_hillstrom(target_col='visit', data_home=None, dest_subdir=None, downlo Major columns: - * ``Visit`` (binary): target. 1/0 indicator, 1 = Customer visited website in the following two weeks. 
- * ``Conversion`` (binary): target. 1/0 indicator, 1 = Customer purchased merchandise in the following two weeks. - * ``Spend`` (float): target. Actual dollars spent in the following two weeks. - * ``Segment`` (str): treatment. The e-mail campaign the customer received + * ``visit`` (binary): target. 1/0 indicator, 1 = Customer visited website in the following two weeks. + * ``conversion`` (binary): target. 1/0 indicator, 1 = Customer purchased merchandise in the following two weeks. + * ``spend`` (float): target. Actual dollars spent in the following two weeks. + * ``segment`` (str): treatment. The e-mail campaign the customer received Read more in the :ref:`docs `. Args: - target_col (string, 'visit' or 'conversion' or 'spend', default='visit'): Selects which column from dataset + target_col (string, 'visit', 'conversion', 'spend' or 'all', default='visit'): Selects which column from dataset will be target data_home (str): The path to the folder where datasets are stored. dest_subdir (str): The name of the folder in which the dataset is stored. download_if_missing (bool): Download the data if not present. Raises an IOError if False and data is missing. return_X_y_t (bool, default=False): If True, returns (data, target, treatment) instead of a Bunch object. - as_frame (bool): If True, returns a pandas Dataframe for the data, target and treatment objects - in the Bunch returned object; Bunch return object will also have a frame member. Returns: Bunch or tuple: dataset. @@ -405,12 +378,12 @@ def fetch_hillstrom(target_col='visit', data_home=None, dest_subdir=None, downlo Bunch: By default dictionary-like object, with the following attributes: - * ``data`` (ndarray or DataFrame object): Dataset without target and treatment. - * ``target`` (Series object): Column target by values. + * ``data`` (DataFrame object): Dataset without target and treatment. + * ``target`` (Series or DataFrame object): Column target by values.
* ``treatment`` (Series object): Column treatment by values. * ``DESCR`` (str): Description of the Lenta dataset. * ``feature_names`` (list): Names of the features. - * ``target_name`` (str): Name of the target. + * ``target_name`` (str or list): Name of the target. * ``treatment_name`` (str): Name of the treatment. Tuple: @@ -420,37 +393,34 @@ def fetch_hillstrom(target_col='visit', data_home=None, dest_subdir=None, downlo https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html """ + target_cols = ['visit', 'conversion', 'spend'] + if target_col == 'all': + target_col = target_cols + elif target_col not in target_cols: + raise ValueError(f"target_col value must be from {target_cols + ['all']}. " + f"Got value {target_col}.") url = 'https://hillstorm1.s3.us-east-2.amazonaws.com/hillstorm_no_indices.csv.gz' - csv_path = _get_data(data_home=data_home, - url=url, - dest_subdir=dest_subdir, - dest_filename='hillstorm_no_indices.csv.gz', - download_if_missing=download_if_missing) - - if target_col != ('visit' or 'conversion' or 'spend'): - raise ValueError(f"target_col value must be from {['visit', 'conversion', 'spend']}. 
" - f"Got value {target_col}.") + filename = url.split('/')[-1] + csv_path = _get_data(data_home=data_home, url=url, dest_subdir=dest_subdir, + dest_filename=filename, + download_if_missing=download_if_missing) + + treatment_col = 'segment' + + data = pd.read_csv(csv_path) + treatment, target = data[treatment_col], data[target_col] + + data = data.drop(target_cols + [treatment_col], axis=1) + + if return_X_y_t: + return data, target, treatment - data = pd.read_csv(csv_path, usecols=[i for i in range(8)]) feature_names = list(data.columns) - treatment = pd.read_csv(csv_path, usecols=['segment']) - target = pd.read_csv(csv_path, usecols=[target_col]) - if as_frame: - target = target[target_col] - treatment = treatment['segment'] - else: - data = data.to_numpy() - target = target.to_numpy() - treatment = treatment.to_numpy() - + module_path = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(module_path, 'descr', 'hillstrom.rst')) as rst_file: fdescr = rst_file.read() - - if return_X_y_t: - return data, target, treatment - else: - target_name = target_col - return Bunch(data=data, target=target, treatment=treatment, DESCR=fdescr, - feature_names=feature_names, target_name=target_name, treatment_name='segment') + + return Bunch(data=data, target=target, treatment=treatment, DESCR=fdescr, + feature_names=feature_names, target_name=target_col, treatment_name=treatment_col) diff --git a/sklift/metrics/metrics.py b/sklift/metrics/metrics.py index e40a913..63566ef 100644 --- a/sklift/metrics/metrics.py +++ b/sklift/metrics/metrics.py @@ -540,7 +540,8 @@ def weighted_average_uplift(y_true, uplift, treatment, strategy='overall', bins= return weighted_avg_uplift -def uplift_by_percentile(y_true, uplift, treatment, strategy='overall', bins=10, std=False, total=False): +def uplift_by_percentile(y_true, uplift, treatment, strategy='overall', + bins=10, std=False, total=False, string_percentiles=True): """Compute metrics: uplift, group size, group response rate, 
standard deviation at each percentile. Metrics in columns and percentiles in rows of pandas DataFrame: @@ -571,6 +572,7 @@ def uplift_by_percentile(y_true, uplift, treatment, strategy='overall', bins=10, The total uplift is a weighted average uplift. See :func:`.weighted_average_uplift`. The total response rate is a response rate on the full data amount. bins (int): Determines the number of bins (and the relative percentile) in the data. Default is 10. + string_percentiles (bool): type of percentiles in the index: float or string. Default is True (string). Returns: pandas.DataFrame: DataFrame where metrics are by columns and percentiles are by rows. @@ -602,6 +604,10 @@ def uplift_by_percentile(y_true, uplift, treatment, strategy='overall', bins=10, if bins >= n_samples: raise ValueError(f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}') + if not isinstance(string_percentiles, bool): + raise ValueError(f'string_percentiles flag should be bool: True or False.' 
+ f' Invalid value string_percentiles: {string_percentiles}') + y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment) response_rate_trmnt, variance_trmnt, n_trmnt = response_rate_by_percentile( @@ -613,7 +619,12 @@ def uplift_by_percentile(y_true, uplift, treatment, strategy='overall', bins=10, uplift_scores = response_rate_trmnt - response_rate_ctrl uplift_variance = variance_trmnt + variance_ctrl - percentiles = [round(p * 100 / bins, 1) for p in range(1, bins + 1)] + percentiles = [round(p * 100 / bins) for p in range(1, bins + 1)] + + if string_percentiles: + percentiles = [f"0-{percentiles[0]}"] + \ + [f"{percentiles[i]}-{percentiles[i + 1]}" for i in range(len(percentiles) - 1)] + df = pd.DataFrame({ 'percentile': percentiles, diff --git a/sklift/tests/test_datasets.py b/sklift/tests/test_datasets.py new file mode 100644 index 0000000..fe21665 --- /dev/null +++ b/sklift/tests/test_datasets.py @@ -0,0 +1,61 @@ +import pytest + +from functools import partial + +from ..datasets import ( + fetch_hillstrom, fetch_lenta, fetch_criteo +) + + +fetch_criteo10 = partial(fetch_criteo, percent10=True) + + +def check_return_X_y_t(bunch, dataset_func): + X_y_t_tuple = dataset_func(return_X_y_t=True) + assert isinstance(X_y_t_tuple, tuple) + assert X_y_t_tuple[0].shape == bunch.data.shape + assert X_y_t_tuple[1].shape == bunch.target.shape + assert X_y_t_tuple[2].shape == bunch.treatment.shape + + +@pytest.mark.parametrize( + 'target_col, target_shape', + [('visit', (64_000,)), + ('conversion', (64_000,)), + ('spend', (64_000,)), + ('all', (64_000, 3))] +) +def test_fetch_hillstrom( + target_col, target_shape +): + data = fetch_hillstrom(target_col=target_col) + assert data.data.shape == (64_000, 8) + assert data.target.shape == target_shape + assert data.treatment.shape == (64_000,) + + +@pytest.mark.parametrize( + 'target_col, target_shape', + [('visit', (1397960,)), + ('conversion', (1397960,)), + ('all', (1397960, 2))] +) 
+@pytest.mark.parametrize( + 'treatment_col, treatment_shape', + [('exposure', (1397960,)), + ('treatment', (1397960,)), + ('all', (1397960, 2))] +) +def test_fetch_criteo10( + target_col, target_shape, treatment_col, treatment_shape +): + data = fetch_criteo10(target_col=target_col, treatment_col=treatment_col) + assert data.data.shape == (1397960, 12) + assert data.target.shape == target_shape + assert data.treatment.shape == treatment_shape + + +@pytest.mark.parametrize("fetch_func", [fetch_hillstrom, fetch_criteo10, fetch_lenta]) +def test_return_X_y_t(fetch_func): + data = fetch_func() + check_return_X_y_t(data, fetch_func) diff --git a/sklift/viz/base.py b/sklift/viz/base.py index 14340ee..6da2cc1 100644 --- a/sklift/viz/base.py +++ b/sklift/viz/base.py @@ -146,7 +146,8 @@ def plot_qini_curve(y_true, uplift, treatment, random=True, perfect=True, negati return ax -def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall', kind='line', bins=10): +def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall', + kind='line', bins=10, string_percentiles=True): """Plot uplift score, treatment response rate and control response rate at each percentile. Treatment response rate ia a target mean in the treatment group. @@ -175,6 +176,7 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall', kin Generates a traditional bar-style plot. bins (int): Determines Π° number of bins (and the relative percentile) in the test data. Default is 10. + string_percentiles (bool): type of xticks: float or string to plot. Default is True (string). Returns: Object that stores computed values. @@ -203,8 +205,12 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall', kin raise ValueError( f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}') + if not isinstance(string_percentiles, bool): + raise ValueError(f'string_percentiles flag should be bool: True or False.' 
+ f' Invalid value string_percentiles: {string_percentiles}') + df = uplift_by_percentile(y_true, uplift, treatment, strategy=strategy, - std=True, total=True, bins=bins) + std=True, total=True, bins=bins, string_percentiles=False) percentiles = df.index[:bins].values.astype(float) @@ -219,7 +225,8 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall', kin uplift_weighted_avg = df.loc['total', 'uplift'] - check_consistent_length(percentiles, response_rate_trmnt, response_rate_ctrl, uplift_score, + check_consistent_length(percentiles, response_rate_trmnt, + response_rate_ctrl, uplift_score, std_trmnt, std_ctrl, std_uplift) if kind == 'line': @@ -235,7 +242,15 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall', kin if np.amin(uplift_score) < 0: axes.axhline(y=0, color='black', linewidth=1) - axes.set_xticks(percentiles) + + if string_percentiles: # string percentiles for plotting + percentiles_str = [f"0-{percentiles[0]:.0f}"] + \ + [f"{percentiles[i]:.0f}-{percentiles[i + 1]:.0f}" for i in range(len(percentiles) - 1)] + axes.set_xticks(percentiles) + axes.set_xticklabels(percentiles_str, rotation=45) + else: + axes.set_xticks(percentiles) + axes.legend(loc='upper right') axes.set_title( f'Uplift by percentile\nweighted average uplift = {uplift_weighted_avg:.4f}') @@ -245,8 +260,7 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall', kin else: # kind == 'bar' delta = percentiles[0] - fig, axes = plt.subplots(ncols=1, nrows=2, figsize=( - 8, 6), sharex=True, sharey=True) + fig, axes = plt.subplots(ncols=1, nrows=2, figsize=(8, 6), sharex=True, sharey=True) fig.text(0.04, 0.5, 'Uplift = treatment response rate - control response rate', va='center', ha='center', rotation='vertical') @@ -263,7 +277,15 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy='overall', kin axes[0].set_title( f'Uplift by percentile\nweighted average uplift = {uplift_weighted_avg:.4f}') - 
axes[1].set_xticks(percentiles) + if string_percentiles: # string percentiles for plotting + percentiles_str = [f"0-{percentiles[0]:.0f}"] + \ + [f"{percentiles[i]:.0f}-{percentiles[i + 1]:.0f}" for i in range(len(percentiles) - 1)] + axes[1].set_xticks(percentiles) + axes[1].set_xticklabels(percentiles_str, rotation=45) + + else: + axes[1].set_xticks(percentiles) + axes[1].legend(loc='upper right') axes[1].axhline(y=0, color='black', linewidth=1) axes[1].set_xlabel('Percentile')