From 8bb0b9318970cab9cd81dd3d16de53e7f467e4f8 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Fri, 13 Oct 2023 09:49:50 +0400 Subject: [PATCH] docs(python): clarify that median is equivalent to the 50% percentile shown in `describe` metrics (#11694) --- py-polars/polars/dataframe/frame.py | 6 +++++- py-polars/polars/series/series.py | 6 +++++- py-polars/polars/utils/various.py | 8 ++++++-- py-polars/tests/unit/dataframe/test_df.py | 3 +-- py-polars/tests/unit/utils/test_utils.py | 23 +++++++++++++++-------- 5 files changed, 32 insertions(+), 14 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 312594fac845..58e7b23a3412 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -4079,7 +4079,7 @@ def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]: return None def describe( - self, percentiles: Sequence[float] | float | None = (0.25, 0.75) + self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75) ) -> Self: """ Summary statistics for a DataFrame. @@ -4090,6 +4090,10 @@ def describe( One or more percentiles to include in the summary statistics. All values must be in the range `[0, 1]`. + Notes + ----- + The median is included by default as the 50% percentile. + See Also -------- glimpse diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 250e3b97ce86..b6f368e5d443 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -1501,7 +1501,7 @@ def to_frame(self, name: str | None = None) -> DataFrame: return wrap_df(PyDataFrame([self._s])) def describe( - self, percentiles: Sequence[float] | float | None = (0.25, 0.75) + self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75) ) -> DataFrame: """ Quick summary statistics of a series. @@ -1515,6 +1515,10 @@ def describe( One or more percentiles to include in the summary statistics (if the series has a numeric dtype). All values must be in the range `[0, 1]`. + Notes + ----- + The median is included by default as the 50% percentile. + Returns ------- DataFrame diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py index 188d2e3293fb..e535d7d0fef8 100644 --- a/py-polars/polars/utils/various.py +++ b/py-polars/polars/utils/various.py @@ -473,7 +473,9 @@ def in_terminal_that_supports_colour() -> bool: return False -def parse_percentiles(percentiles: Sequence[float] | float | None) -> Sequence[float]: +def parse_percentiles( + percentiles: Sequence[float] | float | None, *, inject_median: bool = False +) -> Sequence[float]: """ Transforms raw percentiles into our preferred format, adding the 50th percentile. @@ -490,7 +492,9 @@ def parse_percentiles(percentiles: Sequence[float] | float | None) -> Sequence[f sub_50_percentiles = sorted(p for p in percentiles if p < 0.5) at_or_above_50_percentiles = sorted(p for p in percentiles if p >= 0.5) - if not at_or_above_50_percentiles or at_or_above_50_percentiles[0] != 0.5: + if inject_median and ( + not at_or_above_50_percentiles or at_or_above_50_percentiles[0] != 0.5 + ): at_or_above_50_percentiles = [0.5, *at_or_above_50_percentiles] return [*sub_50_percentiles, *at_or_above_50_percentiles] diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index ed181ea6ca51..ceadea5a8d87 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -1008,11 +1008,10 @@ def test_describe() -> None: ("mean", 1.3333333333333333, None, None), ("std", 0.5773502691896257, None, None), ("min", 1.0, None, None), - ("50%", 1.0, None, None), ("max", 2.0, None, None), ] - described = df.describe(percentiles=(0.2, 0.4, 0.6, 0.8)) + described = df.describe(percentiles=(0.2, 0.4, 0.5, 0.6, 0.8)) assert described.schema == { "describe": pl.Utf8, "numerical": pl.Float64, diff --git a/py-polars/tests/unit/utils/test_utils.py b/py-polars/tests/unit/utils/test_utils.py index 25f3aeff6007..623a011ae202 100644 --- a/py-polars/tests/unit/utils/test_utils.py +++ b/py-polars/tests/unit/utils/test_utils.py @@ -121,21 +121,28 @@ def test_in_notebook() -> None: @pytest.mark.parametrize( - ("percentiles", "expected"), + ("percentiles", "expected", "inject_median"), [ - (None, [0.5]), - (0.2, [0.2, 0.5]), - (0.5, [0.5]), - ((0.25, 0.75), [0.25, 0.5, 0.75]), + (None, [0.5], True), + (0.2, [0.2, 0.5], True), + (0.5, [0.5], True), + ((0.25, 0.75), [0.25, 0.5, 0.75], True), # Undocumented effect - percentiles get sorted. # Can be changed, this serves as documentation of current behaviour. - ((0.6, 0.3), [0.3, 0.5, 0.6]), + ((0.6, 0.3), [0.3, 0.5, 0.6], True), + (None, [], False), + (0.2, [0.2], False), + (0.5, [0.5], False), + ((0.25, 0.75), [0.25, 0.75], False), + ((0.6, 0.3), [0.3, 0.6], False), ], ) def test_parse_percentiles( - percentiles: Sequence[float] | float | None, expected: Sequence[float] + percentiles: Sequence[float] | float | None, + expected: Sequence[float], + inject_median: bool, ) -> None: - assert parse_percentiles(percentiles) == expected + assert parse_percentiles(percentiles, inject_median=inject_median) == expected @pytest.mark.parametrize(("percentiles"), [(1.1), ([-0.1])])