Skip to content

Commit

Permalink
docs(python): clarify that median is equivalent to the 50% percentile shown in `describe` metrics (#11694)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie authored Oct 13, 2023
1 parent a94ea67 commit 8bb0b93
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 14 deletions.
6 changes: 5 additions & 1 deletion py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4079,7 +4079,7 @@ def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]:
return None

def describe(
self, percentiles: Sequence[float] | float | None = (0.25, 0.75)
self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75)
) -> Self:
"""
Summary statistics for a DataFrame.
Expand All @@ -4090,6 +4090,10 @@ def describe(
One or more percentiles to include in the summary statistics.
All values must be in the range `[0, 1]`.
Notes
-----
The median is included by default as the 50% percentile.
See Also
--------
glimpse
Expand Down
6 changes: 5 additions & 1 deletion py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1501,7 +1501,7 @@ def to_frame(self, name: str | None = None) -> DataFrame:
return wrap_df(PyDataFrame([self._s]))

def describe(
self, percentiles: Sequence[float] | float | None = (0.25, 0.75)
self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75)
) -> DataFrame:
"""
Quick summary statistics of a series.
Expand All @@ -1515,6 +1515,10 @@ def describe(
One or more percentiles to include in the summary statistics (if the
series has a numeric dtype). All values must be in the range `[0, 1]`.
Notes
-----
The median is included by default as the 50% percentile.
Returns
-------
DataFrame
Expand Down
8 changes: 6 additions & 2 deletions py-polars/polars/utils/various.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,9 @@ def in_terminal_that_supports_colour() -> bool:
return False


def parse_percentiles(percentiles: Sequence[float] | float | None) -> Sequence[float]:
def parse_percentiles(
percentiles: Sequence[float] | float | None, *, inject_median: bool = False
) -> Sequence[float]:
"""
Transforms raw percentiles into our preferred format, adding the 50th percentile when `inject_median` is set.
Expand All @@ -490,7 +492,9 @@ def parse_percentiles(percentiles: Sequence[float] | float | None) -> Sequence[f
sub_50_percentiles = sorted(p for p in percentiles if p < 0.5)
at_or_above_50_percentiles = sorted(p for p in percentiles if p >= 0.5)

if not at_or_above_50_percentiles or at_or_above_50_percentiles[0] != 0.5:
if inject_median and (
not at_or_above_50_percentiles or at_or_above_50_percentiles[0] != 0.5
):
at_or_above_50_percentiles = [0.5, *at_or_above_50_percentiles]

return [*sub_50_percentiles, *at_or_above_50_percentiles]
3 changes: 1 addition & 2 deletions py-polars/tests/unit/dataframe/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -1008,11 +1008,10 @@ def test_describe() -> None:
("mean", 1.3333333333333333, None, None),
("std", 0.5773502691896257, None, None),
("min", 1.0, None, None),
("50%", 1.0, None, None),
("max", 2.0, None, None),
]

described = df.describe(percentiles=(0.2, 0.4, 0.6, 0.8))
described = df.describe(percentiles=(0.2, 0.4, 0.5, 0.6, 0.8))
assert described.schema == {
"describe": pl.Utf8,
"numerical": pl.Float64,
Expand Down
23 changes: 15 additions & 8 deletions py-polars/tests/unit/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,21 +121,28 @@ def test_in_notebook() -> None:


@pytest.mark.parametrize(
("percentiles", "expected"),
("percentiles", "expected", "inject_median"),
[
(None, [0.5]),
(0.2, [0.2, 0.5]),
(0.5, [0.5]),
((0.25, 0.75), [0.25, 0.5, 0.75]),
(None, [0.5], True),
(0.2, [0.2, 0.5], True),
(0.5, [0.5], True),
((0.25, 0.75), [0.25, 0.5, 0.75], True),
# Undocumented effect - percentiles get sorted.
# Can be changed, this serves as documentation of current behaviour.
((0.6, 0.3), [0.3, 0.5, 0.6]),
((0.6, 0.3), [0.3, 0.5, 0.6], True),
(None, [], False),
(0.2, [0.2], False),
(0.5, [0.5], False),
((0.25, 0.75), [0.25, 0.75], False),
((0.6, 0.3), [0.3, 0.6], False),
],
)
def test_parse_percentiles(
percentiles: Sequence[float] | float | None, expected: Sequence[float]
percentiles: Sequence[float] | float | None,
expected: Sequence[float],
inject_median: bool,
) -> None:
assert parse_percentiles(percentiles) == expected
assert parse_percentiles(percentiles, inject_median=inject_median) == expected


@pytest.mark.parametrize(("percentiles"), [(1.1), ([-0.1])])
Expand Down

0 comments on commit 8bb0b93

Please sign in to comment.