docs(python): clarify that median is equivalent to the 50% percentile shown in `describe` metrics (#11694)
pola-rs · Oct 13, 2023 · 8bb0b93 · 8bb0b93
1 parent a94ea67
commit 8bb0b93
Show file tree

Hide file tree

Showing 5 changed files with 32 additions and 14 deletions.
diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
@@ -4079,7 +4079,7 @@ def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]:
         return None
 
     def describe(
-        self, percentiles: Sequence[float] | float | None = (0.25, 0.75)
+        self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75)
     ) -> Self:
         """
         Summary statistics for a DataFrame.
@@ -4090,6 +4090,10 @@ def describe(
             One or more percentiles to include in the summary statistics.
             All values must be in the range `[0, 1]`.
 
+        Notes
+        -----
+        The median is included by default as the 50% percentile.
+
         See Also
         --------
         glimpse

diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py
@@ -1501,7 +1501,7 @@ def to_frame(self, name: str | None = None) -> DataFrame:
         return wrap_df(PyDataFrame([self._s]))
 
     def describe(
-        self, percentiles: Sequence[float] | float | None = (0.25, 0.75)
+        self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75)
     ) -> DataFrame:
         """
         Quick summary statistics of a series.
@@ -1515,6 +1515,10 @@ def describe(
             One or more percentiles to include in the summary statistics (if the
             series has a numeric dtype). All values must be in the range `[0, 1]`.
 
+        Notes
+        -----
+        The median is included by default as the 50% percentile.
+
         Returns
         -------
         DataFrame

diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py
@@ -473,7 +473,9 @@ def in_terminal_that_supports_colour() -> bool:
     return False
 
 
-def parse_percentiles(percentiles: Sequence[float] | float | None) -> Sequence[float]:
+def parse_percentiles(
+    percentiles: Sequence[float] | float | None, *, inject_median: bool = False
+) -> Sequence[float]:
     """
     Transforms raw percentiles into our preferred format, adding the 50th percentile.
 
@@ -490,7 +492,9 @@ def parse_percentiles(percentiles: Sequence[float] | float | None) -> Sequence[f
     sub_50_percentiles = sorted(p for p in percentiles if p < 0.5)
     at_or_above_50_percentiles = sorted(p for p in percentiles if p >= 0.5)
 
-    if not at_or_above_50_percentiles or at_or_above_50_percentiles[0] != 0.5:
+    if inject_median and (
+        not at_or_above_50_percentiles or at_or_above_50_percentiles[0] != 0.5
+    ):
         at_or_above_50_percentiles = [0.5, *at_or_above_50_percentiles]
 
     return [*sub_50_percentiles, *at_or_above_50_percentiles]
diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py
@@ -1008,11 +1008,10 @@ def test_describe() -> None:
             ("mean", 1.3333333333333333, None, None),
             ("std", 0.5773502691896257, None, None),
             ("min", 1.0, None, None),
-            ("50%", 1.0, None, None),
             ("max", 2.0, None, None),
         ]
 
-    described = df.describe(percentiles=(0.2, 0.4, 0.6, 0.8))
+    described = df.describe(percentiles=(0.2, 0.4, 0.5, 0.6, 0.8))
     assert described.schema == {
         "describe": pl.Utf8,
         "numerical": pl.Float64,

diff --git a/py-polars/tests/unit/utils/test_utils.py b/py-polars/tests/unit/utils/test_utils.py
@@ -121,21 +121,28 @@ def test_in_notebook() -> None:
 
 
 @pytest.mark.parametrize(
-    ("percentiles", "expected"),
+    ("percentiles", "expected", "inject_median"),
     [
-        (None, [0.5]),
-        (0.2, [0.2, 0.5]),
-        (0.5, [0.5]),
-        ((0.25, 0.75), [0.25, 0.5, 0.75]),
+        (None, [0.5], True),
+        (0.2, [0.2, 0.5], True),
+        (0.5, [0.5], True),
+        ((0.25, 0.75), [0.25, 0.5, 0.75], True),
         # Undocumented effect - percentiles get sorted.
         # Can be changed, this serves as documentation of current behaviour.
-        ((0.6, 0.3), [0.3, 0.5, 0.6]),
+        ((0.6, 0.3), [0.3, 0.5, 0.6], True),
+        (None, [], False),
+        (0.2, [0.2], False),
+        (0.5, [0.5], False),
+        ((0.25, 0.75), [0.25, 0.75], False),
+        ((0.6, 0.3), [0.3, 0.6], False),
     ],
 )
 def test_parse_percentiles(
-    percentiles: Sequence[float] | float | None, expected: Sequence[float]
+    percentiles: Sequence[float] | float | None,
+    expected: Sequence[float],
+    inject_median: bool,
 ) -> None:
-    assert parse_percentiles(percentiles) == expected
+    assert parse_percentiles(percentiles, inject_median=inject_median) == expected
 
 
 @pytest.mark.parametrize(("percentiles"), [(1.1), ([-0.1])])