Skip to content

Commit

Permalink
Use pandas to format numeric summary stats more nicely
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Apr 22, 2024
1 parent ea87358 commit 4594676
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -905,20 +905,22 @@ def _prof_summary_stats(self, column_index: int):

# Builds the numeric summary statistics (min, max, mean, median, stdev) for a
# single column and wraps them in a ColumnSummaryStats of display type Number.
#
# NOTE(review): this is a flattened diff rendering (no +/- markers, no
# indentation). The hunk header (-905,20 +905,22) implies 10 removed and 12
# added lines, so the pre-change and post-change versions of the body appear
# together below; the "pre-change"/"post-change" labels are inferred from that
# arithmetic -- confirm against the actual commit.
@staticmethod
def _summarize_number(col: "pd.Series"):
# Pre-change: each statistic is kept as a raw value and stringified with
# str() when the result object is built.
min_value = col.min()
max_value = col.max()
mean = col.mean()
median = col.median()
stdev = col.std()
# Post-change: imported locally, inside the function body.
# NOTE(review): this is a pandas-internal module, not public API -- confirm
# it is stable across the pandas versions that must be supported.
import pandas.io.formats.format as fmt

# min/max are placed in a Series that reuses the column's dtype, while
# mean/median/std go into a Series whose dtype is inferred.
# Presumably this keeps integer columns' min/max formatted as integers -- verify.
minmax = pd_.Series([col.min(), col.max()], dtype=col.dtype)
numeric_stats = pd_.Series([col.mean(), col.median(), col.std()])

# Each group is formatted as one array and unpacked positionally, so the
# order here must match the order used to build the Series above.
min_value, max_value = fmt.format_array(minmax.to_numpy(), None, leading_space=False)
mean, median, stdev = fmt.format_array(numeric_stats.to_numpy(), None, leading_space=False)

return ColumnSummaryStats(
type_display=ColumnDisplayType.Number,
number_stats=SummaryStatsNumber(
# Pre-change keyword arguments: raw values converted with str().
min_value=str(min_value),
max_value=str(max_value),
mean=str(mean),
median=str(median),
stdev=str(stdev),
# Post-change keyword arguments: the format_array results are passed
# through without a str() call.
min_value=min_value,
max_value=max_value,
mean=mean,
median=median,
stdev=stdev,
),
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1380,7 +1380,7 @@ def _assert_close(expected, actual):

# Checks every expected numeric statistic against the reported value for the
# same attribute using _assert_close.
#
# NOTE(review): flattened diff rendering -- the hunk header (-1380,7 +1380,7)
# implies one removed and one added line, so the two _assert_close calls below
# appear to be the pre-change and post-change versions of a single statement
# rather than two consecutive checks. Confirm against the actual commit.
def _assert_numeric_stats_equal(expected, actual):
for attr, value in expected.items():
# Pre-change (inferred): the expected value is passed through unconverted.
_assert_close(value, float(actual.get(attr)))
# Post-change (inferred): both sides are converted with float() before the
# comparison.
_assert_close(float(value), float(actual.get(attr)))


def _assert_string_stats_equal(expected, actual):
Expand All @@ -1394,6 +1394,8 @@ def _assert_boolean_stats_equal(expected, actual):


def test_pandas_profile_summary_stats(dxf: DataExplorerFixture):
import pandas.io.formats.format as fmt

arr = np.random.standard_normal(100)
arr_with_nulls = arr.copy()
arr_with_nulls[::10] = np.nan
Expand All @@ -1420,27 +1422,30 @@ def test_pandas_profile_summary_stats(dxf: DataExplorerFixture):
)
dxf.register_table("df1", df1)

def _format_float(x):
return fmt.format_array(np.array([x], dtype="float64"), None, leading_space=False)[0]

cases = [
(
"df1",
0,
{
"min_value": arr.min(),
"max_value": arr.max(),
"mean": df1["a"].mean(),
"stdev": df1["a"].std(),
"median": df1["a"].median(),
"min_value": _format_float(arr.min()),
"max_value": _format_float(arr.max()),
"mean": _format_float(df1["a"].mean()),
"stdev": _format_float(df1["a"].std()),
"median": _format_float(df1["a"].median()),
},
),
(
"df1",
1,
{
"min_value": df1["b"].min(),
"max_value": df1["b"].max(),
"mean": df1["b"].mean(),
"stdev": df1["b"].std(),
"median": df1["b"].median(),
"min_value": _format_float(df1["b"].min()),
"max_value": _format_float(df1["b"].max()),
"mean": _format_float(df1["b"].mean()),
"stdev": _format_float(df1["b"].std()),
"median": _format_float(df1["b"].median()),
},
),
(
Expand Down

0 comments on commit 4594676

Please sign in to comment.