Skip to content

Commit

Permalink
fix: Incorrect filter on categorical columns from parquet files (#17950)
Browse files Browse the repository at this point in the history
  • Loading branch information
nameexhaustion authored Jul 31, 2024
1 parent f38b6aa commit 4275808
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 1 deletion.
3 changes: 2 additions & 1 deletion crates/polars-io/src/predicates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,8 @@ impl ColumnStats {

/// Returns whether the [`DataType`] supports minimum/maximum operations.
fn use_min_max(dtype: &DataType) -> bool {
- dtype.to_physical().is_numeric()
+ dtype.is_numeric()
+ || dtype.is_temporal()
|| matches!(
dtype,
DataType::String | DataType::Binary | DataType::Boolean
Expand Down
25 changes: 25 additions & 0 deletions py-polars/tests/unit/io/test_lazy_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,3 +450,28 @@ def test_parquet_schema_mismatch_panic_17067(tmp_path: Path, streaming: bool) ->

with pytest.raises(pl.exceptions.SchemaError):
pl.scan_parquet(tmp_path).collect(streaming=streaming)


@pytest.mark.write_disk()
def test_predicate_push_down_categorical_17744(tmp_path: Path) -> None:
    """Filtering a lazily scanned parquet file on a categorical column.

    Writes a three-row frame whose ``ccy`` column is a lexically ordered
    categorical, scans it back with ``scan_parquet`` and checks that both an
    equality predicate and an ``is_in`` predicate on ``ccy`` keep exactly the
    first row (``"USD"``).
    """

    def categoricals_as_strings(frame):
        # Works for both DataFrame and LazyFrame; results are compared on the
        # string representation of the categorical column.
        return frame.with_columns(pl.col(pl.Categorical).cast(pl.String))

    parquet_path = tmp_path / "1"

    source = pl.DataFrame(
        data={"n": [1, 2, 3], "ccy": ["USD", "JPY", "EUR"]},
        schema_overrides={"ccy": pl.Categorical("lexical")},
    )
    source.write_parquet(parquet_path)

    # "USD" is the first row of the source frame, so head(1) is the expected
    # outcome of every predicate below.
    expected = categoricals_as_strings(source.head(1))

    scanned = pl.scan_parquet(parquet_path)

    predicates = [pl.col("ccy") == "USD", pl.col("ccy").is_in(["USD"])]
    for predicate in predicates:
        filtered = categoricals_as_strings(scanned.filter(predicate)).collect()
        assert_frame_equal(filtered, expected)

0 comments on commit 4275808

Please sign in to comment.