Skip to content

Commit

Permalink
fix: Parquet prefiltered with projection pushdown (#18714)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Sep 12, 2024
1 parent ca383a0 commit 0e5e554
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 0 deletions.
10 changes: 10 additions & 0 deletions crates/polars-io/src/parquet/read/read_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -266,11 +266,21 @@ fn rg_to_dfs_prefiltered(
let num_live_columns = live_variables.len();
let num_dead_columns = projection.len() - num_live_columns;

// @NOTE: This is probably already sorted, but just to be sure.
let mut projection_sorted = projection.to_vec();
projection_sorted.sort();

// We create two look-up tables that map index offsets into the live and dead sets onto
// column indexes of the schema.
let mut live_idx_to_col_idx = Vec::with_capacity(num_live_columns);
let mut dead_idx_to_col_idx = Vec::with_capacity(num_dead_columns);
let mut offset = 0;
for (i, field) in schema.iter_values().enumerate() {
if projection_sorted.get(offset).copied() != Some(i) {
continue;
}

offset += 1;
if live_variables.contains(&field.name[..]) {
live_idx_to_col_idx.push(i);
} else {
Expand Down
13 changes: 13 additions & 0 deletions py-polars/tests/unit/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1904,3 +1904,16 @@ def test_write_binary_open_file(tmp_path: Path) -> None:

out = pl.read_parquet(path)
assert_frame_equal(out, df)


def test_prefilter_with_projection() -> None:
    """Regression test for #18714: prefiltered Parquet scan plus projection.

    The file holds two columns (``a`` and ``b``), while the query filters on
    and selects only ``a``, so ``b`` is left out of the projection. The scan
    must complete and return exactly the selected column with the matching row.
    """
    f = io.BytesIO()
    pl.DataFrame({"a": [1], "b": [2]}).write_parquet(f)

    # Rewind so the scan reads the buffer from the start.
    f.seek(0)
    out = (
        pl.scan_parquet(f, parallel="prefiltered")
        .filter(pl.col.a == 1)
        .select(pl.col.a)
        .collect()
    )

    # Check the result, not just that the scan ran: only column `a` and its
    # single matching row should come back.
    assert_frame_equal(out, pl.DataFrame({"a": [1]}))

0 comments on commit 0e5e554

Please sign in to comment.