Skip to content

Commit

Permalink
Refine comments on rows_scanned vs rows_read, and add check if row_group_start gt zero when initializing rows_scanned
Browse files Browse the repository at this point in the history
  • Loading branch information
brifitz committed Jan 1, 2025
1 parent 92d59cc commit ca87683
Showing 1 changed file with 15 additions and 5 deletions.
20 changes: 15 additions & 5 deletions crates/polars-io/src/parquet/read/read_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -689,11 +689,21 @@ fn rg_to_dfs_par_over_rg(
.sum();
let slice_end = slice.0 + slice.1;

// we distinguish between the number of rows scanned and the number of rows actually
// read as these values can differ when the slice pushdown optimization is used
let mut rows_scanned: IdxSize = (0..row_group_start)
.map(|i| file_metadata.row_groups[i].num_rows() as IdxSize)
.sum();
// rows_scanned is the number of rows that have been scanned so far when checking for overlap with the slice.
// rows_read is the number of rows found to overlap with the slice, and thus the number of rows that will be
// read into a dataframe.
let mut rows_scanned: IdxSize;

if row_group_start > 0 {
// In the case of async reads, we need to account for the fact that row_group_start may be greater than
// zero due to earlier processing.
// For details, see: https://github.com/pola-rs/polars/pull/20508#discussion_r1900165649
rows_scanned = (0..row_group_start)
.map(|i| file_metadata.row_groups[i].num_rows() as IdxSize)
.sum();
} else {
rows_scanned = 0;
}

for i in row_group_start..row_group_end {
let row_count_start = rows_scanned;
Expand Down

0 comments on commit ca87683

Please sign in to comment.