From 5a0c803d1560eef54c1435843bb491fad17aac0d Mon Sep 17 00:00:00 2001
From: Ritchie Vink
Date: Thu, 6 Jun 2024 12:46:09 +0200
Subject: [PATCH] fix: Fix boolean distinct (#16765)

---
 crates/polars-compute/src/distinct_count.rs            | 13 +++++++++++--
 crates/polars-parquet/src/arrow/write/boolean/basic.rs |  6 +++++-
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/crates/polars-compute/src/distinct_count.rs b/crates/polars-compute/src/distinct_count.rs
index 2587766706a3..9d621eaa3ea5 100644
--- a/crates/polars-compute/src/distinct_count.rs
+++ b/crates/polars-compute/src/distinct_count.rs
@@ -12,7 +12,16 @@ impl DistinctCountKernel for BooleanArray {
             return 0;
         }
 
-        let unset_bits = self.values().unset_bits();
-        2 - usize::from(unset_bits == 0 || unset_bits == self.values().len())
+        if self.null_count() == 0 {
+            let unset_bits = self.values().unset_bits();
+            2 - usize::from(unset_bits == 0 || unset_bits == self.values().len())
+        } else {
+            // Bits under null slots are arbitrary: only count bits that are also valid.
+            let valid_values = self.values() & self.validity().unwrap();
+            let true_count = valid_values.len() - valid_values.unset_bits();
+            let false_count = self.len() - self.null_count() - true_count;
+            // Null is one distinct value; add one each for `true` and `false` if present.
+            1 + usize::from(true_count > 0) + usize::from(false_count > 0)
+        }
     }
 }
diff --git a/crates/polars-parquet/src/arrow/write/boolean/basic.rs b/crates/polars-parquet/src/arrow/write/boolean/basic.rs
index c51880ede2d6..f4ba33d785c7 100644
--- a/crates/polars-parquet/src/arrow/write/boolean/basic.rs
+++ b/crates/polars-parquet/src/arrow/write/boolean/basic.rs
@@ -91,7 +91,11 @@ pub(super) fn build_statistics(
         null_count: options.null_count.then(|| array.null_count() as i64),
         distinct_count: options
             .distinct_count
-            .then(|| array.distinct_count().try_into().ok())
+            .then(|| {
+                (array.distinct_count() - ((array.null_count() > 0) as usize))
+                    .try_into()
+                    .ok()
+            })
             .flatten(),
         max_value: options
             .max_value