Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Fix more global categorical issues #20547

Merged
merged 3 commits into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ jobs:
const previousSizeMB = previousSize !== 'Unknown' ? (previousSize / 1024 / 1024).toFixed(4) : 'Unknown';
const currentSizeMB = currentSize !== 'Unknown' ? (currentSize / 1024 / 1024).toFixed(4) : 'Unknown';

let commentBody = `The previous wheel size was **${previousSizeMB} MB**.\nThe current wheel size after this PR is **${currentSizeMB} MB**.`;
let commentBody = `The uncompressed binary size was **${previousSizeMB} MB**.\nThe uncompressed binary size after this PR is **${currentSizeMB} MB**.`;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

driveby


// Calculate percentage increase if both sizes are available
if (previousSize !== 'Unknown' && currentSize !== '') {
Expand Down
30 changes: 23 additions & 7 deletions crates/polars-core/src/chunked_array/comparison/categorical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -374,13 +374,29 @@ where
// Apply comparison on categories map and then do a lookup
let bitmap = str_single_compare_function(lhs.get_rev_map().get_categories(), rhs);

Ok(
BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map(|opt_idx| {
// SAFETY: indexing into bitmap with same length as original array
opt_idx.map(|idx| unsafe { bitmap.get_bit_unchecked(idx as usize) })
}))
.with_name(lhs.name().clone()),
)
let mask = match lhs.get_rev_map().as_ref() {
RevMapping::Local(_, _) => {
BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map(
|opt_idx| {
// SAFETY: indexing into bitmap with same length as original array
opt_idx.map(|idx| unsafe { bitmap.get_bit_unchecked(idx as usize) })
},
))
},
RevMapping::Global(idx_map, _, _) => {
BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map(
|opt_idx| {
// SAFETY: indexing into bitmap with same length as original array
opt_idx.map(|idx| unsafe {
let idx = *idx_map.get(&idx).unwrap();
bitmap.get_bit_unchecked(idx as usize)
})
},
))
},
};

Ok(mask.with_name(lhs.name().clone()))
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use polars_compute::unique::{DictionaryRangedUniqueState, RangedUniqueKernel};

use super::*;

impl CategoricalChunked {
Expand Down Expand Up @@ -41,18 +39,7 @@ impl CategoricalChunked {
Ok(out)
}
} else {
let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed());
for chunk in self.physical().downcast_iter() {
state.key_state().append(chunk);
}
let (_, unique, _) = state.finalize_unique().take();
let ca = unsafe {
UInt32Chunked::from_chunks_and_dtype_unchecked(
self.physical().name().clone(),
vec![unique.to_boxed()],
DataType::UInt32,
)
};
let ca = self.physical().unique()?;
// SAFETY:
// we only removed some indexes so we are still in bounds
unsafe {
Expand All @@ -70,12 +57,7 @@ impl CategoricalChunked {
if self._can_fast_unique() {
Ok(self.get_rev_map().len())
} else {
let cat_map = self.get_rev_map();
let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed());
for chunk in self.physical().downcast_iter() {
state.key_state().append(chunk);
}
Ok(state.finalize_n_unique())
self.physical().n_unique()
}
}

Expand Down
21 changes: 20 additions & 1 deletion py-polars/tests/unit/datatypes/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -904,7 +904,7 @@ def test_perfect_group_by_19950() -> None:
def test_categorical_unique() -> None:
s = pl.Series(["a", "b", None], dtype=pl.Categorical)
assert s.n_unique() == 3
assert s.unique().to_list() == ["a", "b", None]
assert s.unique().sort().to_list() == [None, "a", "b"]


@StringCache()
Expand All @@ -925,3 +925,22 @@ def test_categorical_unique_20539() -> None:
"unique": [["a", "b"], ["b", "c"], ["c"]],
"unique_with_order": [["a", "b"], ["b", "c"], ["c"]],
}


@StringCache()
@pytest.mark.may_fail_auto_streaming
def test_categorical_prefill() -> None:
# https://github.com/pola-rs/polars/pull/20547#issuecomment-2569473443
# prefill cache
pl.Series(["aaa", "bbb", "ccc"], dtype=pl.Categorical) # pre-fill cache

# test_compare_categorical_single
assert (pl.Series(["a"], dtype=pl.Categorical) < "a").to_list() == [False]

# test_unique_categorical
a = pl.Series(["a"], dtype=pl.Categorical)
assert a.unique().to_list() == ["a"]

s = pl.Series(["1", "2", "3"], dtype=pl.Categorical)
s = s.filter([True, False, True])
assert s.n_unique() == 2
Loading