Skip to content

Commit

Permalink
Merge branch 'main' into user-guide-ml-update
Browse files Browse the repository at this point in the history
  • Loading branch information
Liam Brannigan authored and Liam Brannigan committed Jan 7, 2025
2 parents 26a50cc + 72cd66a commit dbe3334
Show file tree
Hide file tree
Showing 144 changed files with 660 additions and 427 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/benchmark-remote.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@ jobs:

- name: Clone Polars-benchmark
run: |
git clone --depth=1 https://github.com/pola-rs/polars-benchmark.git
git clone --depth=1 https://github.com/pola-rs/polars-benchmark.git
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
python-version: '3.13'

- name: Create virtual environment
run: |
Expand All @@ -46,7 +46,7 @@ jobs:
# Install typing-extensions separately whilst the `--extra-index-url` in `requirements-ci.txt`
# doesn't have an up-to-date typing-extensions, see
# https://github.com/astral-sh/uv/issues/6028#issuecomment-2287232150
uv pip install -U typing-extensions
uv pip install -U typing-extensions
uv pip install --compile-bytecode -r requirements-dev.txt -r requirements-ci.txt --verbose
- name: Install Polars-Benchmark dependencies
Expand All @@ -68,7 +68,7 @@ jobs:
working-directory: polars-benchmark
run: |
"$HOME/py-polars-cache/run-benchmarks.sh" | tee ../py-polars/benchmark-results
- name: Cache the Polars build
if: ${{ github.ref == 'refs/heads/main' }}
working-directory: py-polars
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
python-version: '3.13'

- name: Set up Graphviz
uses: ts-graphviz/setup-graphviz@v2
Expand All @@ -52,8 +52,8 @@ jobs:
# Install typing-extensions separately whilst the `--extra-index-url` in `requirements-ci.txt`
# doesn't have an up-to-date typing-extensions, see
# https://github.com/astral-sh/uv/issues/6028#issuecomment-2287232150
uv pip install -U typing-extensions
uv pip install --compile-bytecode -r requirements-dev.txt -r requirements-ci.txt --verbose
uv pip install -U typing-extensions
uv pip install --compile-bytecode -r requirements-dev.txt -r requirements-ci.txt --verbose --index-strategy=unsafe-best-match
- name: Set up Rust
run: rustup show
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/docs-python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
python-version: '3.13'

- name: Create virtual environment
run: |
Expand Down
6 changes: 4 additions & 2 deletions .github/workflows/lint-python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.9', '3.12']
python-version: ['3.9', '3.13']

steps:
- uses: actions/checkout@v4
Expand All @@ -58,7 +58,9 @@ jobs:
- name: Install Python dependencies
working-directory: py-polars
run: uv pip install -r requirements-dev.txt -r requirements-lint.txt
# TODO: Fix typing issues for newer NumPy versions
# https://github.com/pola-rs/polars/issues/20561
run: uv pip install -r requirements-dev.txt -r requirements-lint.txt 'numpy<2.1'

# Allow untyped calls for older Python versions
- name: Run mypy
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/test-python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: ['3.9', '3.12']
python-version: ['3.9', '3.12', '3.13']
include:
- os: windows-latest
python-version: '3.12'
python-version: '3.13'

steps:
- uses: actions/checkout@v4
Expand Down Expand Up @@ -70,7 +70,7 @@ jobs:
# doesn't have an up-to-date typing-extensions, see
# https://github.com/astral-sh/uv/issues/6028#issuecomment-2287232150
uv pip install -U typing-extensions
uv pip install --compile-bytecode -r requirements-dev.txt -r requirements-ci.txt --verbose
uv pip install --compile-bytecode -r requirements-dev.txt -r requirements-ci.txt --verbose --index-strategy=unsafe-best-match
- name: Set up Rust
run: rustup show
Expand All @@ -85,7 +85,7 @@ jobs:
run: maturin develop

- name: Run doctests
if: github.ref_name != 'main' && matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
if: github.ref_name != 'main' && matrix.python-version == '3.13' && matrix.os == 'ubuntu-latest'
run: |
python tests/docs/run_doctest.py
pytest tests/docs/test_user_guide.py -m docs
Expand All @@ -107,7 +107,7 @@ jobs:
run: pytest -m "not release and not benchmark and not docs" tests/unit/io/

- name: Check import without optional dependencies
if: github.ref_name != 'main' && matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
if: github.ref_name != 'main' && matrix.python-version == '3.13' && matrix.os == 'ubuntu-latest'
run: |
declare -a deps=("pandas"
"pyarrow"
Expand Down
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ serde_json = "1"
simd-json = { version = "0.14", features = ["known-key"] }
simdutf8 = "0.1.4"
slotmap = "1"
sqlparser = "0.52"
sqlparser = "0.53"
stacker = "0.1"
streaming-iterator = "0.1.9"
strength_reduce = "0.2"
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/array/binary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ use polars_error::{polars_bail, PolarsResult};
///
/// # Safety
/// The following invariants hold:
/// * Two consecutives `offsets` casted (`as`) to `usize` are valid slices of `values`.
/// * Two consecutive `offsets` cast (`as`) to `usize` are valid slices of `values`.
/// * `len` is equal to `validity.len()`, when defined.
#[derive(Clone)]
pub struct BinaryArray<O: Offset> {
Expand Down
11 changes: 10 additions & 1 deletion crates/polars-arrow/src/array/binview/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@ pub trait ViewType: Sealed + 'static + PartialEq + AsRef<Self> {
type Owned: Debug + Clone + Sync + Send + AsRef<Self>;

/// # Safety
/// The caller must ensure `index < self.len()`.
/// The caller must ensure that `slice` is a valid view.
unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self;
fn from_bytes(slice: &[u8]) -> Option<&Self>;

fn to_bytes(&self) -> &[u8];

Expand All @@ -70,6 +71,10 @@ impl ViewType for str {
unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self {
std::str::from_utf8_unchecked(slice)
}
/// Checked conversion from raw bytes: yields `Some(&str)` when `slice` is
/// valid UTF-8 and `None` otherwise.
#[inline(always)]
fn from_bytes(slice: &[u8]) -> Option<&Self> {
    match std::str::from_utf8(slice) {
        Ok(text) => Some(text),
        Err(_) => None,
    }
}

#[inline(always)]
fn to_bytes(&self) -> &[u8] {
Expand All @@ -93,6 +98,10 @@ impl ViewType for [u8] {
unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self {
slice
}
/// Checked conversion from raw bytes. For `[u8]` the bytes are returned
/// as-is, so this always yields `Some(slice)`.
#[inline(always)]
fn from_bytes(slice: &[u8]) -> Option<&Self> {
Some(slice)
}

#[inline(always)]
fn to_bytes(&self) -> &[u8] {
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ pub unsafe trait DictionaryKey: NativeType + TryInto<usize> + TryFrom<usize> + H
/// Represents this key as a `usize`.
///
/// # Safety
/// The caller _must_ have checked that the value can be casted to `usize`.
/// The caller _must_ have checked that the value can be cast to `usize`.
#[inline]
unsafe fn as_usize(self) -> usize {
match self.try_into() {
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-arrow/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
//! * [`ListArray`] and [`MutableListArray`], an array of arrays (e.g. `[[1, 2], None, [], [None]]`)
//! * [`StructArray`] and [`MutableStructArray`], an array of arrays identified by a string (e.g. `{"a": [1, 2], "b": [true, false]}`)
//!
//! All immutable arrays implement the trait object [`Array`] and that can be downcasted
//! All immutable arrays implement the trait object [`Array`] and can be downcast
//! to a concrete struct based on [`PhysicalType`](crate::datatypes::PhysicalType) available from [`Array::dtype`].
//! All immutable arrays are backed by [`Buffer`](crate::buffer::Buffer) and thus cloning and slicing them is `O(1)`.
//!
Expand Down Expand Up @@ -58,7 +58,7 @@ pub trait Splitable: Sized {
}

/// A trait representing an immutable Arrow array. Arrow arrays are trait objects
/// that are infallibly downcasted to concrete types according to the [`Array::dtype`].
/// that are infallibly downcast to concrete types according to the [`Array::dtype`].
pub trait Array: Send + Sync + dyn_clone::DynClone + 'static {
/// Converts itself to a reference of [`Any`], which enables downcasting to concrete types.
fn as_any(&self) -> &dyn Any;
Expand Down
18 changes: 9 additions & 9 deletions crates/polars-arrow/src/array/union/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,13 +171,14 @@ impl UnionArray {

/// Creates a new null [`UnionArray`].
pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
if let ArrowDataType::Union(f, _, mode) = &dtype {
let fields = f
if let ArrowDataType::Union(u) = &dtype {
let fields = u
.fields
.iter()
.map(|x| new_null_array(x.dtype().clone(), length))
.collect();

let offsets = if mode.is_sparse() {
let offsets = if u.mode.is_sparse() {
None
} else {
Some((0..length as i32).collect::<Vec<_>>().into())
Expand All @@ -194,13 +195,14 @@ impl UnionArray {

/// Creates a new empty [`UnionArray`].
pub fn new_empty(dtype: ArrowDataType) -> Self {
if let ArrowDataType::Union(f, _, mode) = dtype.to_logical_type() {
let fields = f
if let ArrowDataType::Union(u) = dtype.to_logical_type() {
let fields = u
.fields
.iter()
.map(|x| new_empty_array(x.dtype().clone()))
.collect();

let offsets = if mode.is_sparse() {
let offsets = if u.mode.is_sparse() {
None
} else {
Some(Buffer::default())
Expand Down Expand Up @@ -351,9 +353,7 @@ impl Array for UnionArray {
impl UnionArray {
fn try_get_all(dtype: &ArrowDataType) -> PolarsResult<UnionComponents> {
match dtype.to_logical_type() {
ArrowDataType::Union(fields, ids, mode) => {
Ok((fields, ids.as_ref().map(|x| x.as_ref()), *mode))
},
ArrowDataType::Union(u) => Ok((&u.fields, u.ids.as_ref().map(|x| x.as_ref()), u.mode)),
_ => polars_bail!(ComputeError:
"The UnionArray requires a logical type of DataType::Union",
),
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-arrow/src/array/utf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ impl<T: AsRef<str>> AsRef<[u8]> for StrAsBytes<T> {
///
/// # Safety
/// The following invariants hold:
/// * Two consecutives `offsets` casted (`as`) to `usize` are valid slices of `values`.
/// * A slice of `values` taken from two consecutives `offsets` is valid `utf8`.
/// * Two consecutive `offsets` cast (`as`) to `usize` are valid slices of `values`.
/// * A slice of `values` taken from two consecutive `offsets` is valid `utf8`.
/// * `len` is equal to `validity.len()`, when defined.
#[derive(Clone)]
pub struct Utf8Array<O: Offset> {
Expand Down
43 changes: 27 additions & 16 deletions crates/polars-arrow/src/datatypes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,7 @@ pub enum ArrowDataType {
/// Decimal backed by 256 bits
Decimal256(usize, usize),
/// Extension type.
/// - name
/// - physical type
/// - metadata
Extension(PlSmallStr, Box<ArrowDataType>, Option<PlSmallStr>),
Extension(Box<ExtensionType>),
/// A binary type that inlines small values
/// and can intern bytes.
BinaryView,
Expand All @@ -175,7 +172,22 @@ pub enum ArrowDataType {
/// A nested datatype that can represent slots of differing types.
/// The mode is stored in the `mode` field of the boxed [`UnionType`].
#[cfg_attr(feature = "serde", serde(skip))]
Union(Vec<Field>, Option<Vec<i32>>, UnionMode),
Union(Box<UnionType>),
}

/// Payload of [`ArrowDataType::Extension`], which stores it behind a `Box`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct ExtensionType {
/// Name of the extension type.
pub name: PlSmallStr,
/// The wrapped data type; `ArrowDataType::to_physical_type` and
/// `ArrowDataType::to_logical_type` delegate to it for extension types.
pub inner: ArrowDataType,
/// Optional metadata attached to the extension type.
pub metadata: Option<PlSmallStr>,
}

/// Payload of [`ArrowDataType::Union`], which stores it behind a `Box`.
///
/// No serde derives here; the `Union` variant itself is marked `serde(skip)`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct UnionType {
/// Child fields of the union.
pub fields: Vec<Field>,
/// Optional ids for the fields (presumably the Arrow type ids; TODO confirm).
pub ids: Option<Vec<i32>>,
/// Mode of the union, see [`UnionMode`].
pub mode: UnionMode,
}

/// Mode of [`ArrowDataType::Union`]
Expand Down Expand Up @@ -277,10 +289,10 @@ impl ArrowDataType {
FixedSizeList(_, _) => PhysicalType::FixedSizeList,
LargeList(_) => PhysicalType::LargeList,
Struct(_) => PhysicalType::Struct,
Union(_, _, _) => PhysicalType::Union,
Union(_) => PhysicalType::Union,
Map(_, _) => PhysicalType::Map,
Dictionary(key, _, _) => PhysicalType::Dictionary(*key),
Extension(_, key, _) => key.to_physical_type(),
Extension(ext) => ext.inner.to_physical_type(),
Unknown => unimplemented!(),
}
}
Expand Down Expand Up @@ -322,9 +334,9 @@ impl ArrowDataType {
.collect(),
),
Dictionary(keys, _, _) => (*keys).into(),
Union(_, _, _) => unimplemented!(),
Union(_) => unimplemented!(),
Map(_, _) => unimplemented!(),
Extension(_, inner, _) => inner.underlying_physical_type(),
Extension(ext) => ext.inner.underlying_physical_type(),
_ => self.clone(),
}
}
Expand All @@ -335,7 +347,7 @@ impl ArrowDataType {
pub fn to_logical_type(&self) -> &ArrowDataType {
use ArrowDataType::*;
match self {
Extension(_, key, _) => key.to_logical_type(),
Extension(ext) => ext.inner.to_logical_type(),
_ => self,
}
}
Expand All @@ -358,10 +370,10 @@ impl ArrowDataType {
| D::LargeList(_)
| D::FixedSizeList(_, _)
| D::Struct(_)
| D::Union(_, _, _)
| D::Union(_)
| D::Map(_, _)
| D::Dictionary(_, _, _)
| D::Extension(_, _, _)
| D::Extension(_)
)
}

Expand Down Expand Up @@ -439,11 +451,10 @@ impl ArrowDataType {
| D::FixedSizeList(field, _)
| D::Map(field, _)
| D::LargeList(field) => field.dtype().contains_dictionary(),
D::Struct(fields) | D::Union(fields, _, _) => {
fields.iter().any(|f| f.dtype().contains_dictionary())
},
D::Struct(fields) => fields.iter().any(|f| f.dtype().contains_dictionary()),
D::Union(union) => union.fields.iter().any(|f| f.dtype().contains_dictionary()),
D::Dictionary(_, _, _) => true,
D::Extension(_, dtype, _) => dtype.contains_dictionary(),
D::Extension(ext) => ext.inner.contains_dictionary(),
}
}
}
Expand Down
Loading

0 comments on commit dbe3334

Please sign in to comment.