Skip to content

Commit

Permalink
Merge branch 'apache:main' into avro-codec
Browse files Browse the repository at this point in the history
  • Loading branch information
jecsand838 authored Jan 13, 2025
2 parents 03bd5f0 + 3b31ff6 commit eeabc49
Show file tree
Hide file tree
Showing 8 changed files with 269 additions and 161 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/arrow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,11 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
target: wasm32-unknown-unknown,wasm32-wasi
target: wasm32-unknown-unknown,wasm32-wasip1
- name: Build wasm32-unknown-unknown
run: cargo build -p arrow --no-default-features --features=json,csv,ipc,ffi --target wasm32-unknown-unknown
- name: Build wasm32-wasi
run: cargo build -p arrow --no-default-features --features=json,csv,ipc,ffi --target wasm32-wasi
- name: Build wasm32-wasip1
run: cargo build -p arrow --no-default-features --features=json,csv,ipc,ffi --target wasm32-wasip1

clippy:
name: Clippy
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/object_store.yml
Original file line number Diff line number Diff line change
Expand Up @@ -204,11 +204,11 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
target: wasm32-unknown-unknown,wasm32-wasi
target: wasm32-unknown-unknown,wasm32-wasip1
- name: Build wasm32-unknown-unknown
run: cargo build --target wasm32-unknown-unknown
- name: Build wasm32-wasi
run: cargo build --target wasm32-wasi
- name: Build wasm32-wasip1
run: cargo build --target wasm32-wasip1

windows:
name: cargo test LocalFileSystem (win64)
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/parquet.yml
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,13 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
target: wasm32-unknown-unknown,wasm32-wasi
target: wasm32-unknown-unknown,wasm32-wasip1
- name: Install clang # Needed for zlib compilation
run: apt-get update && apt-get install -y clang gcc-multilib
- name: Build wasm32-unknown-unknown
run: cargo build -p parquet --target wasm32-unknown-unknown
- name: Build wasm32-wasi
run: cargo build -p parquet --target wasm32-wasi
- name: Build wasm32-wasip1
run: cargo build -p parquet --target wasm32-wasip1

pyspark-integration-test:
name: PySpark Integration Test
Expand Down
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,16 @@ Planned Release Schedule
| Dec 2024 | `0.11.2` | Minor, NO breaking API changes |
| Feb 2025 | `0.12.0` | Major, potentially breaking API changes |

### Guidelines for `panic` vs `Result`

In general, use panics for bad states that are unreachable, unrecoverable or harmful.
For those caused by invalid user input, however, we prefer to report that invalidity
gracefully as an error result instead of panicking. In general, invalid input should result
in an `Error` as soon as possible. It _is_ ok for code paths after validation to assume
validation has already occurred and panic if not. See [this ticket] for more nuances.

[this ticket]: https://github.com/apache/arrow-rs/issues/6737

### Deprecation Guidelines

Minor releases may deprecate, but not remove APIs. Deprecating APIs allows
Expand Down
134 changes: 100 additions & 34 deletions arrow-data/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,12 @@ impl ArrayData {
offset,
buffers,
child_data,
align_buffers: false,
// SAFETY: caller responsible for ensuring data is valid
skip_validation: true,
}
.build_unchecked()
.build()
.unwrap()
}

/// Create a new ArrayData, validating that the provided buffers form a valid
Expand Down Expand Up @@ -1775,7 +1779,7 @@ impl PartialEq for ArrayData {
}
}

/// Builder for `ArrayData` type
/// Builder for [`ArrayData`] type
#[derive(Debug)]
pub struct ArrayDataBuilder {
data_type: DataType,
Expand All @@ -1786,6 +1790,20 @@ pub struct ArrayDataBuilder {
offset: usize,
buffers: Vec<Buffer>,
child_data: Vec<ArrayData>,
/// Should buffers be realigned (copying if necessary)?
///
/// Defaults to false.
align_buffers: bool,
/// Should data validation be skipped for this [`ArrayData`]?
///
/// Defaults to false.
///
/// # Safety
///
/// This flag can only be set to true using `unsafe` APIs. However, once true
/// subsequent calls to `build()` may result in undefined behavior if the data
/// is not valid.
skip_validation: bool,
}

impl ArrayDataBuilder {
Expand All @@ -1801,6 +1819,8 @@ impl ArrayDataBuilder {
offset: 0,
buffers: vec![],
child_data: vec![],
align_buffers: false,
skip_validation: false,
}
}

Expand Down Expand Up @@ -1877,51 +1897,79 @@ impl ArrayDataBuilder {

/// Creates an array data, without any validation
///
/// Note: This is shorthand for `self.with_skip_validation(true).build()`
///
/// # Safety
///
/// The same caveats as [`ArrayData::new_unchecked`]
/// apply.
#[allow(clippy::let_and_return)]
pub unsafe fn build_unchecked(self) -> ArrayData {
let data = self.build_impl();
// Provide a force_validate mode
#[cfg(feature = "force_validate")]
data.validate_data().unwrap();
data
self.skip_validation(true).build().unwrap()
}

/// Same as [`Self::build_unchecked`] but ignoring `force_validate` feature flag
unsafe fn build_impl(self) -> ArrayData {
let nulls = self
.nulls
/// Creates an `ArrayData`, consuming `self`
///
/// # Safety
///
/// By default the underlying buffers are checked to ensure they are valid
/// Arrow data. However, if the [`Self::skip_validation`] flag has been set
/// to true (by the `unsafe` API) this validation is skipped. If the data is
/// not valid, undefined behavior will result.
pub fn build(self) -> Result<ArrayData, ArrowError> {
let Self {
data_type,
len,
null_count,
null_bit_buffer,
nulls,
offset,
buffers,
child_data,
align_buffers,
skip_validation,
} = self;

let nulls = nulls
.or_else(|| {
let buffer = self.null_bit_buffer?;
let buffer = BooleanBuffer::new(buffer, self.offset, self.len);
Some(match self.null_count {
Some(n) => NullBuffer::new_unchecked(buffer, n),
let buffer = null_bit_buffer?;
let buffer = BooleanBuffer::new(buffer, offset, len);
Some(match null_count {
Some(n) => {
// SAFETY: call to `data.validate_data()` below validates the null buffer is valid
unsafe { NullBuffer::new_unchecked(buffer, n) }
}
None => NullBuffer::new(buffer),
})
})
.filter(|b| b.null_count() != 0);

ArrayData {
data_type: self.data_type,
len: self.len,
offset: self.offset,
buffers: self.buffers,
child_data: self.child_data,
let mut data = ArrayData {
data_type,
len,
offset,
buffers,
child_data,
nulls,
};

if align_buffers {
data.align_buffers();
}
}

/// Creates an array data, validating all inputs
pub fn build(self) -> Result<ArrayData, ArrowError> {
let data = unsafe { self.build_impl() };
data.validate_data()?;
// SAFETY: `skip_validation` is only set to true using `unsafe` APIs
if !skip_validation || cfg!(feature = "force_validate") {
data.validate_data()?;
}
Ok(data)
}

/// Creates an array data, validating all inputs, and aligning any buffers
#[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
self.align_buffers(true).build()
}

/// Ensure that all buffers are aligned, copying data if necessary
///
/// Rust requires that arrays are aligned to their corresponding primitive,
/// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
Expand All @@ -1930,17 +1978,33 @@ impl ArrayDataBuilder {
/// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
///
/// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
/// this method is provided to automatically copy buffers to a new correctly aligned allocation
/// this flag is provided to automatically copy buffers to a new correctly aligned allocation
/// when necessary, making it useful when interacting with buffers produced by other systems,
/// e.g. IPC or FFI.
///
/// This is unlike `[Self::build`] which will instead return an error on encountering
/// If this flag is not enabled, `[Self::build`] return an error on encountering
/// insufficiently aligned buffers.
pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
let mut data = unsafe { self.build_impl() };
data.align_buffers();
data.validate_data()?;
Ok(data)
pub fn align_buffers(mut self, align_buffers: bool) -> Self {
self.align_buffers = align_buffers;
self
}

/// Skips validation of the data.
///
/// If this flag is enabled, `[Self::build`] will skip validation of the
/// data
///
/// If this flag is not enabled, `[Self::build`] will validate that all
/// buffers are valid and will return an error if any data is invalid.
/// Validation can be expensive.
///
/// # Safety
///
/// If validation is skipped, the buffers must form a valid Arrow array,
/// otherwise undefined behavior will result
pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
self.skip_validation = skip_validation;
self
}
}

Expand All @@ -1955,6 +2019,8 @@ impl From<ArrayData> for ArrayDataBuilder {
nulls: d.nulls,
null_bit_buffer: None,
null_count: None,
align_buffers: false,
skip_validation: false,
}
}
}
Expand Down
48 changes: 14 additions & 34 deletions arrow-ipc/src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,17 +177,13 @@ fn create_array(
let values = create_array(reader, values_field, variadic_counts, require_alignment)?;

let run_array_length = run_node.length() as usize;
let builder = ArrayData::builder(data_type.clone())
let array_data = ArrayData::builder(data_type.clone())
.len(run_array_length)
.offset(0)
.add_child_data(run_ends.into_data())
.add_child_data(values.into_data());

let array_data = if require_alignment {
builder.build()?
} else {
builder.build_aligned()?
};
.add_child_data(values.into_data())
.align_buffers(!require_alignment)
.build()?;

Ok(make_array(array_data))
}
Expand Down Expand Up @@ -257,15 +253,11 @@ fn create_array(
)));
}

let builder = ArrayData::builder(data_type.clone())
let array_data = ArrayData::builder(data_type.clone())
.len(length as usize)
.offset(0);

let array_data = if require_alignment {
builder.build()?
} else {
builder.build_aligned()?
};
.offset(0)
.align_buffers(!require_alignment)
.build()?;

// no buffer increases
Ok(Arc::new(NullArray::from(array_data)))
Expand Down Expand Up @@ -311,11 +303,7 @@ fn create_primitive_array(
t => unreachable!("Data type {:?} either unsupported or not primitive", t),
};

let array_data = if require_alignment {
builder.build()?
} else {
builder.build_aligned()?
};
let array_data = builder.align_buffers(!require_alignment).build()?;

Ok(make_array(array_data))
}
Expand Down Expand Up @@ -347,11 +335,7 @@ fn create_list_array(
_ => unreachable!("Cannot create list or map array from {:?}", data_type),
};

let array_data = if require_alignment {
builder.build()?
} else {
builder.build_aligned()?
};
let array_data = builder.align_buffers(!require_alignment).build()?;

Ok(make_array(array_data))
}
Expand All @@ -367,17 +351,13 @@ fn create_dictionary_array(
) -> Result<ArrayRef, ArrowError> {
if let Dictionary(_, _) = *data_type {
let null_buffer = (field_node.null_count() > 0).then_some(buffers[0].clone());
let builder = ArrayData::builder(data_type.clone())
let array_data = ArrayData::builder(data_type.clone())
.len(field_node.length() as usize)
.add_buffer(buffers[1].clone())
.add_child_data(value_array.into_data())
.null_bit_buffer(null_buffer);

let array_data = if require_alignment {
builder.build()?
} else {
builder.build_aligned()?
};
.null_bit_buffer(null_buffer)
.align_buffers(!require_alignment)
.build()?;

Ok(make_array(array_data))
} else {
Expand Down
Loading

0 comments on commit eeabc49

Please sign in to comment.