-
Notifications
You must be signed in to change notification settings - Fork 841
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add simple GC for view array types #5885
Changes from 9 commits
67a4766
4c76bc5
2cb2e0d
f5e964d
92d6de5
2cdfbcd
c4f5fee
a29182c
1b3b589
98d9a61
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
use arrow_array::StringViewArray; | ||
use criterion::*; | ||
|
||
fn gen_view_array(size: usize) -> StringViewArray { | ||
StringViewArray::from_iter((0..size).map(|v| match v % 3 { | ||
0 => Some("small"), | ||
1 => Some("larger than 12 bytes array"), | ||
2 => None, | ||
_ => unreachable!("unreachable"), | ||
})) | ||
} | ||
|
||
fn criterion_benchmark(c: &mut Criterion) { | ||
let array = gen_view_array(100_000); | ||
|
||
c.bench_function("gc view types all", |b| { | ||
b.iter(|| { | ||
black_box(array.gc()); | ||
}); | ||
}); | ||
|
||
let sliced = array.slice(0, 100_000 / 2); | ||
c.bench_function("gc view types slice half", |b| { | ||
b.iter(|| { | ||
black_box(sliced.gc()); | ||
}); | ||
}); | ||
} | ||
|
||
criterion_group!(benches, criterion_benchmark); | ||
criterion_main!(benches); |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -265,6 +265,56 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> { | |
phantom: Default::default(), | ||
} | ||
} | ||
|
||
/// Returns a "compacted" version of this array | ||
/// | ||
/// The original array will *not* be modified | ||
/// | ||
/// # Garbage Collection | ||
/// | ||
/// Before GC: | ||
/// ```text | ||
/// ┌──────┐ | ||
/// │......│ | ||
/// │......│ | ||
/// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer | ||
/// │ View 1 │─ ─ ─ ─ │......│ with data that | ||
/// ├────────────────────┤ │......│ is not referred | ||
/// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or | ||
/// └────────────────────┘ │......│ View 2 | ||
/// │......│ | ||
/// 2 views, refer to │......│ | ||
/// small portions of a └──────┘ | ||
/// large buffer | ||
/// ``` | ||
/// | ||
/// After GC: | ||
/// | ||
/// ```text | ||
/// ┌────────────────────┐ ┌─────┐ After gc, only | ||
/// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is | ||
/// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by | ||
/// │ View 2 │─ ─ ─ ─ └─────┘ the views is | ||
/// └────────────────────┘ left | ||
/// | ||
/// | ||
/// 2 views | ||
/// ``` | ||
/// This method will compact the data buffers by recreating the view array and only include the data | ||
/// that is pointed to by the views. | ||
/// | ||
/// Note that it will copy the array regardless of whether the original array is compact. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
/// Use with caution as this can be an expensive operation, only use it when you are sure that the view | ||
/// array is significantly smaller than when it is originally created, e.g., after filtering or slicing. | ||
pub fn gc(&self) -> Self { | ||
let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len()); | ||
|
||
for v in self.iter() { | ||
builder.append_option(v); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is reasonable (and correct) first version However, it could likely be made faster by special casing inline views -- the code here is going to be doing several checks on length only to copy the bytes back into a u128 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure I understand how the checks can be avoided; even if the view is inlined, we need to change the offset and the block id, so we need to copy that u128 from the original view buffer to the new buffer anyway. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess I was imagining that special casing copying inlined views could potentially be faster than converting to &str and then checking But I don't think we should make any changes unless we have benchmarks supporting that hypothesis |
||
} | ||
|
||
builder.finish() | ||
} | ||
} | ||
|
||
impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> { | ||
|
@@ -645,4 +695,36 @@ mod tests { | |
|
||
StringViewArray::new(views, buffers, None); | ||
} | ||
|
||
#[test] | ||
fn test_gc() { | ||
let test_data = [ | ||
Some("short"), | ||
Some("t"), | ||
Some("longer than 12 bytes"), | ||
None, | ||
Some("short"), | ||
]; | ||
|
||
let array = { | ||
let mut builder = StringViewBuilder::new().with_block_size(8); // create multiple buffers | ||
test_data.into_iter().for_each(|v| builder.append_option(v)); | ||
builder.finish() | ||
}; | ||
|
||
XiangpengHao marked this conversation as resolved.
Show resolved
Hide resolved
|
||
fn check_gc(to_test: &StringViewArray) { | ||
let gc = to_test.gc(); | ||
|
||
to_test.iter().zip(gc.iter()).for_each(|(a, b)| { | ||
assert_eq!(a, b); | ||
}); | ||
assert_eq!(to_test.len(), gc.len()); | ||
} | ||
|
||
check_gc(&array); | ||
check_gc(&array.slice(1, 3)); | ||
check_gc(&array.slice(2, 1)); | ||
check_gc(&array.slice(2, 2)); | ||
check_gc(&array.slice(3, 1)); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would also be interesting to have a "array with no nulls" benchmark as I could imagine special casing that case