Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

change chunk default size to 10MB #925

Merged
merged 4 commits on Aug 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/hdmf/backends/hdf5/h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
H5_REF = special_dtype(ref=Reference)
H5_REGREF = special_dtype(ref=RegionReference)

RDCC_NBYTES = 32*2**20 # set raw data chunk cache size = 32 MiB

H5PY_3 = h5py.__version__.startswith('3')


Expand Down Expand Up @@ -745,7 +747,7 @@ def __read_ref(self, h5obj):
def open(self):
if self.__file is None:
open_flag = self.__mode
kwargs = dict()
kwargs = dict(rdcc_nbytes=RDCC_NBYTES)
if self.comm:
kwargs.update(driver='mpio', comm=self.comm)

Expand Down
9 changes: 4 additions & 5 deletions src/hdmf/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ class GenericDataChunkIterator(AbstractDataChunkIterator):
doc=(
"If chunk_shape is not specified, it will be inferred as the smallest chunk "
"below the chunk_mb threshold.",
"Defaults to 1MB.",
"Defaults to 10MB.",
),
default=None,
),
Expand Down Expand Up @@ -187,9 +187,8 @@ def __init__(self, **kwargs):
Advanced users are offered full control over the shape parameters for the buffer and the chunks; however,
the chunk shape must perfectly divide the buffer shape along each axis.

HDF5 also recommends not setting chunk_mb greater than 1 MB for optimal caching speeds.
See https://support.hdfgroup.org/HDF5/doc/TechNotes/TechNote-HDF5-ImprovingIOPerformanceCompressedDatasets.pdf
for more details.
HDF5 recommends a chunk size in the range of 2 to 16 MB for optimal cloud performance.
See https://youtu.be/rcS5vt-mKok?t=621 for more details.
"""
buffer_gb, buffer_shape, chunk_mb, chunk_shape, self.display_progress, self.progress_bar_options = getargs(
"buffer_gb", "buffer_shape", "chunk_mb", "chunk_shape", "display_progress", "progress_bar_options", kwargs
Expand All @@ -198,7 +197,7 @@ def __init__(self, **kwargs):
if buffer_gb is None and buffer_shape is None:
buffer_gb = 1.0
if chunk_mb is None and chunk_shape is None:
chunk_mb = 1.0
chunk_mb = 10.0
assert (buffer_gb is not None) != (
buffer_shape is not None
), "Only one of 'buffer_gb' or 'buffer_shape' can be specified!"
Expand Down
10 changes: 5 additions & 5 deletions tests/unit/utils_test/test_core_GenericDataChunkIterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,17 +277,17 @@ def test_numpy_array_chunk_iterator(self):

def test_buffer_shape_option(self):
expected_buffer_shape = (1580, 316)
iterator_options = dict(buffer_shape=expected_buffer_shape)
iterator_options = dict(buffer_shape=expected_buffer_shape, chunk_mb=1.0)
self.check_first_data_chunk_call(
expected_selection=tuple([slice(0, buffer_shape_axis) for buffer_shape_axis in expected_buffer_shape]),
iterator_options=iterator_options,
)
self.check_direct_hdf5_write(iterator_options=iterator_options)

def test_buffer_gb_option(self):
# buffer is smaller than default chunk; should collapse to chunk shape
# buffer is smaller than chunk; should collapse to chunk shape
resulting_buffer_shape = (1580, 316)
iterator_options = dict(buffer_gb=0.0005)
iterator_options = dict(buffer_gb=0.0005, chunk_mb=1.0)
self.check_first_data_chunk_call(
expected_selection=tuple(
[
Expand Down Expand Up @@ -334,14 +334,14 @@ def test_chunk_mb_option_while_condition(self):
"""Test to trigger the while condition of the default shaping method."""
expected_chunk_shape = (2, 79, 79)
special_array = np.random.randint(low=-(2 ** 15), high=2 ** 15 - 1, size=(2, 2000, 2000), dtype="int16")
iterator = self.TestNumpyArrayDataChunkIterator(array=special_array)
iterator = self.TestNumpyArrayDataChunkIterator(array=special_array, chunk_mb=1.0)
self.assertEqual(iterator.chunk_shape, expected_chunk_shape)

def test_chunk_mb_option_while_condition_unit_maxshape_axis(self):
"""Test to trigger the while condition of the default shaping method."""
expected_chunk_shape = (1, 79, 79)
special_array = np.random.randint(low=-(2 ** 15), high=2 ** 15 - 1, size=(1, 2000, 2000), dtype="int16")
iterator = self.TestNumpyArrayDataChunkIterator(array=special_array)
iterator = self.TestNumpyArrayDataChunkIterator(array=special_array, chunk_mb=1.0)
self.assertEqual(iterator.chunk_shape, expected_chunk_shape)

@unittest.skipIf(not TQDM_INSTALLED, "optional tqdm module is not installed")
Expand Down