Skip to content

Commit

Permalink
change chunk default size to 10MB (#925)
Browse files Browse the repository at this point in the history
Co-authored-by: Ryan Ly <rly@lbl.gov>
  • Loading branch information
bendichter and rly authored Aug 8, 2023
1 parent 64a444f commit 9e194a4
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 11 deletions.
4 changes: 3 additions & 1 deletion src/hdmf/backends/hdf5/h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
H5_REF = special_dtype(ref=Reference)
H5_REGREF = special_dtype(ref=RegionReference)

RDCC_NBYTES = 32*2**20 # set raw data chunk cache size = 32 MiB

H5PY_3 = h5py.__version__.startswith('3')


Expand Down Expand Up @@ -745,7 +747,7 @@ def __read_ref(self, h5obj):
def open(self):
if self.__file is None:
open_flag = self.__mode
kwargs = dict()
kwargs = dict(rdcc_nbytes=RDCC_NBYTES)
if self.comm:
kwargs.update(driver='mpio', comm=self.comm)

Expand Down
9 changes: 4 additions & 5 deletions src/hdmf/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ class GenericDataChunkIterator(AbstractDataChunkIterator):
doc=(
"If chunk_shape is not specified, it will be inferred as the smallest chunk "
"below the chunk_mb threshold.",
"Defaults to 1MB.",
"Defaults to 10MB.",
),
default=None,
),
Expand Down Expand Up @@ -187,9 +187,8 @@ def __init__(self, **kwargs):
Advanced users are offered full control over the shape parameters for the buffer and the chunks; however,
the chunk shape must perfectly divide the buffer shape along each axis.
HDF5 also recommends not setting chunk_mb greater than 1 MB for optimal caching speeds.
See https://support.hdfgroup.org/HDF5/doc/TechNotes/TechNote-HDF5-ImprovingIOPerformanceCompressedDatasets.pdf
for more details.
HDF5 recommends a chunk size in the range of 2 to 16 MB for optimal cloud performance.
See https://youtu.be/rcS5vt-mKok?t=621 for more details.
"""
buffer_gb, buffer_shape, chunk_mb, chunk_shape, self.display_progress, self.progress_bar_options = getargs(
"buffer_gb", "buffer_shape", "chunk_mb", "chunk_shape", "display_progress", "progress_bar_options", kwargs
Expand All @@ -198,7 +197,7 @@ def __init__(self, **kwargs):
if buffer_gb is None and buffer_shape is None:
buffer_gb = 1.0
if chunk_mb is None and chunk_shape is None:
chunk_mb = 1.0
chunk_mb = 10.0
assert (buffer_gb is not None) != (
buffer_shape is not None
), "Only one of 'buffer_gb' or 'buffer_shape' can be specified!"
Expand Down
10 changes: 5 additions & 5 deletions tests/unit/utils_test/test_core_GenericDataChunkIterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,17 +277,17 @@ def test_numpy_array_chunk_iterator(self):

def test_buffer_shape_option(self):
expected_buffer_shape = (1580, 316)
iterator_options = dict(buffer_shape=expected_buffer_shape)
iterator_options = dict(buffer_shape=expected_buffer_shape, chunk_mb=1.0)
self.check_first_data_chunk_call(
expected_selection=tuple([slice(0, buffer_shape_axis) for buffer_shape_axis in expected_buffer_shape]),
iterator_options=iterator_options,
)
self.check_direct_hdf5_write(iterator_options=iterator_options)

def test_buffer_gb_option(self):
# buffer is smaller than default chunk; should collapse to chunk shape
# buffer is smaller than chunk; should collapse to chunk shape
resulting_buffer_shape = (1580, 316)
iterator_options = dict(buffer_gb=0.0005)
iterator_options = dict(buffer_gb=0.0005, chunk_mb=1.0)
self.check_first_data_chunk_call(
expected_selection=tuple(
[
Expand Down Expand Up @@ -334,14 +334,14 @@ def test_chunk_mb_option_while_condition(self):
"""Test to evoke while condition of default shaping method."""
expected_chunk_shape = (2, 79, 79)
special_array = np.random.randint(low=-(2 ** 15), high=2 ** 15 - 1, size=(2, 2000, 2000), dtype="int16")
iterator = self.TestNumpyArrayDataChunkIterator(array=special_array)
iterator = self.TestNumpyArrayDataChunkIterator(array=special_array, chunk_mb=1.0)
self.assertEqual(iterator.chunk_shape, expected_chunk_shape)

def test_chunk_mb_option_while_condition_unit_maxshape_axis(self):
"""Test to evoke while condition of default shaping method."""
expected_chunk_shape = (1, 79, 79)
special_array = np.random.randint(low=-(2 ** 15), high=2 ** 15 - 1, size=(1, 2000, 2000), dtype="int16")
iterator = self.TestNumpyArrayDataChunkIterator(array=special_array)
iterator = self.TestNumpyArrayDataChunkIterator(array=special_array, chunk_mb=1.0)
self.assertEqual(iterator.chunk_shape, expected_chunk_shape)

@unittest.skipIf(not TQDM_INSTALLED, "optional tqdm module is not installed")
Expand Down

0 comments on commit 9e194a4

Please sign in to comment.