diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index b331559bf..63d6c955a 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -29,6 +29,8 @@ H5_REF = special_dtype(ref=Reference) H5_REGREF = special_dtype(ref=RegionReference) +RDCC_NBYTES = 32*2**20 # set raw data chunk cache size = 32 MiB + H5PY_3 = h5py.__version__.startswith('3') @@ -745,7 +747,7 @@ def __read_ref(self, h5obj): def open(self): if self.__file is None: open_flag = self.__mode - kwargs = dict() + kwargs = dict(rdcc_nbytes=RDCC_NBYTES) if self.comm: kwargs.update(driver='mpio', comm=self.comm) diff --git a/src/hdmf/data_utils.py b/src/hdmf/data_utils.py index dfe552e8c..2df001952 100644 --- a/src/hdmf/data_utils.py +++ b/src/hdmf/data_utils.py @@ -154,7 +154,7 @@ class GenericDataChunkIterator(AbstractDataChunkIterator): doc=( "If chunk_shape is not specified, it will be inferred as the smallest chunk " "below the chunk_mb threshold.", - "Defaults to 1MB.", + "Defaults to 10MB.", ), default=None, ), @@ -187,9 +187,8 @@ def __init__(self, **kwargs): Advanced users are offered full control over the shape parameters for the buffer and the chunks; however, the chunk shape must perfectly divide the buffer shape along each axis. - HDF5 also recommends not setting chunk_mb greater than 1 MB for optimal caching speeds. - See https://support.hdfgroup.org/HDF5/doc/TechNotes/TechNote-HDF5-ImprovingIOPerformanceCompressedDatasets.pdf - for more details. + HDF5 recommends chunk size in the range of 2 to 16 MB for optimal cloud performance. + https://youtu.be/rcS5vt-mKok?t=621 """ buffer_gb, buffer_shape, chunk_mb, chunk_shape, self.display_progress, self.progress_bar_options = getargs( "buffer_gb", "buffer_shape", "chunk_mb", "chunk_shape", "display_progress", "progress_bar_options", kwargs @@ -198,7 +197,7 @@ def __init__(self, **kwargs): if buffer_gb is None and buffer_shape is None: buffer_gb = 1.0 if chunk_mb is None and chunk_shape is None: - chunk_mb = 1.0 + chunk_mb = 10.0 assert (buffer_gb is not None) != ( buffer_shape is not None ), "Only one of 'buffer_gb' or 'buffer_shape' can be specified!" diff --git a/tests/unit/utils_test/test_core_GenericDataChunkIterator.py b/tests/unit/utils_test/test_core_GenericDataChunkIterator.py index 7df2eac39..39a57d75c 100644 --- a/tests/unit/utils_test/test_core_GenericDataChunkIterator.py +++ b/tests/unit/utils_test/test_core_GenericDataChunkIterator.py @@ -277,7 +277,7 @@ def test_numpy_array_chunk_iterator(self): def test_buffer_shape_option(self): expected_buffer_shape = (1580, 316) - iterator_options = dict(buffer_shape=expected_buffer_shape) + iterator_options = dict(buffer_shape=expected_buffer_shape, chunk_mb=1.0) self.check_first_data_chunk_call( expected_selection=tuple([slice(0, buffer_shape_axis) for buffer_shape_axis in expected_buffer_shape]), iterator_options=iterator_options, @@ -285,9 +285,9 @@ def test_buffer_shape_option(self): self.check_direct_hdf5_write(iterator_options=iterator_options) def test_buffer_gb_option(self): - # buffer is smaller than default chunk; should collapse to chunk shape + # buffer is smaller than chunk; should collapse to chunk shape resulting_buffer_shape = (1580, 316) - iterator_options = dict(buffer_gb=0.0005) + iterator_options = dict(buffer_gb=0.0005, chunk_mb=1.0) self.check_first_data_chunk_call( expected_selection=tuple( [ @@ -334,14 +334,14 @@ def test_chunk_mb_option_while_condition(self): """Test to evoke while condition of default shaping method.""" expected_chunk_shape = (2, 79, 79) special_array = np.random.randint(low=-(2 ** 15), high=2 ** 15 - 1, size=(2, 2000, 2000), dtype="int16") - iterator = self.TestNumpyArrayDataChunkIterator(array=special_array) + iterator = self.TestNumpyArrayDataChunkIterator(array=special_array, chunk_mb=1.0) self.assertEqual(iterator.chunk_shape, expected_chunk_shape) def test_chunk_mb_option_while_condition_unit_maxshape_axis(self): """Test to evoke while condition of default shaping method.""" expected_chunk_shape = (1, 79, 79) special_array = np.random.randint(low=-(2 ** 15), high=2 ** 15 - 1, size=(1, 2000, 2000), dtype="int16") - iterator = self.TestNumpyArrayDataChunkIterator(array=special_array) + iterator = self.TestNumpyArrayDataChunkIterator(array=special_array, chunk_mb=1.0) self.assertEqual(iterator.chunk_shape, expected_chunk_shape) @unittest.skipIf(not TQDM_INSTALLED, "optional tqdm module is not installed")