diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 75158987..9ae6e853 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -10,9 +10,10 @@ jobs:
         with:
           fetch-depth: 0

-      - name: Install odin-data
+      - name: Test odin-data
         run: |
           python3 -m venv venv && source venv/bin/activate && pip install --upgrade pip
           cd python
-          pip install .
+          pip install .[dev,meta_writer]
           python -c "from odin_data import __version__; print(__version__)"
+          pytest tests/test_hdf5dataset.py
diff --git a/python/setup.cfg b/python/setup.cfg
index feb0bf2c..d322cd97 100644
--- a/python/setup.cfg
+++ b/python/setup.cfg
@@ -34,7 +34,7 @@ meta_writer =
     h5py>=2.9.0
 # For development tests/docs
 dev =
-    pytest-asyncio
+    pytest
     # Docs
     sphinx-autobuild
     sphinx-external-toc
diff --git a/python/src/odin_data/meta_writer/hdf5dataset.py b/python/src/odin_data/meta_writer/hdf5dataset.py
index bb564fab..9982fff4 100644
--- a/python/src/odin_data/meta_writer/hdf5dataset.py
+++ b/python/src/odin_data/meta_writer/hdf5dataset.py
@@ -1,6 +1,7 @@
 import logging
 from time import time

+import h5py as h5
 import numpy as np


@@ -12,7 +13,7 @@ def __init__(self, shape, fillvalue, dtype):


 class HDF5UnlimitedCache(object):
-    """ An object to represent a cache used by an HDF5Dataset """
+    """An object to represent a cache used by an HDF5Dataset"""

     def __init__(self, name, dtype, fillvalue, block_size, data_shape, block_timeout):
         """
@@ -156,7 +157,13 @@ def __init__(
         self.fillvalue = fillvalue
         self.shape = shape if shape is not None else (0,) * rank
         self.chunks = chunks
-        self.maxshape = maxshape if maxshape is not None else shape if shape is not None else (None,) * rank
+        self.maxshape = (
+            maxshape
+            if maxshape is not None
+            else shape
+            if shape is not None
+            else (None,) * rank
+        )
         self._cache = None

         data_shape = self.shape[1:]  # The shape of each data element in dataset
@@ -241,9 +248,9 @@ def write(self, data):
                     "%s | Data shape %s does not match dataset shape %s"
                     " and resize failed:\n%s",
                     self.name,
-                    self.shape,
                     data.shape,
-                    exception
+                    self.shape,
+                    exception,
                 )
                 return

@@ -392,7 +399,8 @@ class StringHDF5Dataset(HDF5Dataset):
     def __init__(
         self,
         name,
-        length,
+        length=None,
+        encoding="utf-8",
         shape=None,
         maxshape=None,
         cache=True,
@@ -402,12 +410,14 @@ def __init__(
     ):
         """
         Args:
-            length(int): Maximum length of the string elements
+            length(int): Length of string - None to store as variable-length objects
+            encoding(str): Encoding of string - either `"utf-8"` or `"ascii"`

        """
+        self.dtype = h5.string_dtype(encoding=encoding, length=length)
         super(StringHDF5Dataset, self).__init__(
             name,
-            dtype="S{}".format(length),
+            dtype=self.dtype,
             fillvalue="",
             shape=shape,
             maxshape=maxshape,
@@ -420,7 +430,11 @@ def __init__(
     def prepare_data(self, data):
         """Prepare data ready to write to hdf5 dataset

-        Convert data to raw strings to write to hdf5 dataset
+        Ensure data array contains the correct string dtype by converting first to bytes
+        and then to the specific string type. This initial conversion is required if
+        self.dtype is a variable-length string object and data is not a string type
+        because, e.g., np.asarray(ndarray[int], object) leaves the data as int, but int
+        cannot be implicitly converted to an h5py dataset of dtype 'O' on write.

         Args:
             data(np.ndarray): Data to format
@@ -431,7 +445,12 @@ def prepare_data(self, data):
         """
         data = super(StringHDF5Dataset, self).prepare_data(data)

-        # Make sure data array contains str
-        data = np.array(data, dtype=str)
+        if self.dtype.kind == "O" and data.dtype.kind != "S":
+            # Ensure data is bytes before converting to variable-length object
+            data = np.asarray(data, dtype=bytes)
+
+        # Convert to specific string dtype - either variable-length object or
+        # fixed-length string. No copy is performed if the dtype already matches.
+        data = np.asarray(data, dtype=self.dtype)

         return data
diff --git a/python/tests/test_hdf5dataset.py b/python/tests/test_hdf5dataset.py
index 47834e80..ba9b2cbb 100644
--- a/python/tests/test_hdf5dataset.py
+++ b/python/tests/test_hdf5dataset.py
@@ -1,11 +1,11 @@
+import os
 from unittest import TestCase
-import logging
-import pytest
 import numpy
-import os
 import time

-from odin_data.meta_writer.hdf5dataset import HDF5UnlimitedCache
+import h5py as h5
+
+from odin_data.meta_writer.hdf5dataset import HDF5UnlimitedCache, StringHDF5Dataset


 class _TestMockDataset:
@@ -32,7 +32,14 @@ def test_unlimited_cache_1D(self):
         """verify that the cache functions as expected"""

         # Create a cache
-        cache = HDF5UnlimitedCache("test1", "int32", -2, 10, 1)
+        cache = HDF5UnlimitedCache(
+            name="test1",
+            dtype="int32",
+            fillvalue=-2,
+            block_size=10,
+            data_shape=(1,),
+            block_timeout=0.01,
+        )

         # Verify 1 block has been created, with a size of 10
         self.assertEqual(len(cache._blocks), 1)
@@ -100,8 +107,8 @@ def test_unlimited_cache_1D(self):
         self.assertEqual(ds.values[1][8], -2)
         self.assertEqual(ds.values[1][9], -2)

-        # Now wait for 2 seconds
-        time.sleep(2.0)
+        # Now wait for the block timeout to expire
+        time.sleep(0.1)

         # Now write another value into block[2] and then flush once again
         cache.add_value(3, 26)
@@ -132,7 +139,14 @@ def test_unlimited_cache_2D(self):
         """verify that the cache functions as expected"""

         # Create a cache
-        cache = HDF5UnlimitedCache("test1", "int32", -2, 10, 1, shape=(2, 3))
+        cache = HDF5UnlimitedCache(
+            name="test1",
+            dtype="int32",
+            fillvalue=-2,
+            block_size=10,
+            data_shape=(2, 3),
+            block_timeout=0.01,
+        )

         # Verify 1 block has been created, with a size of 10x2x3
         self.assertEqual(len(cache._blocks), 1)
@@ -203,8 +217,8 @@ def test_unlimited_cache_2D(self):
         self.assertEqual(ds.values[1][1][1][1], -2)
         self.assertEqual(ds.values[1][1][1][2], -2)

-        # Now wait for 2 seconds
-        time.sleep(2.0)
+        # Now wait for the block timeout to expire
+        time.sleep(0.1)

         # Now write another value into block[2] and then flush once again
         cache.add_value(value, 26)
@@ -245,3 +259,62 @@ def test_unlimited_cache_2D(self):
         self.assertEqual(ds.values[0][6][1][0], 4)
         self.assertEqual(ds.values[0][6][1][1], 5)
         self.assertEqual(ds.values[0][6][1][2], 6)
+
+
+def test_string_types():
+    variable_utf = StringHDF5Dataset("variable_utf", encoding="utf-8", cache=False)
+    variable_ascii = StringHDF5Dataset("variable_ascii", encoding="ascii", cache=False)
+    fixed_utf = StringHDF5Dataset("fixed_utf", encoding="utf-8", length=9, cache=False)
+    fixed_ascii = StringHDF5Dataset(
+        "fixed_ascii", encoding="ascii", length=11, cache=False
+    )
+    variable_utf_int = StringHDF5Dataset(
+        "variable_utf_int", encoding="utf-8", cache=False
+    )
+
+    with h5.File("/tmp/strings.h5", "w") as f:
+        variable_utf.initialise(
+            f.create_dataset(
+                "variable_utf", shape=(1,), maxshape=(1,), dtype=variable_utf.dtype
+            ),
+            0,
+        )
+        variable_ascii.initialise(
+            f.create_dataset(
+                "variable_ascii", shape=(1,), maxshape=(1,), dtype=variable_ascii.dtype
+            ),
+            0,
+        )
+        fixed_utf.initialise(
+            f.create_dataset(
+                "fixed_utf", shape=(1,), maxshape=(1,), dtype=fixed_utf.dtype
+            ),
+            0,
+        )
+        fixed_ascii.initialise(
+            f.create_dataset(
+                "fixed_ascii", shape=(1,), maxshape=(1,), dtype=fixed_ascii.dtype
+            ),
+            0,
+        )
+        variable_utf_int.initialise(
+            f.create_dataset(
+                "variable_utf_int", shape=(1,), maxshape=(1,), dtype=variable_utf.dtype
+            ),
+            0,
+        )
+
+        variable_utf.write("variable_utf")
+        variable_ascii.write("variable_ascii")
+        fixed_utf.write("fixed_utf")
+        fixed_ascii.write("fixed_ascii")
+        variable_utf_int.write(2)  # Check non-strings can be handled
+
+    with h5.File("/tmp/strings.h5", "r") as f:
+        assert f["variable_utf"][0] == b"variable_utf"
+        assert f["variable_ascii"][0] == b"variable_ascii"
+        assert f["fixed_utf"][0] == b"fixed_utf"
+        assert f["fixed_ascii"][0] == b"fixed_ascii"
+        assert f["variable_utf_int"][0] == b"2"
+
+    os.remove("/tmp/strings.h5")
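
Note (not part of the diff): a minimal standalone sketch of the h5py string handling that the StringHDF5Dataset changes above rely on, mirroring the two-step conversion in prepare_data. It assumes h5py >= 2.10 for h5.string_dtype and h5py 3.x read semantics (string data reads back as bytes); the file path and dataset names are illustrative only.

import h5py as h5
import numpy as np

# Variable-length strings map to numpy kind "O"; fixed-length strings to kind "S"
variable_utf = h5.string_dtype(encoding="utf-8")
fixed_ascii = h5.string_dtype(encoding="ascii", length=11)

# Non-string input must go via bytes before casting to the variable-length dtype,
# as StringHDF5Dataset.prepare_data does
data = np.asarray([2], dtype=bytes)          # -> array([b"2"])
data = np.asarray(data, dtype=variable_utf)  # -> object array of bytes

with h5.File("/tmp/string_dtype_demo.h5", "w") as f:
    var_ds = f.create_dataset("variable", shape=(1,), dtype=variable_utf)
    fix_ds = f.create_dataset("fixed", shape=(1,), dtype=fixed_ascii)
    var_ds[0] = data[0]
    fix_ds[0] = np.asarray("fixed_ascii", dtype=fixed_ascii)

with h5.File("/tmp/string_dtype_demo.h5", "r") as f:
    assert f["variable"][0] == b"2"
    assert f["fixed"][0] == b"fixed_ascii"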