Merge pull request #324 from odin-detector/meta-string-types
Update to use improved h5py string support
GDYendell authored Sep 29, 2023
2 parents d3b3bef + 7bb0040 commit 9f25ddc
Showing 4 changed files with 116 additions and 23 deletions.
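For context, the change moves string handling onto h5py's string_dtype API. Below is a minimal sketch (not part of the diff) of the two dtype flavours involved: variable-length strings, which numpy carries as object ('O') arrays, and fixed-length strings, equivalent to numpy 'S<n>'. The file name is hypothetical.

import h5py as h5

variable_utf = h5.string_dtype(encoding="utf-8")            # length=None -> variable-length
fixed_ascii = h5.string_dtype(encoding="ascii", length=11)  # fixed-length, like numpy "S11"

with h5.File("example.h5", "w") as f:
    ds = f.create_dataset("names", shape=(2,), dtype=variable_utf)
    ds[0] = "alpha"                                         # h5py encodes str on write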
5 changes: 3 additions & 2 deletions .github/workflows/python.yml
@@ -10,9 +10,10 @@ jobs:
with:
fetch-depth: 0

- name: Install odin-data
- name: Test odin-data
run: |
python3 -m venv venv && source venv/bin/activate && pip install --upgrade pip
cd python
pip install .
pip install .[dev,meta_writer]
python -c "from odin_data import __version__; print(__version__)"
pytest tests/test_hdf5dataset.py
2 changes: 1 addition & 1 deletion python/setup.cfg
@@ -34,7 +34,7 @@ meta_writer =
h5py>=2.9.0
# For development tests/docs
dev =
pytest-asyncio
pytest
# Docs
sphinx-autobuild
sphinx-external-toc
39 changes: 29 additions & 10 deletions python/src/odin_data/meta_writer/hdf5dataset.py
@@ -1,6 +1,7 @@
import logging
from time import time

import h5py as h5
import numpy as np


@@ -12,7 +13,7 @@ def __init__(self, shape, fillvalue, dtype):


class HDF5UnlimitedCache(object):
""" An object to represent a cache used by an HDF5Dataset """
"""An object to represent a cache used by an HDF5Dataset"""

def __init__(self, name, dtype, fillvalue, block_size, data_shape, block_timeout):
"""
@@ -156,7 +157,13 @@ def __init__(
self.fillvalue = fillvalue
self.shape = shape if shape is not None else (0,) * rank
self.chunks = chunks
self.maxshape = maxshape if maxshape is not None else shape if shape is not None else (None,) * rank
self.maxshape = (
maxshape
if maxshape is not None
else shape
if shape is not None
else (None,) * rank
)
self._cache = None

data_shape = self.shape[1:] # The shape of each data element in dataset
@@ -241,9 +248,9 @@ def write(self, data):
"%s | Data shape %s does not match dataset shape %s"
" and resize failed:\n%s",
self.name,
self.shape,
data.shape,
exception
self.shape,
exception,
)
return

@@ -392,7 +399,8 @@ class StringHDF5Dataset(HDF5Dataset):
def __init__(
self,
name,
length,
length=None,
encoding="utf-8",
shape=None,
maxshape=None,
cache=True,
@@ -402,12 +410,14 @@ def __init__(
):
"""
Args:
length(int): Maximum length of the string elements
length(int): Length of string - None to store as variable-length objects
encoding(str): Encoding of string - either `"utf-8"` or `"ascii"`
"""
self.dtype = h5.string_dtype(encoding=encoding, length=length)
super(StringHDF5Dataset, self).__init__(
name,
dtype="S{}".format(length),
dtype=self.dtype,
fillvalue="",
shape=shape,
maxshape=maxshape,
@@ -420,7 +430,11 @@ def __init__(
def prepare_data(self, data):
"""Prepare data ready to write to hdf5 dataset
Convert data to raw strings to write to hdf5 dataset
Ensure data array contains the correct string dtype by converting first to bytes
and then to the specific string type. This initial conversion is required if
self.dtype is a variable-length string object and the data is not a string type
because, e.g., np.asarray(ndarray[int], object) leaves the data as int, and int
cannot be implicitly converted to an h5py dataset of dtype 'O' on write.
Args:
data(np.ndarray): Data to format
@@ -431,7 +445,12 @@ def prepare_data(self, data):
"""
data = super(StringHDF5Dataset, self).prepare_data(data)

# Make sure data array contains str
data = np.array(data, dtype=str)
if self.dtype.kind == "O" and data.dtype.kind != "S":
# Ensure data is bytes before converting to variable-length object
data = np.asarray(data, dtype=bytes)

# Convert to specific string dtype - either variable-length object or
# fixed-length string. No copy is performed if the dtype already matches.
data = np.asarray(data, dtype=self.dtype)

return data
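To make the two-step conversion above concrete, here is a small standalone sketch (assuming a variable-length dtype and integer input, as in the variable_utf_int test below) of why non-string data must go through bytes first:

import h5py as h5
import numpy as np

vlen = h5.string_dtype(encoding="utf-8")  # numpy dtype kind "O"

data = np.asarray([2])                    # int64 - not a string type
# Converting straight to the object dtype would leave the elements as ints,
# which h5py cannot write to a string dataset, so go via bytes first:
data = np.asarray(data, dtype=bytes)      # -> array([b"2"], dtype="S...")
data = np.asarray(data, dtype=vlen)       # -> object array of bytes, writable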
93 changes: 83 additions & 10 deletions python/tests/test_hdf5dataset.py
@@ -1,11 +1,11 @@
import os
from unittest import TestCase
import logging
import pytest
import numpy
import os
import time

from odin_data.meta_writer.hdf5dataset import HDF5UnlimitedCache
import h5py as h5

from odin_data.meta_writer.hdf5dataset import HDF5UnlimitedCache, StringHDF5Dataset


class _TestMockDataset:
@@ -32,7 +32,14 @@ def test_unlimited_cache_1D(self):
"""verify that the cache functions as expected"""

# Create a cache
cache = HDF5UnlimitedCache("test1", "int32", -2, 10, 1)
cache = HDF5UnlimitedCache(
name="test1",
dtype="int32",
fillvalue=-2,
block_size=10,
data_shape=(1,),
block_timeout=0.01,
)

# Verify 1 block has been created, with a size of 10
self.assertEqual(len(cache._blocks), 1)
@@ -100,8 +107,8 @@ def test_unlimited_cache_1D(self):
self.assertEqual(ds.values[1][8], -2)
self.assertEqual(ds.values[1][9], -2)

# Now wait for 2 seconds
time.sleep(2.0)
# Now wait
time.sleep(0.1)

# Now write another value into block[2] and then flush once again
cache.add_value(3, 26)
@@ -132,7 +139,14 @@ def test_unlimited_cache_2D(self):
"""verify that the cache functions as expected"""

# Create a cache
cache = HDF5UnlimitedCache("test1", "int32", -2, 10, 1, shape=(2, 3))
cache = HDF5UnlimitedCache(
name="test1",
dtype="int32",
fillvalue=-2,
block_size=10,
data_shape=(2, 3),
block_timeout=0.01,
)

# Verify 1 block has been created, with a size of 10x2x3
self.assertEqual(len(cache._blocks), 1)
@@ -203,8 +217,8 @@ def test_unlimited_cache_2D(self):
self.assertEqual(ds.values[1][1][1][1], -2)
self.assertEqual(ds.values[1][1][1][2], -2)

# Now wait for 2 seconds
time.sleep(2.0)
# Now wait
time.sleep(0.1)

# Now write another value into block[2] and then flush once again
cache.add_value(value, 26)
@@ -245,3 +259,62 @@ def test_unlimited_cache_2D(self):
self.assertEqual(ds.values[0][6][1][0], 4)
self.assertEqual(ds.values[0][6][1][1], 5)
self.assertEqual(ds.values[0][6][1][2], 6)


def test_string_types():
variable_utf = StringHDF5Dataset("variable_utf", encoding="utf-8", cache=False)
variable_ascii = StringHDF5Dataset("variable_ascii", encoding="ascii", cache=False)
fixed_utf = StringHDF5Dataset("fixed_utf", encoding="utf-8", length=9, cache=False)
fixed_ascii = StringHDF5Dataset(
"fixed_ascii", encoding="ascii", length=11, cache=False
)
variable_utf_int = StringHDF5Dataset(
"variable_utf_int", encoding="utf-8", cache=False
)

with h5.File("/tmp/strings.h5", "w") as f:
variable_utf.initialise(
f.create_dataset(
"variable_utf", shape=(1,), maxshape=(1,), dtype=variable_utf.dtype
),
0,
)
variable_ascii.initialise(
f.create_dataset(
"variable_ascii", shape=(1,), maxshape=(1,), dtype=variable_ascii.dtype
),
0,
)
fixed_utf.initialise(
f.create_dataset(
"fixed_utf", shape=(1,), maxshape=(1,), dtype=fixed_utf.dtype
),
0,
)
fixed_ascii.initialise(
f.create_dataset(
"fixed_ascii", shape=(1,), maxshape=(1,), dtype=fixed_ascii.dtype
),
0,
)
variable_utf_int.initialise(
f.create_dataset(
"variable_utf_int", shape=(1,), maxshape=(1,), dtype=variable_utf.dtype
),
0,
)

variable_utf.write("variable_utf")
variable_ascii.write("variable_ascii")
fixed_utf.write("fixed_utf")
fixed_ascii.write("fixed_ascii")
variable_utf_int.write(2) # Check non-strings can be handled

with h5.File("/tmp/strings.h5", "r") as f:
assert f["variable_utf"][0] == b"variable_utf"
assert f["variable_ascii"][0] == b"variable_ascii"
assert f["fixed_utf"][0] == b"fixed_utf"
assert f["fixed_ascii"][0] == b"fixed_ascii"
assert f["variable_utf_int"][0] == b"2"

os.remove("/tmp/strings.h5")
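
As a footnote to the byte-string assertions above: h5py returns string data as bytes by default, and in h5py 3.x the Dataset.asstr() view decodes to str on read. A standalone sketch (hypothetical file, not part of the test):

import h5py as h5

with h5.File("strings.h5", "r") as f:
    raw = f["variable_utf"][0]           # b"variable_utf" - bytes by default
    text = f["variable_utf"].asstr()[0]  # "variable_utf" - decoded to str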
