Skip to content

Commit

Permalink
Closes Bears-R-Us#2789: Updates to HDF5 for Index (Bears-R-Us#2792)
Browse files Browse the repository at this point in the history
This PR (closes Bears-R-Us#2789) makes small updates to the Index HDF5 code.

Co-authored-by: Pierce Hayes <pierce314159@users.noreply.github.com>
  • Loading branch information
stress-tess and Pierce Hayes authored Sep 29, 2023
1 parent 078a8d7 commit be3a8ab
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 10 deletions.
44 changes: 35 additions & 9 deletions PROTO_tests/tests/io_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1303,18 +1303,44 @@ def test_multi_index(self, dtype1, dtype2, size):

assert isinstance(rd_idx, ak.MultiIndex)
assert idx.to_list() == rd_idx.to_list()

# handle categorical cases as well
if dtype1 == ak.str_:
t1 = ak.Categorical(t1)
if dtype2 == ak.str_:
t2 = ak.Categorical(t2)
idx = ak.Index.factory([t1, t2])
if ak.str_ in [dtype1, dtype2]:
if dtype1 == ak.str_:
t1 = ak.Categorical(t1)
if dtype2 == ak.str_:
t2 = ak.Categorical(t2)
idx = ak.Index.factory([t1, t2])
with tempfile.TemporaryDirectory(dir=TestHDF5.hdf_test_base_tmp) as tmp_dirname:
idx.to_hdf(f"{tmp_dirname}/idx_test")
rd_idx = ak.read_hdf(f"{tmp_dirname}/idx_test*")

assert isinstance(rd_idx, ak.MultiIndex)
assert idx.to_list() == rd_idx.to_list()

def test_hdf_overwrite_index(self):
# test repack with a single object
a = ak.Index(ak.arange(1000))
b = ak.Index(ak.randint(0, 100, 1000))
c = ak.Index(ak.arange(15))
with tempfile.TemporaryDirectory(dir=TestHDF5.hdf_test_base_tmp) as tmp_dirname:
idx.to_hdf(f"{tmp_dirname}/idx_test")
rd_idx = ak.read_hdf(f"{tmp_dirname}/idx_test*")
file_name = f"{tmp_dirname}/idx_test"
for repack in [True, False]:
a.to_hdf(file_name, dataset="index")
b.to_hdf(file_name, dataset="index_2", mode="append")
f_list = glob.glob(f"{file_name}*")
orig_size = sum(os.path.getsize(f) for f in f_list)
# HDF5 only releases memory when the last dataset is overwritten, so overwrite the first one
c.update_hdf(file_name, dataset="index", repack=repack)

assert isinstance(rd_idx, ak.MultiIndex)
assert idx.to_list() == rd_idx.to_list()
new_size = sum(os.path.getsize(f) for f in f_list)

# ensure that the column was actually overwritten
# test that the file gets smaller with repack on and does not shrink with repack off
assert new_size < orig_size if repack else new_size >= orig_size
data = ak.read_hdf(f"{file_name}*")
assert isinstance(data["index"], ak.Index)
assert data["index"].to_list() == c.to_list()

def test_special_objtype(self):
"""
Expand Down
2 changes: 1 addition & 1 deletion arkouda/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ def to_hdf(
def update_hdf(
self,
prefix_path: str,
dataset: str = "array",
dataset: str = "index",
repack: bool = True,
):
"""
Expand Down

0 comments on commit be3a8ab

Please sign in to comment.