From be3a8aba184f94abf34a0e667dda638188df058d Mon Sep 17 00:00:00 2001 From: pierce <48131946+pierce314159@users.noreply.github.com> Date: Fri, 29 Sep 2023 12:34:49 -0400 Subject: [PATCH] Closes #2789: Updates to HDF5 for Index (#2792) This PR (closes #2789) makes small updates to the index hdf5 code Co-authored-by: Pierce Hayes --- PROTO_tests/tests/io_test.py | 44 ++++++++++++++++++++++++++++-------- arkouda/index.py | 2 +- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/PROTO_tests/tests/io_test.py b/PROTO_tests/tests/io_test.py index bbceece964..3d654258a2 100644 --- a/PROTO_tests/tests/io_test.py +++ b/PROTO_tests/tests/io_test.py @@ -1303,18 +1303,44 @@ def test_multi_index(self, dtype1, dtype2, size): assert isinstance(rd_idx, ak.MultiIndex) assert idx.to_list() == rd_idx.to_list() + # handle categorical cases as well - if dtype1 == ak.str_: - t1 = ak.Categorical(t1) - if dtype2 == ak.str_: - t2 = ak.Categorical(t2) - idx = ak.Index.factory([t1, t2]) + if ak.str_ in [dtype1, dtype2]: + if dtype1 == ak.str_: + t1 = ak.Categorical(t1) + if dtype2 == ak.str_: + t2 = ak.Categorical(t2) + idx = ak.Index.factory([t1, t2]) + with tempfile.TemporaryDirectory(dir=TestHDF5.hdf_test_base_tmp) as tmp_dirname: + idx.to_hdf(f"{tmp_dirname}/idx_test") + rd_idx = ak.read_hdf(f"{tmp_dirname}/idx_test*") + + assert isinstance(rd_idx, ak.MultiIndex) + assert idx.to_list() == rd_idx.to_list() + + def test_hdf_overwrite_index(self): + # test repack with a single object + a = ak.Index(ak.arange(1000)) + b = ak.Index(ak.randint(0, 100, 1000)) + c = ak.Index(ak.arange(15)) with tempfile.TemporaryDirectory(dir=TestHDF5.hdf_test_base_tmp) as tmp_dirname: - idx.to_hdf(f"{tmp_dirname}/idx_test") - rd_idx = ak.read_hdf(f"{tmp_dirname}/idx_test*") + file_name = f"{tmp_dirname}/idx_test" + for repack in [True, False]: + a.to_hdf(file_name, dataset="index") + b.to_hdf(file_name, dataset="index_2", mode="append") + f_list = glob.glob(f"{file_name}*") + orig_size = 
sum(os.path.getsize(f) for f in f_list) + # hdf5 only releases memory when the last dataset is overwritten, so overwrite the first one instead + c.update_hdf(file_name, dataset="index", repack=repack) - assert isinstance(rd_idx, ak.MultiIndex) - assert idx.to_list() == rd_idx.to_list() + new_size = sum(os.path.getsize(f) for f in f_list) + + # ensure that the column was actually overwritten + # with repack the file must shrink; without repack it must not shrink + assert new_size < orig_size if repack else new_size >= orig_size + data = ak.read_hdf(f"{file_name}*") + assert isinstance(data["index"], ak.Index) + assert data["index"].to_list() == c.to_list() def test_special_objtype(self): """ diff --git a/arkouda/index.py b/arkouda/index.py index 69b2514d08..a13c7de5a1 100644 --- a/arkouda/index.py +++ b/arkouda/index.py @@ -437,7 +437,7 @@ def to_hdf( def update_hdf( self, prefix_path: str, - dataset: str = "array", + dataset: str = "index", repack: bool = True, ): """