From be3a8aba184f94abf34a0e667dda638188df058d Mon Sep 17 00:00:00 2001
From: pierce <48131946+pierce314159@users.noreply.github.com>
Date: Fri, 29 Sep 2023 12:34:49 -0400
Subject: [PATCH] Closes #2789: Updates to HDF5 for Index (#2792)

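This PR (closes #2789) makes small updates to the Index HDF5 code:

- change the default `dataset` of `update_hdf` in arkouda/index.py from
  "array" to "index"
- add `test_hdf_overwrite_index`, which exercises `update_hdf` on an Index
  with `repack` both on and off
- run the categorical round trip in `test_multi_index` only when at least
  one of the dtypes is `ak.str_`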

Co-authored-by: Pierce Hayes <pierce314159@users.noreply.github.com>
---
 PROTO_tests/tests/io_test.py | 44 ++++++++++++++++++++++++++++--------
 arkouda/index.py             |  2 +-
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/PROTO_tests/tests/io_test.py b/PROTO_tests/tests/io_test.py
index bbceece964..3d654258a2 100644
--- a/PROTO_tests/tests/io_test.py
+++ b/PROTO_tests/tests/io_test.py
@@ -1303,18 +1303,44 @@ def test_multi_index(self, dtype1, dtype2, size):
 
             assert isinstance(rd_idx, ak.MultiIndex)
             assert idx.to_list() == rd_idx.to_list()
+
         # handle categorical cases as well
-        if dtype1 == ak.str_:
-            t1 = ak.Categorical(t1)
-        if dtype2 == ak.str_:
-            t2 = ak.Categorical(t2)
-        idx = ak.Index.factory([t1, t2])
+        if ak.str_ in [dtype1, dtype2]:
+            if dtype1 == ak.str_:
+                t1 = ak.Categorical(t1)
+            if dtype2 == ak.str_:
+                t2 = ak.Categorical(t2)
+            idx = ak.Index.factory([t1, t2])
+            with tempfile.TemporaryDirectory(dir=TestHDF5.hdf_test_base_tmp) as tmp_dirname:
+                idx.to_hdf(f"{tmp_dirname}/idx_test")
+                rd_idx = ak.read_hdf(f"{tmp_dirname}/idx_test*")
+
+                assert isinstance(rd_idx, ak.MultiIndex)
+                assert idx.to_list() == rd_idx.to_list()
+
+    def test_hdf_overwrite_index(self):
+        # overwrite one Index dataset in a file holding two, with repack on and off
+        a = ak.Index(ak.arange(1000))
+        b = ak.Index(ak.randint(0, 100, 1000))
+        c = ak.Index(ak.arange(15))
         with tempfile.TemporaryDirectory(dir=TestHDF5.hdf_test_base_tmp) as tmp_dirname:
-            idx.to_hdf(f"{tmp_dirname}/idx_test")
-            rd_idx = ak.read_hdf(f"{tmp_dirname}/idx_test*")
+            file_name = f"{tmp_dirname}/idx_test"
+            for repack in [True, False]:
+                a.to_hdf(file_name, dataset="index")
+                b.to_hdf(file_name, dataset="index_2", mode="append")
+                f_list = glob.glob(f"{file_name}*")
+                orig_size = sum(os.path.getsize(f) for f in f_list)
+                # HDF5 only frees space when the last dataset is overwritten, so overwrite the first
+                c.update_hdf(file_name, dataset="index", repack=repack)
 
-            assert isinstance(rd_idx, ak.MultiIndex)
-            assert idx.to_list() == rd_idx.to_list()
+                new_size = sum(os.path.getsize(f) for f in f_list)
+
+                # ensure that the dataset was actually overwritten
+                # with repack the files must shrink; without repack they must not shrink
+                assert new_size < orig_size if repack else new_size >= orig_size
+                data = ak.read_hdf(f"{file_name}*")
+                assert isinstance(data["index"], ak.Index)
+                assert data["index"].to_list() == c.to_list()
 
     def test_special_objtype(self):
         """
diff --git a/arkouda/index.py b/arkouda/index.py
index 69b2514d08..a13c7de5a1 100644
--- a/arkouda/index.py
+++ b/arkouda/index.py
@@ -437,7 +437,7 @@ def to_hdf(
     def update_hdf(
         self,
         prefix_path: str,
-        dataset: str = "array",
+        dataset: str = "index",
         repack: bool = True,
     ):
         """