Skip to content

Commit

Permalink
optimize ndarray_compare for non-vlen arrays
Browse files Browse the repository at this point in the history
  • Loading branch information
jreadey committed Jul 12, 2023
1 parent 073c9ec commit 80e74e2
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 68 deletions.
66 changes: 66 additions & 0 deletions hsds/util/arrayUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,3 +607,69 @@ def __next__(self):
self._stop = True

return tuple(ret_index)


# compare two numpy arrays.
# return true if the same (exclusive of null vs. empty array)
# false otherwise
# TBD: this is slow for multi-megabyte vlen arrays, needs to be optimized


def ndarray_compare(arr1, arr2):
    """Return whether arr1 and arr2 hold the same values.

    Each argument may be an np.ndarray, an np.void record, or any other
    object.  A value whose ``size`` is 0 is considered the same as a falsy
    non-array value on the other side (null vs. empty array).

    Two ndarrays are the same only if their shapes and dtypes match and
    their contents are equal.  Arrays whose dtype is reported as
    variable-length by ``isVlen`` (a helper expected to be in scope in this
    module) are compared element by element; all other arrays are compared
    with ``np.array_equal``.

    When neither argument is an ndarray or np.void, the result of
    ``arr1 == arr2`` is returned unchanged.
    """
    arr1_is_array = isinstance(arr1, np.ndarray)
    arr2_is_array = isinstance(arr2, np.ndarray)

    if not arr1_is_array and not arr2_is_array:
        arr1_is_void = isinstance(arr1, np.void)
        arr2_is_void = isinstance(arr2, np.void)
        if not arr1_is_void and not arr2_is_void:
            return arr1 == arr2
        if arr1_is_void and not arr2_is_void:
            return arr1.size == 0 and not arr2
        if not arr1_is_void and arr2_is_void:
            return not arr1 and arr2.size == 0

        # both np.voids - compare field by field
        if arr1.size != arr2.size:
            return False
        if len(arr1) != len(arr2):
            return False
        return all(
            ndarray_compare(arr1[i], arr2[i]) for i in range(len(arr1))
        )

    if arr1_is_array and not arr2_is_array:
        # same only if arr1 is empty and arr2 is falsy (e.g. None or 0)
        return arr1.size == 0 and not arr2
    if not arr1_is_array and arr2_is_array:
        # same only if arr1 is falsy and arr2 is empty
        return not arr1 and arr2.size == 0

    # two ndarrays...
    if arr1.shape != arr2.shape:
        return False
    # compare arr1 against arr2 (comparing arr2 with itself never fails)
    if arr1.dtype != arr2.dtype:
        return False

    if isVlen(arr1.dtype):
        # need to compare element by element
        # use .size rather than np.prod(shape): np.prod(()) is the float 1.0,
        # which reshape and range reject for 0-d arrays
        nElements = arr1.size
        flat1 = arr1.reshape((nElements,))
        flat2 = arr2.reshape((nElements,))
        for i in range(nElements):
            if not ndarray_compare(flat1[i], flat2[i]):
                return False
        return True

    # fixed-size elements: can just use np.array_equal
    return np.array_equal(arr1, arr2)
59 changes: 1 addition & 58 deletions hsds/util/chunkUtil.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
from .. import hsds_logger as log
from .arrayUtil import ndarray_compare

CHUNK_BASE = 16 * 1024 # Multiplier by which chunks are adjusted
CHUNK_MIN = 512 * 1024 # Soft lower limit (512k)
Expand All @@ -8,64 +9,6 @@
PRIMES = [29, 31, 37, 41, 43, 47, 53, 59, 61, 67] # for chunk partitioning


# compare two numpy arrays.
# return true if the same (exclusive of null vs. empty array)
# false otherwise


def ndarray_compare(arr1, arr2):
    """Return whether arr1 and arr2 hold the same values.

    Each argument may be an np.ndarray, an np.void record, or any other
    object.  A value whose ``size`` is 0 is considered the same as a falsy
    non-array value on the other side (null vs. empty array).

    Two ndarrays are the same only if their shapes and dtypes match and
    every pair of corresponding elements compares as the same (elements are
    compared recursively with this function).

    When neither argument is an ndarray or np.void, the result of
    ``arr1 == arr2`` is returned unchanged.
    """
    arr1_is_array = isinstance(arr1, np.ndarray)
    arr2_is_array = isinstance(arr2, np.ndarray)

    if not arr1_is_array and not arr2_is_array:
        arr1_is_void = isinstance(arr1, np.void)
        arr2_is_void = isinstance(arr2, np.void)
        if not arr1_is_void and not arr2_is_void:
            return arr1 == arr2
        if arr1_is_void and not arr2_is_void:
            return arr1.size == 0 and not arr2
        if not arr1_is_void and arr2_is_void:
            return not arr1 and arr2.size == 0

        # both np.voids - compare field by field
        if arr1.size != arr2.size:
            return False
        if len(arr1) != len(arr2):
            return False
        return all(
            ndarray_compare(arr1[i], arr2[i]) for i in range(len(arr1))
        )

    if arr1_is_array and not arr2_is_array:
        # same only if arr1 is empty and arr2 is falsy (e.g. None or 0)
        return arr1.size == 0 and not arr2
    if not arr1_is_array and arr2_is_array:
        # same only if arr1 is falsy and arr2 is empty
        return not arr1 and arr2.size == 0

    # two ndarrays...
    if arr1.shape != arr2.shape:
        return False
    # compare arr1 against arr2 (comparing arr2 with itself never fails)
    if arr1.dtype != arr2.dtype:
        return False

    # use .size rather than np.prod(shape): np.prod(()) is the float 1.0,
    # which reshape and range reject for 0-d arrays
    nElements = arr1.size
    flat1 = arr1.reshape((nElements,))
    flat2 = arr2.reshape((nElements,))
    for i in range(nElements):
        if not ndarray_compare(flat1[i], flat2[i]):
            return False
    return True


"""
Convert list that may contain bytes type elements to list of string elements
Expand Down
8 changes: 1 addition & 7 deletions tests/integ/vlen_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,7 @@ def testPutVLenInt(self):
self.assertTrue("value" in rspJson)
value = rspJson["value"]
self.assertEqual(len(value), 4)
print("value:", value)
print("data:", data)

for i in range(4):
self.assertEqual(value[i], data[i])

Expand Down Expand Up @@ -222,9 +221,6 @@ def testPutVLenIntBinary(self):
for i in range(count):
self.assertEqual(value[i], test_values[i])

print("data:", data)
print("arr:", arr)

# read back a selection
params = {"select": "[2:3]"}
rsp = self.session.get(req, headers=headers, params=params)
Expand Down Expand Up @@ -295,8 +291,6 @@ def testPutVLen2DInt(self):
self.assertTrue("value" in rspJson)
value = rspJson["value"]
self.assertEqual(len(value), nrow)
print("value:", value)
print("data:", data)

for i in range(nrow):
for j in range(ncol):
Expand Down
35 changes: 32 additions & 3 deletions tests/unit/array_util_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@
arrayToBytes,
bytesToArray,
getByteArraySize,
IndexIterator
IndexIterator,
ndarray_compare
)
from hsds.util.hdf5dtype import special_dtype
from hsds.util.hdf5dtype import check_dtype
from hsds.util.hdf5dtype import createDataType
from hsds.util.chunkUtil import ndarray_compare


class ArrayUtilTest(unittest.TestCase):
Expand Down Expand Up @@ -308,7 +308,7 @@ def testToBytes(self):
buffer = arrayToBytes(arr)
self.assertEqual(buffer, arr.tobytes())

# convert back to arry
# convert back to array
arr_copy = bytesToArray(buffer, dt, (3,))
self.assertTrue(ndarray_compare(arr, arr_copy))

Expand Down Expand Up @@ -483,6 +483,35 @@ def testToBytes(self):
arr_copy = bytesToArray(buffer, dt, (4,))
self.assertTrue(ndarray_compare(arr, arr_copy))

def testArrayCompareInt(self):
    """Check ndarray_compare on two 1024x1024 little-endian int32 arrays."""
    shape = (1024, 1024)
    int_dt = np.dtype("<i4")
    left = np.zeros(shape, dtype=int_dt)
    right = np.zeros(shape, dtype=int_dt)

    # all-zero arrays must compare as the same; the check is run 100 times
    remaining = 100
    while remaining > 0:
        self.assertTrue(ndarray_compare(left, right))
        remaining -= 1

    # a single differing element must be detected
    left[123, 456] = 42
    self.assertFalse(ndarray_compare(left, right))

def testArrayCompareVlenInt(self):
    """Check ndarray_compare on compound arrays that carry a vlen int32 field."""
    vlen_int_dt = np.dtype("O", metadata={"vlen": "int32"})
    compound_dt = np.dtype([("x", "int32"), ("tag", vlen_int_dt)])
    shape = (1024, 1024)
    left = np.zeros(shape, dtype=compound_dt)
    right = np.zeros(shape, dtype=compound_dt)

    # one record with an empty vlen value, one with three vlen values
    empty_rec = (42, np.array((), dtype="int32"))
    filled_rec = (84, np.array((1, 2, 3), dtype="int32"))
    for target in (left, right):
        target[123, 456] = empty_rec
    for target in (left, right):
        target[888, 999] = filled_rec

    # performance is marginal for this case, so only a single pass is made
    passes = 1
    for _ in range(passes):
        self.assertTrue(ndarray_compare(left, right))

    # replacing one record in the second array must be detected
    right[123, 456] = filled_rec
    self.assertFalse(ndarray_compare(left, right))

def testJsonToBytes(self):
#
# VLEN int
Expand Down

0 comments on commit 80e74e2

Please sign in to comment.