diff --git a/CHANGELOG.md b/CHANGELOG.md
index e061b342..0bdf24bb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ This project (loosely) adheres to [Semantic Versioning](https://semver.org/spec/
 
 ### Changed
 * Added module name printout to `schedule_deprecation`
+* `ub.hash_data` now supports `datetime.datetime`, `datetime.date`, `decimal.Decimal` objects.
 
 
 ## Version 1.3.6 - Released 2024-06-08
diff --git a/tests/test_hash.py b/tests/test_hash.py
index b41c65fa..9e70bf34 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -319,9 +319,7 @@ def test_numpy_random_state():
     if np is None:
         pytest.skip('requires numpy')
     data = np.random.RandomState(0)
-    # assert ub.hash_data(data).startswith('ujsidscotcycsqwnkxgbsxkcedplzvytmfmr')
     assert ub.hash_data(data, hasher='sha512', types=True, base='abc').startswith('snkngbxghabesvowzalqtvdvjtvslmxve')
-    # _hashable_sequence(data)
 
 
 def test_uuid():
@@ -335,6 +333,64 @@ def test_uuid():
         'the hash should be equal when ignoring types')
 
 
+def test_decimal():
+    """Check the hashable sequence and hash prefix of a decimal.Decimal."""
+    import decimal
+    data = decimal.Decimal('3.1415')
+    sequence = b''.join(_hashable_sequence(data, types=True))
+    assert sequence == b'DECIMAL_[_INT\x00_,__[_INT\x03_,_INT\x01_,_INT\x04_,_INT\x01_,_INT\x05_,__]_INT\xfc_,__]_'
+    assert ub.hash_data(data, types=True, base='abc', hasher='sha512').startswith('oquwtvtrsytm')
+    assert ub.hash_data(data.as_tuple(), types=True) != ub.hash_data(data, types=True), (
+        'the fact that it is a Decimal should reflect in the hash')
+    assert ub.hash_data(data.as_tuple(), types=True) == ub.hash_data(data, types=False), (
+        'it is a quirk of our hashable extensions that a typed decimal '
+        'tuple will be the same as an untyped decimal. '
+        'It is ok to break this test if we refactor to fix issues in '
+        'hashable extensions'
+    )
+    sequence1 = b''.join(_hashable_sequence(data, types=True))
+    sequence2 = b''.join(_hashable_sequence(data, types=False))
+    sequence3 = b''.join(_hashable_sequence(data.as_tuple(), types=True))
+    sequence4 = b''.join(_hashable_sequence(data.as_tuple(), types=False))
+    assert sequence1 != sequence2, 'quirky test'
+    assert sequence2 == sequence3, 'quirky test'
+    assert sequence4 != sequence3, 'quirky test'
+
+
+def test_datetime():
+    """Check the hashable sequence and hash prefix of a datetime.datetime."""
+    import datetime as datetime_mod
+    data = datetime_mod.datetime(2101, 1, 1)
+    sequence = b''.join(_hashable_sequence(data, types=True))
+    assert sequence == b'DATETIME_[_INT\x085_,_INT\x01_,_INT\x01_,_INT\x00_,_INT\x00_,_INT\x00_,_INT\x05_,_INT\x01_,_INT\xff_,__]_'
+    assert ub.hash_data(data, types=True, base='abc', hasher='sha512').startswith('fwjyfdtgcdasv')
+    assert ub.hash_data(data.timetuple(), types=True) != ub.hash_data(data, types=True), (
+        'the fact that it is a datetime should reflect in the hash')
+    assert ub.hash_data(data.timetuple(), types=True) == ub.hash_data(data, types=False), (
+        'it is a quirk of our hashable extensions that a typed datetime '
+        'tuple will be the same as an untyped datetime. '
+        'It is ok to break this test if we refactor to fix issues in '
+        'hashable extensions'
+    )
+
+
+def test_date():
+    """Check the hashable sequence and hash prefix of a datetime.date."""
+    import datetime as datetime_mod
+    data = datetime_mod.date(2101, 1, 1)
+    sequence = b''.join(_hashable_sequence(data, types=True))
+    assert sequence == b'DATE_[_INT\x085_,_INT\x01_,_INT\x01_,_INT\x00_,_INT\x00_,_INT\x00_,_INT\x05_,_INT\x01_,_INT\xff_,__]_'
+    assert ub.hash_data(data, types=True, base='abc', hasher='sha512').startswith('dlahlcoqypecc')
+    assert ub.hash_data(data.timetuple(), types=True) != ub.hash_data(data, types=True), (
+        'the fact that it is a date should reflect in the hash')
+    assert ub.hash_data(data.timetuple(), types=True) == ub.hash_data(data, types=False), (
+        'it is a quirk of our hashable extensions that a typed date '
+        'tuple will be the same as an untyped date. '
+        'It is ok to break this test if we refactor to fix issues in '
+        'hashable extensions'
+    )
+
+
 def test_hash_data_custom_base():
     data = 1
     # A larger base means the string can be shorter
diff --git a/ubelt/util_hash.py b/ubelt/util_hash.py
index 2c99f5ea..9616f7e0 100644
--- a/ubelt/util_hash.py
+++ b/ubelt/util_hash.py
@@ -123,6 +123,9 @@ def b(s):
 
 DEFAULT_HASHER = hashlib.sha512  # type: Callable
 
+# This controls if types are used when generating hashable sequences for more
+# complex objects. Currently there is no way for the user to control this, and
+# that might need to be addressed, but it will require some thought.
 _COMPATIBLE_HASHABLE_SEQUENCE_TYPES_DEFAULT = True
 
 
@@ -558,9 +561,12 @@ def lookup(self, data):
         # of strictly using this registry.
         hash_func = self._hash_dispatch.dispatch(query_hash_type)
         if getattr(hash_func, '__is_base__', False):
-            raise TypeError(
-                'No registered hash func for hashable type={!r}'.format(
-                    query_hash_type))
+            base_msg = f'No registered hash func for hashable type={query_hash_type!r}'
+            try:
+                msg = f'{base_msg} with mro: {query_hash_type.__mro__}'
+            except AttributeError:
+                msg = base_msg
+            raise TypeError(msg)
         return hash_func
 
     def add_iterable_check(self, func):
@@ -726,9 +732,14 @@ def _register_builtin_class_extensions(self):
             cc21b9fa
             bd1cabd0
         """
+        # TODO: can we only register a stdlib class if we need it?
+        # Some of the stdlib modules don't need to be imported and
+        # cause extra import time overhead.
         import uuid
         import pathlib
         import numbers
+        import decimal
+        import datetime as datetime_mod
 
         @self.register(numbers.Integral)
         def _convert_numpy_int(data):
@@ -738,6 +749,40 @@ def _convert_numpy_int(data):
         def _convert_numpy_float(data):
             return _convert_to_hashable(float(data), extensions=self)
 
+        @self.register(decimal.Decimal)
+        def _convert_decimal(data):
+            # Hash the (sign, digits, exponent) tuple under a DECIMAL prefix.
+            seq = _hashable_sequence(
+                data.as_tuple(),
+                extensions=self,
+                types=_COMPATIBLE_HASHABLE_SEQUENCE_TYPES_DEFAULT)
+            hashable = b''.join(seq)
+            prefix = b'DECIMAL'
+            return prefix, hashable
+
+        @self.register(datetime_mod.date)
+        def _convert_date(data):
+            # Hash the 9-field time tuple under a DATE prefix.
+            seq = _hashable_sequence(
+                data.timetuple(),
+                extensions=self,
+                types=_COMPATIBLE_HASHABLE_SEQUENCE_TYPES_DEFAULT)
+            hashable = b''.join(seq)
+            prefix = b'DATE'
+            return prefix, hashable
+
+        @self.register(datetime_mod.datetime)
+        def _convert_datetime(data):
+            # NOTE: timetuple() carries no microsecond or UTC-offset fields, so
+            # datetimes differing only in those hash identically.
+            seq = _hashable_sequence(
+                data.timetuple(),
+                extensions=self,
+                types=_COMPATIBLE_HASHABLE_SEQUENCE_TYPES_DEFAULT)
+            hashable = b''.join(seq)
+            prefix = b'DATETIME'
+            return prefix, hashable
+
         @self.register(uuid.UUID)
         def _convert_uuid(data):
             hashable = data.bytes