Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Internal Backend Hooks and Behavior Extensions #152

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions src/hangar/backends/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from .selection import BACKEND_ACCESSOR_MAP
from .selection import backend_decoder
from .selection import backend_from_heuristics
from .selection import is_local_backend
from .selection import backend_opts_from_heuristics
from .selection import parse_user_backend_opts

__all__ = [
'BACKEND_ACCESSOR_MAP', 'backend_decoder', 'backend_from_heuristics',
'is_local_backend', 'backend_opts_from_heuristics', 'parse_user_backend_opts'
'BACKEND_ACCESSOR_MAP', 'backend_decoder', 'is_local_backend',
'backend_opts_from_heuristics', 'parse_user_backend_opts'
]
40 changes: 40 additions & 0 deletions src/hangar/backends/hdf5_00.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,46 @@ def hdf5_00_decode(db_val: bytes) -> HDF5_00_DataHashSpec:
return raw_val


# -------------------------- Filter Heuristics --------------------------------


def hdf5_00_heuristic_filter_opts(prototype: np.ndarray) -> dict:
    """generate default filter options from a prototype array

    Prefers the blosc:zstd compressor when the hdf5 blosc filter plugin is
    installed, falling back to the always-available lzf filter otherwise.

    Parameters
    ----------
    prototype : :class:`numpy.ndarray`
        sample array of expected shape and datatype (currently unused; kept so
        every backend heuristic function shares the same signature)

    Returns
    -------
    dict
        mapping containing default filter opts that the hdf5_00 storage manager
        will accept.

    TODO
    ----

    Do something with the prototype arrays, or get rid of the argument, it's
    just taking up space at this point.
    """
    # 32001 is the registered HDF5 filter id of the blosc compressor plugin.
    if h5py.h5z.filter_avail(32001):
        return {
            'shuffle': None,
            'complib': 'blosc:zstd',
            'complevel': 3,
        }
    # lzf ships with h5py itself, so it is always a safe fallback.
    return {
        'shuffle': 'byte',
        'complib': 'lzf',
        'complevel': None,
    }


# ------------------------- Accessor Object -----------------------------------


Expand Down
40 changes: 40 additions & 0 deletions src/hangar/backends/hdf5_01.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,46 @@ def hdf5_01_decode(db_val: bytes) -> HDF5_01_DataHashSpec:
return raw_val


# -------------------------- Filter Heuristics --------------------------------


def hdf5_01_heuristic_filter_opts(prototype: np.ndarray) -> dict:
    """generate default filter options from a prototype array

    Prefers the blosc:lz4hc compressor when the hdf5 blosc filter plugin is
    installed, falling back to the always-available lzf filter otherwise.

    Parameters
    ----------
    prototype : :class:`numpy.ndarray`
        sample array of expected shape and datatype (currently unused; kept so
        every backend heuristic function shares the same signature)

    Returns
    -------
    dict
        mapping containing default filter opts that the hdf5_01 storage manager
        will accept.

    TODO
    ----

    Do something with the prototype arrays, or get rid of the argument, it's
    just taking up space at this point.
    """
    # 32001 is the registered HDF5 filter id of the blosc compressor plugin.
    if h5py.h5z.filter_avail(32001):
        return {
            'shuffle': 'byte',
            'complib': 'blosc:lz4hc',
            'complevel': 5,
        }
    # lzf ships with h5py itself, so it is always a safe fallback.
    return {
        'shuffle': 'byte',
        'complib': 'lzf',
        'complevel': None,
    }


# ------------------------- Accessor Object -----------------------------------


Expand Down
27 changes: 27 additions & 0 deletions src/hangar/backends/numpy_10.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,33 @@ def numpy_10_decode(db_val: bytes) -> NUMPY_10_DataHashSpec:
return raw_val


# -------------------------- Filter Heuristics --------------------------------


def numpy_10_heuristic_filter_opts(prototype: np.ndarray) -> dict:
    """generate default filter options from a prototype array

    Parameters
    ----------
    prototype : :class:`numpy.ndarray`
        sample array of expected shape and datatype (currently unused; kept so
        every backend heuristic function shares the same signature)

    Returns
    -------
    dict
        mapping containing default filter opts that the numpy_10 storage manager
        will accept.

    TODO
    ----
    * Implement at rest compression of the memmap file? Gzip or something?

    * Do something with the prototype arrays, or get rid of the argument, it's
      just taking up space at this point.
    """
    # The memmap backend applies no compression filters, so there is nothing
    # to configure yet.
    return {}

# ------------------------- Accessor Object -----------------------------------


Expand Down
27 changes: 27 additions & 0 deletions src/hangar/backends/remote_50.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,33 @@ def remote_50_decode(db_val: bytes) -> REMOTE_50_DataHashSpec:
return raw_val


# -------------------------- Filter Heuristics --------------------------------


def remote_50_heuristic_filter_opts(prototype: np.ndarray) -> dict:
    """generate default filter options from a prototype array

    Parameters
    ----------
    prototype : :class:`numpy.ndarray`
        sample array of expected shape and datatype (currently unused; kept so
        every backend heuristic function shares the same signature)

    Returns
    -------
    dict
        mapping containing default filter opts that the remote_50 storage manager
        will accept.

    TODO
    ----
    * Is this even necessary? This method doesn't really even store anything,
      it has no use for filter opts, but it is included now just to stay
      symmetric across all the backends.
    """
    # Remote references store no data locally, so no filter opts apply.
    return {}


# ------------------------- Accessor Object -----------------------------------


Expand Down
103 changes: 46 additions & 57 deletions src/hangar/backends/selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,18 +83,27 @@
reaching out to the Hangar core development team so we can guide you through the
process.
"""
from typing import Dict, Union, Callable, Mapping, NamedTuple, Optional
from typing import Callable, Dict, Mapping, NamedTuple, Optional, Union

import numpy as np
import pkg_resources

from .hdf5_00 import HDF5_00_FileHandles, hdf5_00_decode, HDF5_00_DataHashSpec
from .hdf5_01 import HDF5_01_FileHandles, hdf5_01_decode, HDF5_01_DataHashSpec
from .numpy_10 import NUMPY_10_FileHandles, numpy_10_decode, NUMPY_10_DataHashSpec
from .remote_50 import REMOTE_50_Handler, remote_50_decode, REMOTE_50_DataHashSpec
from .hdf5_00 import (HDF5_00_DataHashSpec, HDF5_00_FileHandles,
hdf5_00_decode, hdf5_00_heuristic_filter_opts)
from .hdf5_01 import (HDF5_01_DataHashSpec, HDF5_01_FileHandles,
hdf5_01_decode, hdf5_01_heuristic_filter_opts)
from .numpy_10 import (NUMPY_10_DataHashSpec, NUMPY_10_FileHandles,
numpy_10_decode, numpy_10_heuristic_filter_opts)
from .remote_50 import (REMOTE_50_DataHashSpec, REMOTE_50_Handler,
remote_50_decode, remote_50_heuristic_filter_opts)


# ---------------------- Custom Backend Plugin Discovery -----------------------
# Discover third-party storage backends advertised under the
# ``hangar.backends`` entry point group; each loaded module is later merged
# into the selection maps keyed by the entry point's name.
custom_backends = {}
for entry_point in pkg_resources.iter_entry_points('hangar.backends'):
    custom_backends[entry_point.name] = entry_point.load()


# -------------------------- Parser Types and Mapping -------------------------

_DataHashSpecs = Union[
HDF5_00_DataHashSpec,
Expand All @@ -103,7 +112,6 @@
REMOTE_50_DataHashSpec]

_ParserMap = Mapping[bytes, Callable[[bytes], _DataHashSpecs]]

BACKEND_DECODER_MAP: _ParserMap = {
# LOCALS -> [00:50]
b'00': hdf5_00_decode,
Expand All @@ -122,19 +130,32 @@
_BeAccessors = Union[HDF5_00_FileHandles, HDF5_01_FileHandles,
NUMPY_10_FileHandles, REMOTE_50_Handler]
_AccessorMap = Dict[str, _BeAccessors]

BACKEND_ACCESSOR_MAP: _AccessorMap = {
# LOCALS -> [0:50]
'00': HDF5_00_FileHandles,
'01': HDF5_01_FileHandles,
'10': NUMPY_10_FileHandles,
'20': None, # tiledb_20 - Reserved
# REMOTES -> [50:100]
'50': REMOTE_50_Handler,
'60': None, # url_60 - Reserved
}


# Maps backend format code -> callable producing default filter options for
# that backend (each takes a prototype ndarray and returns a dict).
BACKEND_HEURISTIC_FILTER_OPTS_MAP = {
    '00': hdf5_00_heuristic_filter_opts,
    '01': hdf5_01_heuristic_filter_opts,
    '10': numpy_10_heuristic_filter_opts,
    '50': remote_50_heuristic_filter_opts,
}


# Register every discovered custom backend into the selection maps. Each
# plugin module is expected to expose `_FmtCode` plus attributes following
# the naming convention `<NAME>_DataHashSpec`, `<name>_decode`,
# `<name>_heuristic_filter_opts` and `<NAME>_FileHandles` (derived from the
# entry point name via upper()/lower() below).
# NOTE(review): a plugin missing any of these attributes will raise
# AttributeError at import time — presumably intentional fail-fast; confirm.
for k, v in custom_backends.items():
    fmt_code = getattr(v, '_FmtCode')
    _DataHashSpecs = Union[_DataHashSpecs, getattr(v, f'{k.upper()}_DataHashSpec')]
    BACKEND_DECODER_MAP[fmt_code.encode()] = getattr(v, f'{k.lower()}_decode')
    BACKEND_HEURISTIC_FILTER_OPTS_MAP[fmt_code] = getattr(v, f'{k.lower()}_heuristic_filter_opts')
    BACKEND_ACCESSOR_MAP[f'{fmt_code}'] = getattr(v, f'{k.upper()}_FileHandles')


# ------------------------ Selector Functions ---------------------------------


Expand Down Expand Up @@ -164,9 +185,9 @@ def backend_decoder(db_val: bytes) -> _DataHashSpecs:
BackendOpts = NamedTuple('BackendOpts', [('backend', str), ('opts', dict)])


def backend_from_heuristics(array: np.ndarray,
named_samples: bool,
variable_shape: bool) -> str:
def _backend_from_proto_heuristics(array: np.ndarray,
named_samples: bool,
variable_shape: bool) -> str:
"""Given a prototype array, attempt to select the appropriate backend.

Parameters
Expand All @@ -185,7 +206,10 @@ def backend_from_heuristics(array: np.ndarray,

TODO
----
Configuration of this entire module as the available backends fill out.
* Need to have each backend report some type of score based on the array
prototype, otherwise this is going to be a mess.
* At the current implemention, this will never actually pick one of the
custom backends / heuristics.
"""
# uncompressed numpy memmap data is most appropriate for data whose shape is
# likely small tabular row data (CSV or such...)
Expand Down Expand Up @@ -243,45 +267,10 @@ def backend_opts_from_heuristics(backend: str,
In the current implementation, the `array` parameter is unused. Either come
up with a use or remove it from the parameter list.
"""
if backend == '10':
opts = {}
elif backend == '00':
import h5py
opts = {
'default': {
'shuffle': None,
'complib': 'blosc:zstd',
'complevel': 3,
},
'backup': {
'shuffle': 'byte',
'complib': 'lzf',
'complevel': None,
},
}
hdf5BloscAvail = h5py.h5z.filter_avail(32001)
opts = opts['default'] if hdf5BloscAvail else opts['backup']
elif backend == '01':
import h5py
opts = {
'default': {
'shuffle': 'byte',
'complib': 'blosc:lz4hc',
'complevel': 5,
},
'backup': {
'shuffle': 'byte',
'complib': 'lzf',
'complevel': None,
},
}
hdf5BloscAvail = h5py.h5z.filter_avail(32001)
opts = opts['default'] if hdf5BloscAvail else opts['backup']
elif backend == '50':
opts = {}
else:
raise ValueError('Should not have been able to not select backend')

if backend not in BACKEND_HEURISTIC_FILTER_OPTS_MAP:
raise ValueError(f'Selected backend: {backend} is not available.')
func = BACKEND_HEURISTIC_FILTER_OPTS_MAP[backend]
opts = func(array)
return opts


Expand Down Expand Up @@ -338,14 +327,14 @@ def parse_user_backend_opts(backend_opts: Optional[Union[str, dict]],
backend = backend_opts['backend']
opts = {k: v for k, v in backend_opts.items() if k != 'backend'}
elif backend_opts is None:
backend = backend_from_heuristics(array=prototype,
named_samples=named_samples,
variable_shape=variable_shape)
backend = _backend_from_proto_heuristics(array=prototype,
named_samples=named_samples,
variable_shape=variable_shape)
opts = backend_opts_from_heuristics(backend=backend,
array=prototype,
named_samples=named_samples,
variable_shape=variable_shape)
else:
raise ValueError(f'Backend opts value: {backend_opts} is invalid')

return BackendOpts(backend=backend, opts=opts)
return BackendOpts(backend=backend, opts=opts)
9 changes: 6 additions & 3 deletions src/hangar/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,21 @@
import os
import time
from pathlib import Path
import warnings
from pkg_resources import iter_entry_points

import click
import numpy as np

from hangar import Repository, __version__

from .utils import parse_custom_arguments, StrOrIntType
from hangar import Repository
from hangar import __version__
from hangar.cli.utils import with_plugins, parse_custom_arguments, StrOrIntType


pass_repo = click.make_pass_decorator(Repository, ensure=True)


@with_plugins(iter_entry_points('hangar.cli.plugins'))
@click.group(no_args_is_help=True, add_help_option=True, invoke_without_command=True)
@click.version_option(version=__version__, help='display current Hangar Version')
@click.pass_context
Expand Down
Loading