From 6f64e563d8671e39fa211b7e69776ba1dcd80b2f Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Tue, 4 Aug 2020 14:54:29 -0400 Subject: [PATCH 1/7] mkdocs --- AUTHORS.md | 7 + AUTHORS.rst | 7 - CHANGELOG.md | 447 ++++++++++++++ CHANGELOG.rst | 374 ----------- CODE_OF_CONDUCT.md | 88 +++ CODE_OF_CONDUCT.rst | 86 --- CONTRIBUTING.md | 101 +++ CONTRIBUTING.rst | 99 --- README.md | 141 +++++ README.rst | 160 ----- docs/README.md | 141 +++++ docs/api.md | 75 +++ docs/api.rst | 140 ----- docs/authors.md | 7 + docs/authors.rst | 1 - docs/backends.rst | 31 - docs/backends/backends.md | 15 + docs/backends/hdf5_00.md | 4 + docs/backends/hdf5_00.rst | 4 - docs/backends/{hdf5_01.rst => hdf5_01.md} | 2 +- docs/backends/{lmdb_30.rst => lmdb_30.md} | 2 +- docs/backends/{numpy_10.rst => numpy_10.md} | 2 +- docs/backends/{remote_50.rst => remote_50.md} | 2 +- docs/benchmarking.md | 139 +++++ docs/benchmarking.rst | 1 - docs/changelog.md | 447 ++++++++++++++ docs/changelog.rst | 1 - docs/cli.md | 12 + docs/cli.rst | 13 - docs/codeofconduct.md | 87 +++ docs/codeofconduct.rst | 3 - docs/concepts.md | 582 ++++++++++++++++++ docs/concepts.rst | 581 ----------------- docs/conf.py | 126 ---- docs/contributing.md | 94 +++ docs/contributing.rst | 1 - docs/contributingindex.rst | 12 - docs/design.md | 317 ++++++++++ docs/design.rst | 317 ---------- docs/externals.md | 14 + docs/externals.rst | 17 - docs/faq.md | 221 +++++++ docs/faq.rst | 219 ------- docs/index.rst | 27 - docs/installation.md | 56 ++ docs/installation.rst | 65 -- docs/js/custom.js | 114 ++++ docs/js/termynal.js | 264 ++++++++ docs/noindexapi/apiinit.rst | 8 - docs/noindexapi/apiremotefetchdata.rst | 2 - docs/quickstart.md | 10 + docs/quickstart.rst | 11 - docs/readme.rst | 1 - docs/requirements.txt | 8 - docs/spelling_wordlist.txt | 11 - docs/stylesheets/extra.css | 38 ++ docs/stylesheets/termynal.css | 108 ++++ docs/tutorial.rst | 16 - mkdocs.yml | 83 +++ src/hangar/bulk_importer.py | 38 +- 60 files changed, 3635 insertions(+), 2365 deletions(-) create mode 100644 AUTHORS.md delete mode 100644 AUTHORS.rst create mode 100644 CHANGELOG.md delete mode 100644 CHANGELOG.rst create mode 100644 CODE_OF_CONDUCT.md delete mode 100644 CODE_OF_CONDUCT.rst create mode 100644 CONTRIBUTING.md delete mode 100644 CONTRIBUTING.rst create mode 100644 README.md delete mode 100644 README.rst create mode 100644 docs/README.md create mode 100644 docs/api.md delete mode 100644 docs/api.rst create mode 100644 docs/authors.md delete mode 100644 docs/authors.rst delete mode 100644 docs/backends.rst create mode 100644 docs/backends/backends.md create mode 100644 docs/backends/hdf5_00.md delete mode 100644 docs/backends/hdf5_00.rst rename docs/backends/{hdf5_01.rst => hdf5_01.md} (63%) rename docs/backends/{lmdb_30.rst => lmdb_30.md} (65%) rename docs/backends/{numpy_10.rst => numpy_10.md} (54%) rename docs/backends/{remote_50.rst => remote_50.md} (59%) create mode 100644 docs/benchmarking.md delete mode 100644 docs/benchmarking.rst create mode 100644 docs/changelog.md delete mode 100644 docs/changelog.rst create mode 100644 docs/cli.md delete mode 100644 docs/cli.rst create mode 100644 docs/codeofconduct.md delete mode 100644 docs/codeofconduct.rst create mode 100644 docs/concepts.md delete mode 100644 docs/concepts.rst delete mode 100644 docs/conf.py create mode 100644 docs/contributing.md delete mode 100644 docs/contributing.rst delete mode 100644 docs/contributingindex.rst create mode 100644 docs/design.md delete mode 100644 docs/design.rst create mode 100644 
docs/externals.md delete mode 100644 docs/externals.rst create mode 100644 docs/faq.md delete mode 100644 docs/faq.rst delete mode 100644 docs/index.rst create mode 100644 docs/installation.md delete mode 100644 docs/installation.rst create mode 100644 docs/js/custom.js create mode 100644 docs/js/termynal.js delete mode 100644 docs/noindexapi/apiinit.rst delete mode 100644 docs/noindexapi/apiremotefetchdata.rst create mode 100644 docs/quickstart.md delete mode 100644 docs/quickstart.rst delete mode 100644 docs/readme.rst delete mode 100644 docs/requirements.txt delete mode 100644 docs/spelling_wordlist.txt create mode 100644 docs/stylesheets/extra.css create mode 100644 docs/stylesheets/termynal.css delete mode 100644 docs/tutorial.rst create mode 100644 mkdocs.yml diff --git a/AUTHORS.md b/AUTHORS.md new file mode 100644 index 00000000..0a9625b5 --- /dev/null +++ b/AUTHORS.md @@ -0,0 +1,7 @@ +Authors +======= + +- Richard Izzo - <rick@tensorwerk.com> +- Luca Antiga - <luca@tensorwerk.com> +- Sherin Thomas - <sherin@tensorwerk.com> +- Alessia Marcolini - <alessia@tensorwerk.com> diff --git a/AUTHORS.rst b/AUTHORS.rst deleted file mode 100644 index b50d3836..00000000 --- a/AUTHORS.rst +++ /dev/null @@ -1,7 +0,0 @@ -Authors -======= - -* Richard Izzo - rick@tensorwerk.com -* Luca Antiga - luca@tensorwerk.com -* Sherin Thomas - sherin@tensorwerk.com -* Alessia Marcolini - alessia@tensorwerk.com \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..2fe1de5a --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,447 @@ +Change Log +========== + +[v0.5.2](https://github.com/tensorwerk/hangar-py/compare/v0.5.1...v0.5.2) (2020-05-08) +-------------------------------------------------------------------------------------- + +### New Features + +- New column data type supporting arbitrary `bytes` data. + ([\#198](https://github.com/tensorwerk/hangar-py/pull/198)) + [\@rlizzo](https://github.com/rlizzo) + +### Improvements + +- `str` typed columns can now accept data containing any unicode + code-point. In prior releases, data containing any `non-ascii` + character could not be written to this column type. + ([\#198](https://github.com/tensorwerk/hangar-py/pull/198)) + [\@rlizzo](https://github.com/rlizzo) + +### Bug Fixes + +- Fixed issue where `str` and (newly added) `bytes` column data could + not be fetched / pushed between a local client repository and remote + server. ([\#198](https://github.com/tensorwerk/hangar-py/pull/198)) + [\@rlizzo](https://github.com/rlizzo) + +[v0.5.1](https://github.com/tensorwerk/hangar-py/compare/v0.5.0...v0.5.1) (2020-04-05) +-------------------------------------------------------------------------------------- + +### Bug Fixes + +- Fixed issue where importing `make_torch_dataloader` or + `make_tf_dataloader` under Python 3.6 would raise a `NameError` + regardless of whether the package is installed. + ([\#196](https://github.com/tensorwerk/hangar-py/pull/196)) + [\@rlizzo](https://github.com/rlizzo) + +[v0.5.0](https://github.com/tensorwerk/hangar-py/compare/v0.4.0...v0.5.0) (2020-04-04) +------------------------------------------------------------------------------------- + +### Improvements + +- Python 3.8 is now fully supported. + ([\#193](https://github.com/tensorwerk/hangar-py/pull/193)) + [\@rlizzo](https://github.com/rlizzo) +- Major backend overhaul which defines column layouts and data types + in the same interchangeable / extensible manner as storage backends. + This will allow rapid development of new layouts and data type + support as new use cases are discovered by the community.
+ ([\#184](https://github.com/tensorwerk/hangar-py/pull/184)) + [\@rlizzo](https://github.com/rlizzo) +- Column and backend classes are now fully serializable (pickleable) + for `read-only` checkouts. + ([\#180](https://github.com/tensorwerk/hangar-py/pull/180)) + [\@rlizzo](https://github.com/rlizzo) +- Modularized internal structure of API classes to easily allow new + column layouts / data types to be added in the future. + ([\#180](https://github.com/tensorwerk/hangar-py/pull/180)) + [\@rlizzo](https://github.com/rlizzo) +- Improved type / value checking of manual specification for column + `backend` and `backend_options`. + ([\#180](https://github.com/tensorwerk/hangar-py/pull/180)) + [\@rlizzo](https://github.com/rlizzo) +- Standardized column data access API to follow the Python standard + library `dict` methods API. + ([\#180](https://github.com/tensorwerk/hangar-py/pull/180)) + [\@rlizzo](https://github.com/rlizzo) +- Memory usage of arrayset checkouts has been reduced by \~70% by + using C-structs for allocating sample record locating info. + ([\#179](https://github.com/tensorwerk/hangar-py/pull/179)) + [\@rlizzo](https://github.com/rlizzo) +- Read times from the `HDF5_00` and `HDF5_01` backend have been + reduced by 33-38% (or more for arraysets with many samples) by + eliminating redundant computation of chunked storage B-Tree. + ([\#179](https://github.com/tensorwerk/hangar-py/pull/179)) + [\@rlizzo](https://github.com/rlizzo) +- Commit times and checkout times have been reduced by 11-18% by + optimizing record parsing and memory allocation. + ([\#179](https://github.com/tensorwerk/hangar-py/pull/179)) + [\@rlizzo](https://github.com/rlizzo) + +### New Features + +- Added `str` type column with same behavior as `ndarray` column + (supporting both single-level and nested layouts) to replace + functionality of the removed `metadata` container. + ([\#184](https://github.com/tensorwerk/hangar-py/pull/184)) + [\@rlizzo](https://github.com/rlizzo) +- New backend based on `LMDB` has been added (specifier of `lmdb_30`). + ([\#184](https://github.com/tensorwerk/hangar-py/pull/184)) + [\@rlizzo](https://github.com/rlizzo) +- Added `.diff()` method to `Repository` class to enable diffing + changes between any pair of commits / branches without needing to + open the diff base in a checkout. + ([\#183](https://github.com/tensorwerk/hangar-py/pull/183)) + [\@rlizzo](https://github.com/rlizzo) +- New CLI command `hangar diff` which reports a summary view of + changes made between any pair of commits / branches. + ([\#183](https://github.com/tensorwerk/hangar-py/pull/183)) + [\@rlizzo](https://github.com/rlizzo) +- Added `.log()` method to `Checkout` objects so a graphical commit + graph or machine-readable commit details / DAG can be queried when + operating on a particular commit. + ([\#183](https://github.com/tensorwerk/hangar-py/pull/183)) + [\@rlizzo](https://github.com/rlizzo) +- \"string\" type columns now supported alongside \"ndarray\" column + type. ([\#180](https://github.com/tensorwerk/hangar-py/pull/180)) + [\@rlizzo](https://github.com/rlizzo) +- New \"column\" API, which replaces \"arrayset\" name. + ([\#180](https://github.com/tensorwerk/hangar-py/pull/180)) + [\@rlizzo](https://github.com/rlizzo) +- Arraysets can now contain \"nested subsamples\" under a common + sample key. + ([\#179](https://github.com/tensorwerk/hangar-py/pull/179)) + [\@rlizzo](https://github.com/rlizzo) +- New API to add and remove samples from an arrayset.
+ ([\#179](https://github.com/tensorwerk/hangar-py/pull/179)) + [\@rlizzo](https://github.com/rlizzo) +- Added `repo.size_nbytes` and `repo.size_human` to report disk usage + of a repository on disk. + ([\#174](https://github.com/tensorwerk/hangar-py/pull/174)) + [\@rlizzo](https://github.com/rlizzo) +- Added method to traverse the entire repository history and + cryptographically verify integrity. + ([\#173](https://github.com/tensorwerk/hangar-py/pull/173)) + [\@rlizzo](https://github.com/rlizzo) + +### Changes + +- Argument syntax of `__getitem__()` and `get()` methods of + `ReaderCheckout` and `WriterCheckout` classes. The new format + supports handling arbitrary arguments specific to retrieval of data + from any column type. + ([\#183](https://github.com/tensorwerk/hangar-py/pull/183)) + [\@rlizzo](https://github.com/rlizzo) + +### Removed + +- `metadata` container for `str` typed data has been completely + removed. It is replaced by a highly extensible and much more + user-friendly `str` typed column. + ([\#184](https://github.com/tensorwerk/hangar-py/pull/184)) + [\@rlizzo](https://github.com/rlizzo) +- `__setitem__()` method in `WriterCheckout` objects. Writing data to + columns via a checkout object is no longer supported. + ([\#183](https://github.com/tensorwerk/hangar-py/pull/183)) + [\@rlizzo](https://github.com/rlizzo) + +### Bug Fixes + +- Backend data stores no longer use file symlinks, improving + compatibility with some types of file systems. + ([\#171](https://github.com/tensorwerk/hangar-py/pull/171)) + [\@rlizzo](https://github.com/rlizzo) +- All arrayset types (\"flat\" and \"nested subsamples\") and backend + readers can now be pickled \-- for parallel processing \-- in a + read-only checkout. + ([\#179](https://github.com/tensorwerk/hangar-py/pull/179)) + [\@rlizzo](https://github.com/rlizzo) + +### Breaking changes + +- New backend record serialization format is incompatible with + repositories written in version 0.4 or earlier. +- New arrayset API is incompatible with Hangar API in version 0.4 or + earlier. + +[v0.4.0](https://github.com/tensorwerk/hangar-py/compare/v0.3.0...v0.4.0) (2019-11-21) +-------------------------------------------------------------------------------------- + +### New Features + +- Added ability to delete branch names/pointers from a local + repository via both API and CLI. + ([\#128](https://github.com/tensorwerk/hangar-py/pull/128)) + [\@rlizzo](https://github.com/rlizzo) +- Added `local` keyword arg to arrayset key/value iterators to return + only locally available samples. + ([\#131](https://github.com/tensorwerk/hangar-py/pull/131)) + [\@rlizzo](https://github.com/rlizzo) +- Ability to change the backend storage format and options applied to + an `arrayset` after initialization. + ([\#133](https://github.com/tensorwerk/hangar-py/pull/133)) + [\@rlizzo](https://github.com/rlizzo) +- Added blosc compression to HDF5 backend by default on PyPi + installations. + ([\#146](https://github.com/tensorwerk/hangar-py/pull/146)) + [\@rlizzo](https://github.com/rlizzo) +- Added Benchmarking Suite to Test for Performance Regressions in PRs. + ([\#155](https://github.com/tensorwerk/hangar-py/pull/155)) + [\@rlizzo](https://github.com/rlizzo) +- Added new backend optimized to increase speeds for fixed size + arrayset access. + ([\#160](https://github.com/tensorwerk/hangar-py/pull/160)) + [\@rlizzo](https://github.com/rlizzo) + +### Improvements + +- Removed `msgpack` and `pyyaml` dependencies. Cleaned up and improved + remote client/server code.
+ ([\#130](https://github.com/tensorwerk/hangar-py/pull/130)) + [\@rlizzo](https://github.com/rlizzo) +- Multiprocess Torch DataLoaders allowed on Linux and MacOS. + ([\#144](https://github.com/tensorwerk/hangar-py/pull/144)) + [\@rlizzo](https://github.com/rlizzo) +- Added CLI options `commit`, `checkout`, `arrayset create`, & + `arrayset remove`. + ([\#150](https://github.com/tensorwerk/hangar-py/pull/150)) + [\@rlizzo](https://github.com/rlizzo) +- Plugin system revamp. + ([\#134](https://github.com/tensorwerk/hangar-py/pull/134)) + [\@hhsecond](https://github.com/hhsecond) +- Documentation Improvements and Typo-Fixes. + ([\#156](https://github.com/tensorwerk/hangar-py/pull/156)) + [\@alessiamarcolini](https://github.com/alessiamarcolini) +- Removed implicit removal of arrayset schema from checkout if every + sample was removed from arrayset. This could potentially result in + dangling accessors which may or may not self-destruct (as expected) + in certain edge-cases. + ([\#159](https://github.com/tensorwerk/hangar-py/pull/159)) + [\@rlizzo](https://github.com/rlizzo) +- Added type codes to hash digests so that the calculation function can + be updated in the future without breaking repos written in previous + Hangar versions. + ([\#165](https://github.com/tensorwerk/hangar-py/pull/165)) + [\@rlizzo](https://github.com/rlizzo) + +### Bug Fixes + +- Programmatic access to repository log contents now returns branch + heads alongside other log info. + ([\#125](https://github.com/tensorwerk/hangar-py/pull/125)) + [\@rlizzo](https://github.com/rlizzo) +- Fixed minor bug in types of values allowed for `Arrayset` names vs + `Sample` names. + ([\#151](https://github.com/tensorwerk/hangar-py/pull/151)) + [\@rlizzo](https://github.com/rlizzo) +- Fixed issue where using checkout object to access a sample in + multiple arraysets would try to create a `namedtuple` instance with + invalid field names. Now incompatible field names are automatically + renamed with their positional index. + ([\#161](https://github.com/tensorwerk/hangar-py/pull/161)) + [\@rlizzo](https://github.com/rlizzo) +- Explicitly raise error if `commit` argument is set while checking + out a repository with `write=True`. + ([\#166](https://github.com/tensorwerk/hangar-py/pull/166)) + [\@rlizzo](https://github.com/rlizzo) + +### Breaking changes + +- New commit reference serialization format is incompatible with + repositories written in version 0.3.0 or earlier. + +[v0.3.0](https://github.com/tensorwerk/hangar-py/compare/v0.2.0...v0.3.0) (2019-09-10) +-------------------------------------------------------------------------------------- + +### New Features + +- API addition allowing reading and writing arrayset data from a + checkout object directly. + ([\#115](https://github.com/tensorwerk/hangar-py/pull/115)) + [\@rlizzo](https://github.com/rlizzo) +- Data importers, exporters, and viewers via CLI for common file + formats. Includes plugin system for easy extensibility in the + future. ([\#103](https://github.com/tensorwerk/hangar-py/pull/103)) + ([\@rlizzo](https://github.com/rlizzo), + [\@hhsecond](https://github.com/hhsecond)) + +### Improvements + +- Added tutorial on working with remote data. + ([\#113](https://github.com/tensorwerk/hangar-py/pull/113)) + [\@rlizzo](https://github.com/rlizzo) +- Added Tutorial on Tensorflow and PyTorch Dataloaders. + ([\#117](https://github.com/tensorwerk/hangar-py/pull/117)) + [\@hhsecond](https://github.com/hhsecond) +- Large performance improvement to diff/merge algorithm (\~30x faster + than previous).
+ ([\#112](https://github.com/tensorwerk/hangar-py/pull/112)) + [\@rlizzo](https://github.com/rlizzo) +- New commit hash algorithm which is much more reproducible in the + long term. + ([\#120](https://github.com/tensorwerk/hangar-py/pull/120)) + [\@rlizzo](https://github.com/rlizzo) +- HDF5 backend updated to increase speed of reading/writing variable + sized dataset compressed chunks. + ([\#120](https://github.com/tensorwerk/hangar-py/pull/120)) + [\@rlizzo](https://github.com/rlizzo) + +### Bug Fixes + +- Fixed ML Dataloaders errors for a number of edge cases surrounding + partial-remote data and non-common keys. + ([\#110](https://github.com/tensorwerk/hangar-py/pull/110)) ( + [\@hhsecond](https://github.com/hhsecond), + [\@rlizzo](https://github.com/rlizzo)) + +### Breaking changes + +- New commit hash algorithm is incompatible with repositories written + in version 0.2.0 or earlier. + +[v0.2.0](https://github.com/tensorwerk/hangar-py/compare/v0.1.1...v0.2.0) (2019-08-09) +-------------------------------------------------------------------------------------- + +### New Features + +- Numpy memory-mapped array file backend added. + ([\#70](https://github.com/tensorwerk/hangar-py/pull/70)) + [\@rlizzo](https://github.com/rlizzo) +- Remote server data backend added. + ([\#70](https://github.com/tensorwerk/hangar-py/pull/70)) + [\@rlizzo](https://github.com/rlizzo) +- Selection heuristics to determine appropriate backend from arrayset + schema. ([\#70](https://github.com/tensorwerk/hangar-py/pull/70)) + [\@rlizzo](https://github.com/rlizzo) +- Partial remote clones and fetch operations now fully supported. + ([\#85](https://github.com/tensorwerk/hangar-py/pull/85)) + [\@rlizzo](https://github.com/rlizzo) +- CLI has been placed under test coverage, added interface usage to + docs. ([\#85](https://github.com/tensorwerk/hangar-py/pull/85)) + [\@rlizzo](https://github.com/rlizzo) +- TensorFlow and PyTorch Machine Learning Dataloader Methods + (*Experimental Release*). + ([\#91](https://github.com/tensorwerk/hangar-py/pull/91)) lead: + [\@hhsecond](https://github.com/hhsecond), co-author: + [\@rlizzo](https://github.com/rlizzo), reviewed by: + [\@elistevens](https://github.com/elistevens) + +### Improvements + +- Record format versioning and standardization so as not to break + backwards compatibility in the future. + ([\#70](https://github.com/tensorwerk/hangar-py/pull/70)) + [\@rlizzo](https://github.com/rlizzo) +- Backend addition and update developer protocols and documentation. + ([\#70](https://github.com/tensorwerk/hangar-py/pull/70)) + [\@rlizzo](https://github.com/rlizzo) +- Read-only checkout arrayset sample `get` methods are now multithread + and multiprocess safe. + ([\#84](https://github.com/tensorwerk/hangar-py/pull/84)) + [\@rlizzo](https://github.com/rlizzo) +- Read-only checkout metadata sample `get` methods are thread safe if + used within a context manager. + ([\#101](https://github.com/tensorwerk/hangar-py/pull/101)) + [\@rlizzo](https://github.com/rlizzo) +- Samples can be assigned integer names in addition to `string` names. + ([\#89](https://github.com/tensorwerk/hangar-py/pull/89)) + [\@rlizzo](https://github.com/rlizzo) +- Forgetting to close a `write-enabled` checkout before terminating + the Python process will close the checkout automatically in many + situations. + ([\#101](https://github.com/tensorwerk/hangar-py/pull/101)) + [\@rlizzo](https://github.com/rlizzo) +- Repository software version compatibility methods added to ensure + upgrade paths in the future.
+ ([\#101](https://github.com/tensorwerk/hangar-py/pull/101)) + [\@rlizzo](https://github.com/rlizzo) +- Many tests added (including support for Mac OSX on Travis-CI). lead: + [\@rlizzo](https://github.com/rlizzo), co-author: + [\@hhsecond](https://github.com/hhsecond) + +### Bug Fixes + +- Diff results for fast forward merges now returns sensible results. + ([\#77](https://github.com/tensorwerk/hangar-py/pull/77)) + [\@rlizzo](https://github.com/rlizzo) +- Many type annotations added, and developer documentation improved. + [\@hhsecond](https://github.com/hhsecond) & + [\@rlizzo](https://github.com/rlizzo) + +### Breaking changes + +- Renamed all references to `datasets` in the API / world-view to + `arraysets`. +- These are backwards incompatible changes. For all versions \> 0.2, + repository upgrade utilities will be provided if breaking changes + occur. + +[v0.1.1](https://github.com/tensorwerk/hangar-py/compare/v0.1.0...v0.1.1) (2019-05-24) +-------------------------------------------------------------------------------------- + +### Bug Fixes + +- Fixed typo in README which was uploaded to PyPi + +[v0.1.0](https://github.com/tensorwerk/hangar-py/compare/v0.0.0...v0.1.0) (2019-05-24) +-------------------------------------------------------------------------------------- + +### New Features + +- Remote client-server config negotiation and administrator + permissions. + ([\#10](https://github.com/tensorwerk/hangar-py/pull/10)) + [\@rlizzo](https://github.com/rlizzo) +- Allow single python process to access multiple repositories + simultaneously. + ([\#20](https://github.com/tensorwerk/hangar-py/pull/20)) + [\@rlizzo](https://github.com/rlizzo) +- Fast-Forward and 3-Way Merge and Diff methods now fully supported + and behaving as expected. + ([\#32](https://github.com/tensorwerk/hangar-py/pull/32)) + [\@rlizzo](https://github.com/rlizzo) + +### Improvements + +- Initial test-case specification. + ([\#14](https://github.com/tensorwerk/hangar-py/pull/14)) + [\@hhsecond](https://github.com/hhsecond) +- Checkout test-case work. + ([\#25](https://github.com/tensorwerk/hangar-py/pull/25)) + [\@hhsecond](https://github.com/hhsecond) +- Metadata test-case work. + ([\#27](https://github.com/tensorwerk/hangar-py/pull/27)) + [\@hhsecond](https://github.com/hhsecond) +- Any potential failure cases raise exceptions instead of silently + returning. ([\#16](https://github.com/tensorwerk/hangar-py/pull/16)) + [\@rlizzo](https://github.com/rlizzo) +- Many usability improvements in a variety of commits. + +### Bug Fixes + +- Ensure references to checkout arrayset or metadata objects cannot + operate after the checkout is closed. + ([\#41](https://github.com/tensorwerk/hangar-py/pull/41)) + [\@rlizzo](https://github.com/rlizzo) +- Sensible exception classes and error messages raised on a variety of + situations (Many commits). [\@hhsecond](https://github.com/hhsecond) + & [\@rlizzo](https://github.com/rlizzo) +- Many minor issues addressed. + +### API Additions + +- Refer to API documentation + ([\#23](https://github.com/tensorwerk/hangar-py/pull/23)) + +### Breaking changes + +- All repositories written with previous versions of Hangar are liable + to break when using this version. Please upgrade versions + immediately. + +[v0.0.0](https://github.com/tensorwerk/hangar-py/commit/2aff3805c66083a7fbb2ebf701ceaf38ac5165c7) (2019-04-15) +-------------------------------------------------------------------------------------------------------------- + +- First Public Release of Hangar! 
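The v0.5.0 entries above describe a dict-style column API, `str` columns replacing the old `metadata` container, `Repository.diff()`, and the `size_nbytes` / `size_human` properties. The following is a minimal sketch of how those pieces fit together; the changelog only names the behaviors, so the constructor usage and the `init()` / `checkout()` / `add_ndarray_column()` / `add_str_column()` helper names are assumptions drawn from the 0.5-era API docs rather than anything stated in this patch.

```python
# Sketch of the v0.5.0 workflow described in the changelog above.
# Assumed names (not stated in the changelog): Repository(path), init(),
# checkout(write=True), add_ndarray_column(), add_str_column().
# Taken directly from the changelog: dict-style column access,
# Repository.diff(), and repo.size_human.
import tempfile
import numpy as np
from hangar import Repository

repo_path = tempfile.mkdtemp()            # throwaway directory for the demo
repo = Repository(repo_path)
repo.init(user_name='Demo User', user_email='demo@example.com')

co = repo.checkout(write=True)
images = co.add_ndarray_column('images', prototype=np.zeros((28, 28), dtype=np.uint8))
labels = co.add_str_column('labels')      # `str` columns replace the removed `metadata` container

# Columns follow the standard-library `dict` methods API (v0.5.0 improvement).
images['sample-0'] = np.ones((28, 28), dtype=np.uint8)
labels['sample-0'] = 'seven'
print(len(images), list(labels.keys()))

co.commit('add first training sample')
co.close()

print(repo.size_human)                    # human-readable on-disk usage, added in v0.5.0
# repo.diff('master', 'other-branch')     # diff any two branches/commits without a checkout
```

Because column access mirrors `dict`, existing code that expects a mapping (iteration, `keys()`, `get()`, membership tests) should work against a column without special casing; exact signatures should be confirmed against docs/api.md in this patch.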
diff --git a/CHANGELOG.rst b/CHANGELOG.rst deleted file mode 100644 index 520b0a3d..00000000 --- a/CHANGELOG.rst +++ /dev/null @@ -1,374 +0,0 @@ -========== -Change Log -========== - - -`v0.5.2`_ (2020-05-08) -====================== - -New Features ------------- - -* New column data type supporting arbitrary ``bytes`` data. - (`#198 `__) `@rlizzo `__ - -Improvements ------------- - -* ``str`` typed columns can now accept data containing any unicode code-point. In prior releases - data containing any ``non-ascii`` character could not be written to this column type. - (`#198 `__) `@rlizzo `__ - - -Bug Fixes ---------- - -* Fixed issue where ``str`` and (newly added) ``bytes`` column data could not be fetched / pushed - between a local client repository and remote server. - (`#198 `__) `@rlizzo `__ - - - -`v0.5.1`_ (2020-04-05) -====================== - -BugFixes --------- - -* Fixed issue where importing ``make_torch_dataloader`` or ``make_tf_dataloader`` under python 3.6 - Would raise a ``NameError`` irrigardless of if the package is installed. - (`#196 `__) `@rlizzo `__ - - -`v0.5.0`_ (2020-04-4) -===================== - -Improvements ------------- - -* Python 3.8 is now fully supported. - (`#193 `__) `@rlizzo `__ -* Major backend overhaul which defines column layouts and data types in the same interchangable - / extensable manner as storage backends. This will allow rapid development of new layouts and - data type support as new use cases are discovered by the community. - (`#184 `__) `@rlizzo `__ -* Column and backend classes are now fully serializable (pickleable) for ``read-only`` checkouts. - (`#180 `__) `@rlizzo `__ -* Modularized internal structure of API classes to easily allow new columnn layouts / data types - to be added in the future. - (`#180 `__) `@rlizzo `__ -* Improved type / value checking of manual specification for column ``backend`` and ``backend_options``. - (`#180 `__) `@rlizzo `__ -* Standardized column data access API to follow python standard library ``dict`` methods API. - (`#180 `__) `@rlizzo `__ -* Memory usage of arrayset checkouts has been reduced by ~70% by using C-structs for allocating - sample record locating info. - (`#179 `__) `@rlizzo `__ -* Read times from the ``HDF5_00`` and ``HDF5_01`` backend have been reduced by 33-38% (or more for - arraysets with many samples) by eliminating redundant computation of chunked storage B-Tree. - (`#179 `__) `@rlizzo `__ -* Commit times and checkout times have been reduced by 11-18% by optimizing record parsing and - memory allocation. - (`#179 `__) `@rlizzo `__ - - -New Features ------------- - -* Added ``str`` type column with same behavior as ``ndarray`` column (supporting both - single-level and nested layouts) added to replace functionality of removed ``metadata`` container. - (`#184 `__) `@rlizzo `__ -* New backend based on ``LMDB`` has been added (specifier of ``lmdb_30``). - (`#184 `__) `@rlizzo `__ -* Added ``.diff()`` method to ``Repository`` class to enable diffing changes between any pair of - commits / branches without needing to open the diff base in a checkout. - (`#183 `__) `@rlizzo `__ -* New CLI command ``hangar diff`` which reports a summary view of changes made between any pair of - commits / branches. - (`#183 `__) `@rlizzo `__ -* Added ``.log()`` method to ``Checkout`` objects so graphical commit graph or machine readable - commit details / DAG can be queried when operating on a particular commit. - (`#183 `__) `@rlizzo `__ -* "string" type columns now supported alongside "ndarray" column type. 
- (`#180 `__) `@rlizzo `__ -* New "column" API, which replaces "arrayset" name. - (`#180 `__) `@rlizzo `__ -* Arraysets can now contain "nested subsamples" under a common sample key. - (`#179 `__) `@rlizzo `__ -* New API to add and remove samples from and arrayset. - (`#179 `__) `@rlizzo `__ -* Added ``repo.size_nbytes`` and ``repo.size_human`` to report disk usage of a repository on disk. - (`#174 `__) `@rlizzo `__ -* Added method to traverse the entire repository history and cryptographically verify integrity. - (`#173 `__) `@rlizzo `__ - - -Changes -------- - -* Argument syntax of ``__getitem__()`` and ``get()`` methods of ``ReaderCheckout`` and - ``WriterCheckout`` classes. The new format supports handeling arbitrary arguments specific - to retrieval of data from any column type. - (`#183 `__) `@rlizzo `__ - - -Removed -------- - -* ``metadata`` container for ``str`` typed data has been completly removed. It is replaced by a highly - extensible and much more user-friendly ``str`` typed column. - (`#184 `__) `@rlizzo `__ -* ``__setitem__()`` method in ``WriterCheckout`` objects. Writing data to columns via a checkout object - is no longer supported. - (`#183 `__) `@rlizzo `__ - - -Bug Fixes ---------- - -* Backend data stores no longer use file symlinks, improving compatibility with some types file systems. - (`#171 `__) `@rlizzo `__ -* All arrayset types ("flat" and "nested subsamples") and backend readers can now be pickled -- for parallel - processing -- in a read-only checkout. - (`#179 `__) `@rlizzo `__ - - -Breaking changes ----------------- - -* New backend record serialization format is incompatible with repositories written in version 0.4 or earlier. -* New arrayset API is incompatible with Hangar API in version 0.4 or earlier. - - -`v0.4.0`_ (2019-11-21) -====================== - -New Features ------------- - -* Added ability to delete branch names/pointers from a local repository via both API and CLI. - (`#128 `__) `@rlizzo `__ -* Added ``local`` keyword arg to arrayset key/value iterators to return only locally available samples - (`#131 `__) `@rlizzo `__ -* Ability to change the backend storage format and options applied to an ``arrayset`` after initialization. - (`#133 `__) `@rlizzo `__ -* Added blosc compression to HDF5 backend by default on PyPi installations. - (`#146 `__) `@rlizzo `__ -* Added Benchmarking Suite to Test for Performance Regressions in PRs. - (`#155 `__) `@rlizzo `__ -* Added new backend optimized to increase speeds for fixed size arrayset access. - (`#160 `__) `@rlizzo `__ - - -Improvements ------------- - -* Removed ``msgpack`` and ``pyyaml`` dependencies. Cleaned up and improved remote client/server code. - (`#130 `__) `@rlizzo `__ -* Multiprocess Torch DataLoaders allowed on Linux and MacOS. - (`#144 `__) `@rlizzo `__ -* Added CLI options ``commit``, ``checkout``, ``arrayset create``, & ``arrayset remove``. - (`#150 `__) `@rlizzo `__ -* Plugin system revamp. - (`#134 `__) `@hhsecond `__ -* Documentation Improvements and Typo-Fixes. - (`#156 `__) `@alessiamarcolini `__ -* Removed implicit removal of arrayset schema from checkout if every sample was removed from arrayset. - This could potentially result in dangling accessors which may or may not self-destruct (as expected) - in certain edge-cases. - (`#159 `__) `@rlizzo `__ -* Added type codes to hash digests so that calculation function can be updated in the future without - breaking repos written in previous Hangar versions. 
- (`#165 `__) `@rlizzo `__ - - -Bug Fixes ---------- - -* Programatic access to repository log contents now returns branch heads alongside other log info. - (`#125 `__) `@rlizzo `__ -* Fixed minor bug in types of values allowed for ``Arrayset`` names vs ``Sample`` names. - (`#151 `__) `@rlizzo `__ -* Fixed issue where using checkout object to access a sample in multiple arraysets would try to create - a ``namedtuple`` instance with invalid field names. Now incompatible field names are automatically - renamed with their positional index. - (`#161 `__) `@rlizzo `__ -* Explicitly raise error if ``commit`` argument is set while checking out a repository with ``write=True``. - (`#166 `__) `@rlizzo `__ - - -Breaking changes ----------------- - -* New commit reference serialization format is incompatible with repositories written in version 0.3.0 or earlier. - - -`v0.3.0`_ (2019-09-10) -====================== - -New Features ------------- - -* API addition allowing reading and writing arrayset data from a checkout object directly. - (`#115 `__) `@rlizzo `__ -* Data importer, exporters, and viewers via CLI for common file formats. Includes plugin system - for easy extensibility in the future. - (`#103 `__) - (`@rlizzo `__, `@hhsecond `__) - -Improvements ------------- - -* Added tutorial on working with remote data. - (`#113 `__) `@rlizzo `__ -* Added Tutorial on Tensorflow and PyTorch Dataloaders. - (`#117 `__) `@hhsecond `__ -* Large performance improvement to diff/merge algorithm (~30x previous). - (`#112 `__) `@rlizzo `__ -* New commit hash algorithm which is much more reproducible in the long term. - (`#120 `__) `@rlizzo `__ -* HDF5 backend updated to increase speed of reading/writing variable sized dataset compressed chunks - (`#120 `__) `@rlizzo `__ - -Bug Fixes ---------- - -* Fixed ML Dataloaders errors for a number of edge cases surrounding partial-remote data and non-common keys. - (`#110 `__) - ( `@hhsecond `__, `@rlizzo `__) - -Breaking changes ----------------- - -* New commit hash algorithm is incompatible with repositories written in version 0.2.0 or earlier - - -`v0.2.0`_ (2019-08-09) -====================== - -New Features ------------- - -* Numpy memory-mapped array file backend added. - (`#70 `__) `@rlizzo `__ -* Remote server data backend added. - (`#70 `__) `@rlizzo `__ -* Selection heuristics to determine appropriate backend from arrayset schema. - (`#70 `__) `@rlizzo `__ -* Partial remote clones and fetch operations now fully supported. - (`#85 `__) `@rlizzo `__ -* CLI has been placed under test coverage, added interface usage to docs. - (`#85 `__) `@rlizzo `__ -* TensorFlow and PyTorch Machine Learning Dataloader Methods (*Experimental Release*). - (`#91 `__) - lead: `@hhsecond `__, co-author: `@rlizzo `__, - reviewed by: `@elistevens `__ - -Improvements ------------- - -* Record format versioning and standardization so to not break backwards compatibility in the future. - (`#70 `__) `@rlizzo `__ -* Backend addition and update developer protocols and documentation. - (`#70 `__) `@rlizzo `__ -* Read-only checkout arrayset sample ``get`` methods now are multithread and multiprocess safe. - (`#84 `__) `@rlizzo `__ -* Read-only checkout metadata sample ``get`` methods are thread safe if used within a context manager. - (`#101 `__) `@rlizzo `__ -* Samples can be assigned integer names in addition to ``string`` names. 
- (`#89 `__) `@rlizzo `__ -* Forgetting to close a ``write-enabled`` checkout before terminating the python process will close the - checkout automatically for many situations. - (`#101 `__) `@rlizzo `__ -* Repository software version compatability methods added to ensure upgrade paths in the future. - (`#101 `__) `@rlizzo `__ -* Many tests added (including support for Mac OSX on Travis-CI). - lead: `@rlizzo `__, co-author: `@hhsecond `__ - -Bug Fixes ---------- - -* Diff results for fast forward merges now returns sensible results. - (`#77 `__) `@rlizzo `__ -* Many type annotations added, and developer documentation improved. - `@hhsecond `__ & `@rlizzo `__ - -Breaking changes ----------------- - -* Renamed all references to ``datasets`` in the API / world-view to ``arraysets``. -* These are backwards incompatible changes. For all versions > 0.2, repository upgrade utilities will - be provided if breaking changes occur. - - -`v0.1.1`_ (2019-05-24) -====================== - -Bug Fixes ---------- - -* Fixed typo in README which was uploaded to PyPi - - -`v0.1.0`_ (2019-05-24) -====================== - -New Features ------------- - -* Remote client-server config negotiation and administrator permissions. - (`#10 `__) `@rlizzo `__ -* Allow single python process to access multiple repositories simultaneously. - (`#20 `__) `@rlizzo `__ -* Fast-Forward and 3-Way Merge and Diff methods now fully supported and behaving as expected. - (`#32 `__) `@rlizzo `__ - -Improvements ------------- - -* Initial test-case specification. - (`#14 `__) `@hhsecond `__ -* Checkout test-case work. - (`#25 `__) `@hhsecond `__ -* Metadata test-case work. - (`#27 `__) `@hhsecond `__ -* Any potential failure cases raise exceptions instead of silently returning. - (`#16 `__) `@rlizzo `__ -* Many usability improvements in a variety of commits. - - -Bug Fixes ---------- - -* Ensure references to checkout arrayset or metadata objects cannot operate after the checkout is closed. - (`#41 `__) `@rlizzo `__ -* Sensible exception classes and error messages raised on a variety of situations (Many commits). - `@hhsecond `__ & `@rlizzo `__ -* Many minor issues addressed. - -API Additions -------------- - -* Refer to API documentation (`#23 `__) - -Breaking changes ----------------- - -* All repositories written with previous versions of Hangar are liable to break when using this version. Please upgrade versions immediately. - - -`v0.0.0`_ (2019-04-15) -====================== - -* First Public Release of Hangar! - -.. _v0.0.0: https://github.com/tensorwerk/hangar-py/commit/2aff3805c66083a7fbb2ebf701ceaf38ac5165c7 -.. _v0.1.0: https://github.com/tensorwerk/hangar-py/compare/v0.0.0...v0.1.0 -.. _v0.1.1: https://github.com/tensorwerk/hangar-py/compare/v0.1.0...v0.1.1 -.. _v0.2.0: https://github.com/tensorwerk/hangar-py/compare/v0.1.1...v0.2.0 -.. _v0.3.0: https://github.com/tensorwerk/hangar-py/compare/v0.2.0...v0.3.0 -.. _v0.4.0: https://github.com/tensorwerk/hangar-py/compare/v0.3.0...v0.4.0 -.. _v0.5.0: https://github.com/tensorwerk/hangar-py/compare/v0.4.0...v0.5.0 -.. _v0.5.1: https://github.com/tensorwerk/hangar-py/compare/v0.5.0...v0.5.1 -.. _v0.5.2: https://github.com/tensorwerk/hangar-py/compare/v0.5.1...v0.5.2 -.. 
_In-Progress: https://github.com/tensorwerk/hangar-py/compare/v0.5.2...master diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..b60738fd --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,88 @@ +Contributor Code of Conduct +=========================== + +Our Pledge +---------- + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our +project and our community a harassment-free experience for everyone, +regardless of age, body size, disability, ethnicity, sex +characteristics, gender identity and expression, level of experience, +education, socio-economic status, nationality, personal appearance, +race, religion, or sexual identity and orientation. + +Our Standards +------------- + +Examples of behavior that contributes to creating a positive environment +include: + +- Using welcoming and inclusive language +- Being respectful of differing viewpoints and experiences +- Gracefully accepting constructive criticism +- Focusing on what is best for the community +- Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +- The use of sexualized language or imagery and unwelcome sexual + attention or advances +- Trolling, insulting/derogatory comments, and personal or political + attacks +- Public or private harassment +- Publishing others\' private information, such as a physical or + electronic address, without explicit permission +- Other conduct which could reasonably be considered inappropriate in + a professional setting + +Our Responsibilities +-------------------- + +Project maintainers are responsible for clarifying the standards of +acceptable behavior and are expected to take appropriate and fair +corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, +or reject comments, commits, code, wiki edits, issues, and other +contributions that are not aligned to this Code of Conduct, or to ban +temporarily or permanently any contributor for other behaviors that they +deem inappropriate, threatening, offensive, or harmful. + +Scope +----- + +This Code of Conduct applies both within project spaces and in public +spaces when an individual is representing the project or its community. +Examples of representing a project or community include using an +official project e-mail address, posting via an official social media +account, or acting as an appointed representative at an online or +offline event. Representation of a project may be further defined and +clarified by project maintainers. + +Enforcement +----------- + +Instances of abusive, harassing, or otherwise unacceptable behavior may +be reported by contacting the project team at +[hangar.info\@tensorwerk.com](hangar.info@tensorwerk.com). All +complaints will be reviewed and investigated and will result in a +response that is deemed necessary and appropriate to the circumstances. +The project team is obligated to maintain confidentiality with regard to +the reporter of an incident. Further details of specific enforcement +policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in +good faith may face temporary or permanent repercussions as determined +by other members of the project\'s leadership. 
+ +Attribution +----------- + +This Code of Conduct is adapted from the [Contributor +Covenant](https://www.contributor-covenant.org) homepage, version 1.4, +available at + + +For answers to common questions about this code of conduct, see + diff --git a/CODE_OF_CONDUCT.rst b/CODE_OF_CONDUCT.rst deleted file mode 100644 index d91d401a..00000000 --- a/CODE_OF_CONDUCT.rst +++ /dev/null @@ -1,86 +0,0 @@ -=========================== -Contributor Code of Conduct -=========================== - -Our Pledge ----------- - -In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to making participation in our project and -our community a harassment-free experience for everyone, regardless of age, body -size, disability, ethnicity, sex characteristics, gender identity and expression, -level of experience, education, socio-economic status, nationality, personal -appearance, race, religion, or sexual identity and orientation. - -Our Standards -------------- - -Examples of behavior that contributes to creating a positive environment -include: - -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members - -Examples of unacceptable behavior by participants include: - -* The use of sexualized language or imagery and unwelcome sexual attention or - advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic - address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting - -Our Responsibilities --------------------- - -Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior. - -Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, -threatening, offensive, or harmful. - -Scope ------ - -This Code of Conduct applies both within project spaces and in public spaces -when an individual is representing the project or its community. Examples of -representing a project or community include using an official project e-mail -address, posting via an official social media account, or acting as an appointed -representative at an online or offline event. Representation of a project may be -further defined and clarified by project maintainers. - -Enforcement ------------ - - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at -`hangar.info@tensorwerk.com `__. All complaints will -be reviewed and investigated and will result in a response that is deemed -necessary and appropriate to the circumstances. The project team is obligated to -maintain confidentiality with regard to the reporter of an incident. Further -details of specific enforcement policies may be posted separately. 
- -Project maintainers who do not follow or enforce the Code of Conduct in good -faith may face temporary or permanent repercussions as determined by other -members of the project's leadership. - -Attribution ------------ - -This Code of Conduct is adapted from the `Contributor Covenant`_ homepage, version 1.4, -available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html - -.. _Contributor Covenant: https://www.contributor-covenant.org - -For answers to common questions about this code of conduct, see -https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..5e11f210 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,101 @@ +Contributing +============ + +Contributions are welcome, and they are greatly appreciated! Every +little bit helps, and credit will always be given. + +All community members should read and abide by our +`ref-code-of-conduct`{.interpreted-text role="ref"}. + +Bug reports +----------- + +When [reporting a bug](https://github.com/tensorwerk/hangar-py/issues) +please include: + +> - Your operating system name and version. +> - Any details about your local setup that might be helpful in +> troubleshooting. +> - Detailed steps to reproduce the bug. + +Documentation improvements +-------------------------- + +Hangar could always use more documentation, whether as part of the +official Hangar docs, in docstrings, or even on the web in blog posts, +articles, and such. + +Feature requests and feedback +----------------------------- + +The best way to send feedback is to file an issue at +. + +If you are proposing a feature: + +- Explain in detail how it would work. +- Keep the scope as narrow as possible, to make it easier to + implement. +- Remember that this is a volunteer-driven project, and that code + contributions are welcome :) + +Development +----------- + +To set up [hangar-py]{.title-ref} for local development: + +1. Fork [hangar-py](https://github.com/tensorwerk/hangar-py) (look for + the \"Fork\" button). +2. Clone your fork locally: + + git clone git@github.com:your_name_here/hangar-py.git + +3. Create a branch for local development: + + git checkout -b name-of-your-bugfix-or-feature + + Now you can make your changes locally. + +4. When you\'re done making changes, run all the checks, doc builder + and spell checker with + [tox](http://tox.readthedocs.io/en/latest/install.html) one command: + + tox + +5. Commit your changes and push your branch to GitHub: + + git add . + git commit -m "Your detailed description of your changes." + git push origin name-of-your-bugfix-or-feature + +6. Submit a pull request through the GitHub website. + +### Pull Request Guidelines + +If you need some code review or feedback while you\'re developing the +code just make the pull request. + +For merging, you should: + +1. Include passing tests (run `tox`)[^1]. +2. Update documentation when there\'s new API, functionality etc. +3. Add a note to `CHANGELOG.rst` about the changes. +4. Add yourself to `AUTHORS.rst`. + +### Tips + +To run a subset of tests: + + tox -e envname -- pytest -k test_myfeature + +To run all the test environments in *parallel* (you need to +`pip install detox`): + + detox + +[^1]: If you don\'t have all the necessary python versions available + locally you can rely on Travis - it will [run the tests + \]() for + each change you add in the pull request. + + It will be slower though \... 
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst deleted file mode 100644 index 8a71aa6d..00000000 --- a/CONTRIBUTING.rst +++ /dev/null @@ -1,99 +0,0 @@ -============ -Contributing -============ - -Contributions are welcome, and they are greatly appreciated! Every -little bit helps, and credit will always be given. - -All community members should read and abide by our :ref:`ref-code-of-conduct`. - -Bug reports -=========== - -When `reporting a bug `_ please include: - - * Your operating system name and version. - * Any details about your local setup that might be helpful in - troubleshooting. - * Detailed steps to reproduce the bug. - -Documentation improvements -========================== - -Hangar could always use more documentation, whether as part of the -official Hangar docs, in docstrings, or even on the web in blog posts, -articles, and such. - -Feature requests and feedback -============================= - -The best way to send feedback is to file an issue at https://github.com/tensorwerk/hangar-py/issues. - -If you are proposing a feature: - -* Explain in detail how it would work. -* Keep the scope as narrow as possible, to make it easier to implement. -* Remember that this is a volunteer-driven project, and that code contributions - are welcome :) - -Development -=========== - -To set up `hangar-py` for local development: - -1. Fork `hangar-py `_ - (look for the "Fork" button). -2. Clone your fork locally:: - - git clone git@github.com:your_name_here/hangar-py.git - -3. Create a branch for local development:: - - git checkout -b name-of-your-bugfix-or-feature - - Now you can make your changes locally. - -4. When you're done making changes, run all the checks, doc builder and spell - checker with `tox `_ one - command:: - - tox - -5. Commit your changes and push your branch to GitHub:: - - git add . - git commit -m "Your detailed description of your changes." - git push origin name-of-your-bugfix-or-feature - -6. Submit a pull request through the GitHub website. - -Pull Request Guidelines ------------------------ - -If you need some code review or feedback while you're developing the code just -make the pull request. - -For merging, you should: - -1. Include passing tests (run ``tox``) [1]_. -2. Update documentation when there's new API, functionality etc. -3. Add a note to ``CHANGELOG.rst`` about the changes. -4. Add yourself to ``AUTHORS.rst``. - -.. [1] If you don't have all the necessary python versions available - locally you can rely on Travis - it will `run the tests - `_ for each change - you add in the pull request. - - It will be slower though ... 
- -Tips ----- - -To run a subset of tests:: - - tox -e envname -- pytest -k test_myfeature - -To run all the test environments in *parallel* (you need to ``pip install detox``):: - - detox diff --git a/README.md b/README.md new file mode 100644 index 00000000..ebc76a63 --- /dev/null +++ b/README.md @@ -0,0 +1,141 @@ +Overview +======== + ++-----------------------------------+-----------------------------------+ +| docs | | ++-----------------------------------+-----------------------------------+ +| tests | | [![Build Status](https://github | +| | .com/tensorwerk/hangar-py/workflo | +| | ws/Run%20Test%20Suite/badge.svg?b | +| | ranch=master)](https://github.com | +| | /tensorwerk/hangar-py/actions?que | +| | ry=workflow%3A%22Run+Test+Suite%2 | +| | 2+branch%3Amaster+event%3Apush+is | +| | %3Acompleted) | +| | [![Code Coverage](https://codec | +| | ov.io/gh/tensorwerk/hangar-py/bra | +| | nch/master/graph/badge.svg)](http | +| | s://codecov.io/gh/tensorwerk/hang | +| | ar-py) | +| | | [![Language grade: Python](http | +| | s://img.shields.io/lgtm/grade/pyt | +| | hon/g/tensorwerk/hangar-py.svg?lo | +| | go=lgtm&logoWidth=18)](https://lg | +| | tm.com/projects/g/tensorwerk/hang | +| | ar-py/context:python) | ++-----------------------------------+-----------------------------------+ +| package | | [![PyPI Package latest release] | +| | (https://img.shields.io/pypi/v/ha | +| | ngar.svg)](https://pypi.org/proje | +| | ct/hangar) | +| | [![PyPI Wheel](https://img.shie | +| | lds.io/pypi/wheel/hangar.svg)](ht | +| | tps://pypi.org/project/hangar) | +| | [![Conda-Forge Latest Version]( | +| | https://img.shields.io/conda/vn/c | +| | onda-forge/hangar.svg)](https://a | +| | naconda.org/conda-forge/hangar) | +| | | [![Supported versions](https:// | +| | img.shields.io/pypi/pyversions/ha | +| | ngar.svg)](https://pypi.org/proje | +| | ct/hangar) | +| | [![Supported implementations](h | +| | ttps://img.shields.io/pypi/implem | +| | entation/hangar.svg)](https://pyp | +| | i.org/project/hangar) | +| | | [![GitHub license](https://img. | +| | shields.io/github/license/tensorw | +| | erk/hangar-py)](https://github.co | +| | m/tensorwerk/hangar-py/blob/maste | +| | r/LICENSE) | ++-----------------------------------+-----------------------------------+ + +Hangar is version control for tensor data. Commit, branch, merge, +revert, and collaborate in the data-defined software era. + +- Free software: Apache 2.0 license + +What is Hangar? +--------------- + +Hangar is based off the belief that too much time is spent collecting, +managing, and creating home-brewed version control systems for data. At +it\'s core Hangar is designed to solve many of the same problems faced +by traditional code version control system (ie. `Git`), just adapted for +numerical data: + +- Time travel through the historical evolution of a dataset. 
+- Zero-cost Branching to enable exploratory analysis and collaboration +- Cheap Merging to build datasets over time (with multiple + collaborators) +- Completely abstracted organization and management of data files on + disk +- Ability to only retrieve a small portion of the data (as needed) + while still maintaining complete historical record +- Ability to push and pull changes directly to collaborators or a + central server (ie a truly distributed version control system) + +The ability of version control systems to perform these tasks for +codebases is largely taken for granted by almost every developer today; +However, we are in-fact standing on the shoulders of giants, with +decades of engineering which has resulted in these phenomenally useful +tools. Now that a new era of \"Data-Defined software\" is taking hold, +we find there is a strong need for analogous version control systems +which are designed to handle numerical data at large scale\... Welcome +to Hangar! + +The Hangar Workflow: + + Checkout Branch + | + ▼ + Create/Access Data + | + ▼ + Add/Remove/Update Samples + | + ▼ + Commit + +Log Style Output: + +``` +* 5254ec (master) : merge commit combining training updates and new validation samples +|\ +| * 650361 (add-validation-data) : Add validation labels and image data in isolated branch +* | 5f15b4 : Add some metadata for later reference and add new training samples received after initial import +|/ +* baddba : Initial commit adding training images and labels +``` + +Learn more about what Hangar is all about at + + +Installation +------------ + +Hangar is in early alpha development release! + + pip install hangar + +Documentation +------------- + + + +Development +----------- + +To run the all tests run: + + tox + +Note, to combine the coverage data from all the tox environments run: + ++------+---------------------------------------------------------------+ +| Wind | set PYTEST_ADDOPTS=--cov-append | +| ows | tox | ++------+---------------------------------------------------------------+ +| Othe | PYTEST_ADDOPTS=--cov-append tox | +| r | | ++------+---------------------------------------------------------------+ diff --git a/README.rst b/README.rst deleted file mode 100644 index a3f2463f..00000000 --- a/README.rst +++ /dev/null @@ -1,160 +0,0 @@ -======== -Overview -======== - -.. start-badges - -.. list-table:: - :stub-columns: 1 - - * - docs - - |docs| - * - tests - - | |gh-build-status| |codecov| - | |lgtm| - * - package - - | |version| |wheel| |conda-forge| - | |supported-versions| |supported-implementations| - | |license| -.. |docs| image:: https://readthedocs.org/projects/hangar-py/badge/?style=flat - :target: https://readthedocs.org/projects/hangar-py - :alt: Documentation Status - -.. |gh-build-status| image:: https://github.com/tensorwerk/hangar-py/workflows/Run%20Test%20Suite/badge.svg?branch=master - :alt: Build Status - :target: https://github.com/tensorwerk/hangar-py/actions?query=workflow%3A%22Run+Test+Suite%22+branch%3Amaster+event%3Apush+is%3Acompleted - -.. |codecov| image:: https://codecov.io/gh/tensorwerk/hangar-py/branch/master/graph/badge.svg - :alt: Code Coverage - :target: https://codecov.io/gh/tensorwerk/hangar-py - -.. |lgtm| image:: https://img.shields.io/lgtm/grade/python/g/tensorwerk/hangar-py.svg?logo=lgtm&logoWidth=18 - :alt: Language grade: Python - :target: https://lgtm.com/projects/g/tensorwerk/hangar-py/context:python - -.. 
|version| image:: https://img.shields.io/pypi/v/hangar.svg - :alt: PyPI Package latest release - :target: https://pypi.org/project/hangar - -.. |license| image:: https://img.shields.io/github/license/tensorwerk/hangar-py - :alt: GitHub license - :target: https://github.com/tensorwerk/hangar-py/blob/master/LICENSE - -.. |conda-forge| image:: https://img.shields.io/conda/vn/conda-forge/hangar.svg - :alt: Conda-Forge Latest Version - :target: https://anaconda.org/conda-forge/hangar - -.. |wheel| image:: https://img.shields.io/pypi/wheel/hangar.svg - :alt: PyPI Wheel - :target: https://pypi.org/project/hangar - -.. |supported-versions| image:: https://img.shields.io/pypi/pyversions/hangar.svg - :alt: Supported versions - :target: https://pypi.org/project/hangar - -.. |supported-implementations| image:: https://img.shields.io/pypi/implementation/hangar.svg - :alt: Supported implementations - :target: https://pypi.org/project/hangar - - -.. end-badges - -Hangar is version control for tensor data. Commit, branch, merge, revert, and -collaborate in the data-defined software era. - -* Free software: Apache 2.0 license - -What is Hangar? -=============== - -Hangar is based off the belief that too much time is spent collecting, managing, -and creating home-brewed version control systems for data. At it's core Hangar -is designed to solve many of the same problems faced by traditional code version -control system (ie. ``Git``), just adapted for numerical data: - -* Time travel through the historical evolution of a dataset. -* Zero-cost Branching to enable exploratory analysis and collaboration -* Cheap Merging to build datasets over time (with multiple collaborators) -* Completely abstracted organization and management of data files on disk -* Ability to only retrieve a small portion of the data (as needed) while still - maintaining complete historical record -* Ability to push and pull changes directly to collaborators or a central server - (ie a truly distributed version control system) - -The ability of version control systems to perform these tasks for codebases is -largely taken for granted by almost every developer today; However, we are -in-fact standing on the shoulders of giants, with decades of engineering which -has resulted in these phenomenally useful tools. Now that a new era of -"Data-Defined software" is taking hold, we find there is a strong need for -analogous version control systems which are designed to handle numerical data at -large scale... Welcome to Hangar! - - -The Hangar Workflow: - -:: - - Checkout Branch - | - ▼ - Create/Access Data - | - ▼ - Add/Remove/Update Samples - | - ▼ - Commit - -Log Style Output: - -.. code-block:: text - - * 5254ec (master) : merge commit combining training updates and new validation samples - |\ - | * 650361 (add-validation-data) : Add validation labels and image data in isolated branch - * | 5f15b4 : Add some metadata for later reference and add new training samples received after initial import - |/ - * baddba : Initial commit adding training images and labels - - -Learn more about what Hangar is all about at https://hangar-py.readthedocs.io/ - - -Installation -============ - -Hangar is in early alpha development release! - -:: - - pip install hangar - -Documentation -============= - -https://hangar-py.readthedocs.io/ - - -Development -=========== - -To run the all tests run:: - - tox - -Note, to combine the coverage data from all the tox environments run: - -.. 
list-table:: - :widths: 10 90 - :stub-columns: 1 - - - - Windows - - :: - - set PYTEST_ADDOPTS=--cov-append - tox - - - - Other - - :: - - PYTEST_ADDOPTS=--cov-append tox diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..ebc76a63 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,141 @@ +Overview +======== + ++-----------------------------------+-----------------------------------+ +| docs | | ++-----------------------------------+-----------------------------------+ +| tests | | [![Build Status](https://github | +| | .com/tensorwerk/hangar-py/workflo | +| | ws/Run%20Test%20Suite/badge.svg?b | +| | ranch=master)](https://github.com | +| | /tensorwerk/hangar-py/actions?que | +| | ry=workflow%3A%22Run+Test+Suite%2 | +| | 2+branch%3Amaster+event%3Apush+is | +| | %3Acompleted) | +| | [![Code Coverage](https://codec | +| | ov.io/gh/tensorwerk/hangar-py/bra | +| | nch/master/graph/badge.svg)](http | +| | s://codecov.io/gh/tensorwerk/hang | +| | ar-py) | +| | | [![Language grade: Python](http | +| | s://img.shields.io/lgtm/grade/pyt | +| | hon/g/tensorwerk/hangar-py.svg?lo | +| | go=lgtm&logoWidth=18)](https://lg | +| | tm.com/projects/g/tensorwerk/hang | +| | ar-py/context:python) | ++-----------------------------------+-----------------------------------+ +| package | | [![PyPI Package latest release] | +| | (https://img.shields.io/pypi/v/ha | +| | ngar.svg)](https://pypi.org/proje | +| | ct/hangar) | +| | [![PyPI Wheel](https://img.shie | +| | lds.io/pypi/wheel/hangar.svg)](ht | +| | tps://pypi.org/project/hangar) | +| | [![Conda-Forge Latest Version]( | +| | https://img.shields.io/conda/vn/c | +| | onda-forge/hangar.svg)](https://a | +| | naconda.org/conda-forge/hangar) | +| | | [![Supported versions](https:// | +| | img.shields.io/pypi/pyversions/ha | +| | ngar.svg)](https://pypi.org/proje | +| | ct/hangar) | +| | [![Supported implementations](h | +| | ttps://img.shields.io/pypi/implem | +| | entation/hangar.svg)](https://pyp | +| | i.org/project/hangar) | +| | | [![GitHub license](https://img. | +| | shields.io/github/license/tensorw | +| | erk/hangar-py)](https://github.co | +| | m/tensorwerk/hangar-py/blob/maste | +| | r/LICENSE) | ++-----------------------------------+-----------------------------------+ + +Hangar is version control for tensor data. Commit, branch, merge, +revert, and collaborate in the data-defined software era. + +- Free software: Apache 2.0 license + +What is Hangar? +--------------- + +Hangar is based off the belief that too much time is spent collecting, +managing, and creating home-brewed version control systems for data. At +it\'s core Hangar is designed to solve many of the same problems faced +by traditional code version control system (ie. `Git`), just adapted for +numerical data: + +- Time travel through the historical evolution of a dataset. 
+- Zero-cost Branching to enable exploratory analysis and collaboration +- Cheap Merging to build datasets over time (with multiple + collaborators) +- Completely abstracted organization and management of data files on + disk +- Ability to only retrieve a small portion of the data (as needed) + while still maintaining complete historical record +- Ability to push and pull changes directly to collaborators or a + central server (ie a truly distributed version control system) + +The ability of version control systems to perform these tasks for +codebases is largely taken for granted by almost every developer today; +However, we are in-fact standing on the shoulders of giants, with +decades of engineering which has resulted in these phenomenally useful +tools. Now that a new era of \"Data-Defined software\" is taking hold, +we find there is a strong need for analogous version control systems +which are designed to handle numerical data at large scale\... Welcome +to Hangar! + +The Hangar Workflow: + + Checkout Branch + | + ▼ + Create/Access Data + | + ▼ + Add/Remove/Update Samples + | + ▼ + Commit + +Log Style Output: + +``` +* 5254ec (master) : merge commit combining training updates and new validation samples +|\ +| * 650361 (add-validation-data) : Add validation labels and image data in isolated branch +* | 5f15b4 : Add some metadata for later reference and add new training samples received after initial import +|/ +* baddba : Initial commit adding training images and labels +``` + +Learn more about what Hangar is all about at + + +Installation +------------ + +Hangar is in early alpha development release! + + pip install hangar + +Documentation +------------- + + + +Development +----------- + +To run the all tests run: + + tox + +Note, to combine the coverage data from all the tox environments run: + ++------+---------------------------------------------------------------+ +| Wind | set PYTEST_ADDOPTS=--cov-append | +| ows | tox | ++------+---------------------------------------------------------------+ +| Othe | PYTEST_ADDOPTS=--cov-append tox | +| r | | ++------+---------------------------------------------------------------+ diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 00000000..933cdb4b --- /dev/null +++ b/docs/api.md @@ -0,0 +1,75 @@ +Python API +========== + +This is the python API for the Hangar project. 
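+
+As a quick orientation before the reference sections that follow, here is a
+minimal sketch of a write/read round-trip. It is illustrative only: the
+repository path, user identity, column name, and sample key are placeholders,
+and the exact signatures (e.g. `add_ndarray_column`, the `exists` flag) should
+be confirmed against the generated reference entries below.
+
+```python
+import numpy as np
+from hangar import Repository
+
+# Placeholder location / identity -- substitute real values.
+repo = Repository(path="/path/to/repo", exists=False)
+repo.init(user_name="Your Name", user_email="you@example.com")
+
+# Write-enabled checkout: declare a column, add one sample, commit.
+co = repo.checkout(write=True)
+co.add_ndarray_column("images", prototype=np.zeros((28, 28), dtype=np.float32))
+co.columns["images"]["sample_0"] = np.random.rand(28, 28).astype(np.float32)
+co.commit("add first image sample")
+co.close()
+
+# Read-only checkout: the sample comes back as a plain numpy array.
+co = repo.checkout()
+arr = co.columns["images"]["sample_0"]
+co.close()
+```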
+ +Repository +---------- + +::: hangar.repository + +Remotes +------- + +::: hangar.remotes.Remotes + +Write Enabled Checkout +---------------------- + +### Checkout + +::: hangar.checkout.WriterCheckout + +### Columns + +::: hangar.columns.column.Columns + +### Flat Column Layout Container + +::: hangar.columns.layout_flat.FlatSampleWriter + +### Nested Column Layout Container + +::: hangar.columns.layout_nested.NestedSampleWriter + +::: hangar.columns.layout_nested.FlatSubsampleWriter + +### Differ + +::: hangar.diff.WriterUserDiff + +### Bulk Importer + +::: hangar.bulk_importer.run_bulk_import + +Read Only Checkout +------------------ + +### Checkout + +::: hangar.checkout.ReaderCheckout + +### Flat Column Layout Container + +::: hangar.columns.layout_flat.FlatSampleReader + +### Nested Column Layout Container + +::: hangar.columns.layout_nested.NestedSampleReader + +:::hangar.columns.layout_nested.FlatSubsampleReader + +### Differ + +::: hangar.diff.ReaderUserDiff + +ML Framework Dataloaders +------------------------ + +### Tensorflow + +::: hangar.make_tf_dataset + +### Pytorch + +::: hangar.make_torch_dataset diff --git a/docs/api.rst b/docs/api.rst deleted file mode 100644 index 47aa0971..00000000 --- a/docs/api.rst +++ /dev/null @@ -1,140 +0,0 @@ -.. _ref-api: - -========== -Python API -========== - -This is the python API for the Hangar project. - - -Repository -========== - -.. automodule:: hangar.repository - :members: - -Remotes -======= - -.. autoclass:: Remotes() - :members: - :exclude-members: __init__ - - -Write Enabled Checkout -====================== - -Checkout --------- - -.. autoclass:: hangar.checkout.WriterCheckout() - :members: - :inherited-members: - :special-members: __getitem__, __setitem__, __len__, __contains__, __iter__ - :exclude-members: __init__ - -Columns -------- - -.. autoclass:: hangar.columns.column.Columns() - :members: - :special-members: __getitem__, __setitem__, __delitem__, __contains__, __len__, __iter__ - :exclude-members: __init__ - -Flat Column Layout Container ----------------------------- - -.. autoclass:: hangar.columns.layout_flat.FlatSampleWriter() - :members: - :inherited-members: - :special-members: __getitem__, __setitem__, __delitem__, __contains__, __len__, __iter__ - :exclude-members: __init__ - -Nested Column Layout Container ------------------------------- - -.. autoclass:: hangar.columns.layout_nested.NestedSampleWriter() - :members: - :inherited-members: - :special-members: __getitem__, __setitem__, __delitem__, __contains__, __len__, __iter__ - :exclude-members: __init__ - -.. autoclass:: hangar.columns.layout_nested.FlatSubsampleWriter() - :members: - :inherited-members: - :special-members: __getitem__, __setitem__, __delitem__, __contains__, __len__, __iter__ - :exclude-members: __init__ - -Differ ------- - -.. autoclass:: hangar.diff.WriterUserDiff() - :members: - :exclude-members: __init__ - -Bulk Importer -------------- - -.. automodule:: hangar.bulk_importer - :members: - - -Read Only Checkout -================== - -Checkout --------- - -.. autoclass:: hangar.checkout.ReaderCheckout() - :members: - :inherited-members: - :special-members: __getitem__, __len__, __contains__, __iter__ - :exclude-members: __init__ - - -Flat Column Layout Container ----------------------------- - -.. 
autoclass:: hangar.columns.layout_flat.FlatSampleReader() - :members: - :inherited-members: - :special-members: __getitem__, __setitem__, __contains__, __len__, __iter__ - :exclude-members: __init__ - - -Nested Column Layout Container ------------------------------- - -.. autoclass:: hangar.columns.layout_nested.NestedSampleReader() - :members: - :inherited-members: - :special-members: __getitem__, __contains__, __len__, __iter__ - :exclude-members: __init__ - -.. autoclass:: hangar.columns.layout_nested.FlatSubsampleReader() - :members: - :inherited-members: - :special-members: __getitem__,, __contains__, __len__, __iter__ - :exclude-members: __init__ - - -Differ ------- - -.. autoclass:: hangar.diff.ReaderUserDiff() - :members: - :exclude-members: __init__ - - -ML Framework Dataloaders -======================== - -Tensorflow ----------- - -.. autofunction:: hangar.make_tf_dataset - -Pytorch -------- - -.. autofunction:: hangar.make_torch_dataset diff --git a/docs/authors.md b/docs/authors.md new file mode 100644 index 00000000..0a9625b5 --- /dev/null +++ b/docs/authors.md @@ -0,0 +1,7 @@ +Authors +======= + +- Richard Izzo - +- Luca Antiga - +- Sherin Thomas - +- Alessia Marcolini - diff --git a/docs/authors.rst b/docs/authors.rst deleted file mode 100644 index e122f914..00000000 --- a/docs/authors.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../AUTHORS.rst diff --git a/docs/backends.rst b/docs/backends.rst deleted file mode 100644 index 8e8f9f0d..00000000 --- a/docs/backends.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. _ref-backends: - -.. note:: - - The following documentation contains highly technical descriptions of the - data writing and loading backends of the Hangar core. It is intended for - developer use only, with the functionality described herein being completely - hidden from regular users. - - Any questions or comments can be directed to the `Hangar Github Issues Page - `_ - -================= -Backend selection -================= - -.. automodule:: hangar.backends.__init__ - - -Backend Specifications -====================== - -.. toctree:: - :maxdepth: 2 - :titlesonly: - - ./backends/hdf5_00 - ./backends/hdf5_01 - ./backends/numpy_10 - ./backends/lmdb_30 - ./backends/remote_50 diff --git a/docs/backends/backends.md b/docs/backends/backends.md new file mode 100644 index 00000000..1ea73ac9 --- /dev/null +++ b/docs/backends/backends.md @@ -0,0 +1,15 @@ +Backend selection +================= + +!!! note + + The following documentation contains highly technical descriptions of + the data writing and loading backends of the Hangar core. It is intended + for developer use only, with the functionality described herein being + completely hidden from regular users. + + Any questions or comments can be directed to the [Hangar Github Issues + Page](https://github.com/tensorwerk/hangar-py/issues) + +::: hangar.backends.__init__ + diff --git a/docs/backends/hdf5_00.md b/docs/backends/hdf5_00.md new file mode 100644 index 00000000..ecaa0d3a --- /dev/null +++ b/docs/backends/hdf5_00.md @@ -0,0 +1,4 @@ +Local HDF5 Backend +================== + +::: hangar.backends.hdf5_00 diff --git a/docs/backends/hdf5_00.rst b/docs/backends/hdf5_00.rst deleted file mode 100644 index 1d500e39..00000000 --- a/docs/backends/hdf5_00.rst +++ /dev/null @@ -1,4 +0,0 @@ -Local HDF5 Backend -================== - -.. 
automodule:: hangar.backends.hdf5_00 diff --git a/docs/backends/hdf5_01.rst b/docs/backends/hdf5_01.md similarity index 63% rename from docs/backends/hdf5_01.rst rename to docs/backends/hdf5_01.md index d39ac4cf..95e68cfe 100644 --- a/docs/backends/hdf5_01.rst +++ b/docs/backends/hdf5_01.md @@ -1,4 +1,4 @@ Fixed Shape Optimized Local HDF5 ================================ -.. automodule:: hangar.backends.hdf5_01 \ No newline at end of file +::: hangar.backends.hdf5_01 diff --git a/docs/backends/lmdb_30.rst b/docs/backends/lmdb_30.md similarity index 65% rename from docs/backends/lmdb_30.rst rename to docs/backends/lmdb_30.md index c5f652e4..0fe28d29 100644 --- a/docs/backends/lmdb_30.rst +++ b/docs/backends/lmdb_30.md @@ -1,4 +1,4 @@ Variable Shape LMDB String Data Store ===================================== -.. automodule:: hangar.backends.lmdb_30 +::: hangar.backends.lmdb_30 diff --git a/docs/backends/numpy_10.rst b/docs/backends/numpy_10.md similarity index 54% rename from docs/backends/numpy_10.rst rename to docs/backends/numpy_10.md index 30d47f45..b98a45f2 100644 --- a/docs/backends/numpy_10.rst +++ b/docs/backends/numpy_10.md @@ -1,4 +1,4 @@ Local NP Memmap Backend ======================= -.. automodule:: hangar.backends.numpy_10 +::: hangar.backends.numpy_10 diff --git a/docs/backends/remote_50.rst b/docs/backends/remote_50.md similarity index 59% rename from docs/backends/remote_50.rst rename to docs/backends/remote_50.md index 901a7e7b..2fb22095 100644 --- a/docs/backends/remote_50.rst +++ b/docs/backends/remote_50.md @@ -1,4 +1,4 @@ Remote Server Unknown Backend ============================= -.. automodule:: hangar.backends.remote_50 +::: hangar.backends.remote_50 diff --git a/docs/benchmarking.md b/docs/benchmarking.md new file mode 100644 index 00000000..047c0110 --- /dev/null +++ b/docs/benchmarking.md @@ -0,0 +1,139 @@ +Hangar Performance Benchmarking Suite +===================================== + +A set of benchmarking tools are included in order to track the +performance of common hangar operations over the course of time. The +benchmark suite is run via the phenomenal [Airspeed Velocity +(ASV)](https://asv.readthedocs.io/) project. + +Benchmarks can be viewed at the following web link, or by examining the +raw data files in the separate benchmark results repo. + +- [Benchmark Web View](https://tensorwerk.com/hangar-benchmarks) +- [Benchmark Results + Repo](https://github.com/tensorwerk/hangar-benchmarks) + +![](./img/asv-detailed.png){.align-center} + +Purpose +------- + +In addition to providing historical metrics and insight into application +performance over many releases of Hangar, \*the benchmark suite is used +as a canary to identify potentially problematic pull requests.\* All PRs +to the Hangar repository are automatically benchmarked by our CI system +to compare the performance of proposed changes to that of the current +`master` branch. + +\*The results of this canary are explicitly NOT to be used as the +\"be-all-end-all\" decider of whether a PR is suitable to be merged or +not.\* + +Instead, it is meant to serve the following purposes: + +1. \*\*Help contributors understand the consequences of some set of + changes on the greater system early in the PR process.\*\* Simple + code is best; if there\'s no obvious performance degradation or + significant improvement to be had, then there\'s no need (or really + rationale) for using more complex algorithms or data structures. 
+ It\'s more work for the author, project maintainers, and long term + health of the codebase. +2. \*\*Not everything can be caught by the capabilities of a + traditional test suite.\*\* Hangar is fairly flat/modular in + structure, but there are certain hotspots in the codebase where a + simple change could drastically degrade performance. It\'s not + always obvious where these hotspots are, and even a change which is + functionally identical (introducing no issues/bugs to the end user) + can unknowingly cross a line and introduce some large regression + completely unnoticed to the authors/reviewers. +3. Sometimes tradeoffs need to be made when introducing something new + to a system. Whether this be due to fundamental CS problems (space + vs. time) or simple matters of practicality vs. purity, it\'s always + easier to act in environments where relevant information is + available before a decision is made. \*\*Identifying and quantifying + tradeoffs/regressions/benefits during development is the only way we + can make informed decisions.\*\* The only times to be OK with some + regression is when knowing about it in advance, it might be the + right choice at the time, but if we don\'t measure we will never + know. + +Important Notes on Using/Modifying the Benchmark Suite +------------------------------------------------------ + +1. \*\*Do not commit any of the benchmark results, environment files, + or generated visualizations to the repository\*\*. We store + benchmark results in a [separate repository + \]() so to not + clutter the main repo with un-necessary data. The default + directories these are generated in are excluded in our `.gitignore` + config, so baring some unusual git usage patterns, this should not + be a day-to-day concern. +2. Proposed changes to the benchmark suite should be made to the code + in this repository first. The benchmark results repository mirror + will be synchronized upon approval/merge of changes to the main + Hangar repo. + +Introduction to Running Benchmarks +---------------------------------- + +As ASV sets up and manages it\'s own virtual environments and source +installations, benchmark execution is not run via `tox`. While a brief +tutorial is included below, please refer to the [ASV Docs +\]() for detailed information on how to +both run, understand, and write ASV benchmarks. + +### First Time Setup + +1. Ensure that `virtualenv`, `setuptools`, `pip` are updated to the + latest version. +2. Install ASV `$ pip install asv`. +3. Open a terminal and navigate to the `hangar-py/asv-bench` directory. +4. Run `$ asv machine` to record details of your machine, it is OK to + just use the defaults. + +### Running Benchmarks + +Refer to the [using ASV +\]() +page for a full tutorial, paying close attention to the [asv run +\]() +command. Generally `asv run` requires a range of commits to benchmark +across (specified via either branch name, tags, or commit digests). + +To benchmark every commit between the current master `HEAD` and +`v0.3.0`, you would execute: + + $ asv run v0.2.0..master + +However, this may result in a larger workload then you are willing to +wait around for. To limit the number of commits, you can specify the +`--steps=N` option to only benchmark `N` commits at most between `HEAD` +and `v0.3.0`. + +The most useful tool during development is the [asv continuous +\]() +command. 
using the following syntax will benchmark any changes in a +local development branch against the base `master` commit: + + $ asv continuous origin/master HEAD + +Running [asv compare +\]() +will generate a quick summary of any performance differences: + + $ asv compare origin/master HEAD + +### Visualizing Results + +After generating benchmark data for a number of commits through history, +the results can be reviewed in (an automatically generated) local web +interface by running the following commands: + + $ asv publish + $ asv preview + +Navigating to `http://127.0.0.1:8080/` will pull up an interactive +webpage where the full set of benchmark graphs/explorations utilities +can be viewed. This will look something like the image below. + +![](./img/asv-main.png){.align-center} diff --git a/docs/benchmarking.rst b/docs/benchmarking.rst deleted file mode 100644 index 56dc8349..00000000 --- a/docs/benchmarking.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../asv_bench/README.rst \ No newline at end of file diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 00000000..6d78f9b3 --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,447 @@ +Change Log +========== + +[v0.5.2]() (2020-05-08) +----------------------- + +### New Features + +- New column data type supporting arbitrary `bytes` data. + ([\#198](https://github.com/tensorwerk/hangar-py/pull/198)) + [\@rlizzo](https://github.com/rlizzo) + +### Improvements + +- `str` typed columns can now accept data containing any unicode + code-point. In prior releases data containing any `non-ascii` + character could not be written to this column type. + ([\#198](https://github.com/tensorwerk/hangar-py/pull/198)) + [\@rlizzo](https://github.com/rlizzo) + +### Bug Fixes + +- Fixed issue where `str` and (newly added) `bytes` column data could + not be fetched / pushed between a local client repository and remote + server. ([\#198](https://github.com/tensorwerk/hangar-py/pull/198)) + [\@rlizzo](https://github.com/rlizzo) + +[v0.5.1]() (2020-04-05) +----------------------- + +### BugFixes + +- Fixed issue where importing `make_torch_dataloader` or + `make_tf_dataloader` under python 3.6 Would raise a `NameError` + irrigardless of if the package is installed. + ([\#196](https://github.com/tensorwerk/hangar-py/pull/196)) + [\@rlizzo](https://github.com/rlizzo) + +[v0.5.0]() (2020-04-4) +---------------------- + +### Improvements + +- Python 3.8 is now fully supported. + ([\#193](https://github.com/tensorwerk/hangar-py/pull/193)) + [\@rlizzo](https://github.com/rlizzo) +- Major backend overhaul which defines column layouts and data types + in the same interchangable / extensable manner as storage backends. + This will allow rapid development of new layouts and data type + support as new use cases are discovered by the community. + ([\#184](https://github.com/tensorwerk/hangar-py/pull/184)) + [\@rlizzo](https://github.com/rlizzo) +- Column and backend classes are now fully serializable (pickleable) + for `read-only` checkouts. + ([\#180](https://github.com/tensorwerk/hangar-py/pull/180)) + [\@rlizzo](https://github.com/rlizzo) +- Modularized internal structure of API classes to easily allow new + columnn layouts / data types to be added in the future. + ([\#180](https://github.com/tensorwerk/hangar-py/pull/180)) + [\@rlizzo](https://github.com/rlizzo) +- Improved type / value checking of manual specification for column + `backend` and `backend_options`. 
+ ([\#180](https://github.com/tensorwerk/hangar-py/pull/180)) + [\@rlizzo](https://github.com/rlizzo) +- Standardized column data access API to follow python standard + library `dict` methods API. + ([\#180](https://github.com/tensorwerk/hangar-py/pull/180)) + [\@rlizzo](https://github.com/rlizzo) +- Memory usage of arrayset checkouts has been reduced by \~70% by + using C-structs for allocating sample record locating info. + ([\#179](https://github.com/tensorwerk/hangar-py/pull/179)) + [\@rlizzo](https://github.com/rlizzo) +- Read times from the `HDF5_00` and `HDF5_01` backend have been + reduced by 33-38% (or more for arraysets with many samples) by + eliminating redundant computation of chunked storage B-Tree. + ([\#179](https://github.com/tensorwerk/hangar-py/pull/179)) + [\@rlizzo](https://github.com/rlizzo) +- Commit times and checkout times have been reduced by 11-18% by + optimizing record parsing and memory allocation. + ([\#179](https://github.com/tensorwerk/hangar-py/pull/179)) + [\@rlizzo](https://github.com/rlizzo) + +### New Features + +- Added `str` type column with same behavior as `ndarray` column + (supporting both single-level and nested layouts) added to replace + functionality of removed `metadata` container. + ([\#184](https://github.com/tensorwerk/hangar-py/pull/184)) + [\@rlizzo](https://github.com/rlizzo) +- New backend based on `LMDB` has been added (specifier of `lmdb_30`). + ([\#184](https://github.com/tensorwerk/hangar-py/pull/184)) + [\@rlizzo](https://github.com/rlizzo) +- Added `.diff()` method to `Repository` class to enable diffing + changes between any pair of commits / branches without needing to + open the diff base in a checkout. + ([\#183](https://github.com/tensorwerk/hangar-py/pull/183)) + [\@rlizzo](https://github.com/rlizzo) +- New CLI command `hangar diff` which reports a summary view of + changes made between any pair of commits / branches. + ([\#183](https://github.com/tensorwerk/hangar-py/pull/183)) + [\@rlizzo](https://github.com/rlizzo) +- Added `.log()` method to `Checkout` objects so graphical commit + graph or machine readable commit details / DAG can be queried when + operating on a particular commit. + ([\#183](https://github.com/tensorwerk/hangar-py/pull/183)) + [\@rlizzo](https://github.com/rlizzo) +- \"string\" type columns now supported alongside \"ndarray\" column + type. ([\#180](https://github.com/tensorwerk/hangar-py/pull/180)) + [\@rlizzo](https://github.com/rlizzo) +- New \"column\" API, which replaces \"arrayset\" name. + ([\#180](https://github.com/tensorwerk/hangar-py/pull/180)) + [\@rlizzo](https://github.com/rlizzo) +- Arraysets can now contain \"nested subsamples\" under a common + sample key. + ([\#179](https://github.com/tensorwerk/hangar-py/pull/179)) + [\@rlizzo](https://github.com/rlizzo) +- New API to add and remove samples from and arrayset. + ([\#179](https://github.com/tensorwerk/hangar-py/pull/179)) + [\@rlizzo](https://github.com/rlizzo) +- Added `repo.size_nbytes` and `repo.size_human` to report disk usage + of a repository on disk. + ([\#174](https://github.com/tensorwerk/hangar-py/pull/174)) + [\@rlizzo](https://github.com/rlizzo) +- Added method to traverse the entire repository history and + cryptographically verify integrity. + ([\#173](https://github.com/tensorwerk/hangar-py/pull/173)) + [\@rlizzo](https://github.com/rlizzo) + +### Changes + +- Argument syntax of `__getitem__()` and `get()` methods of + `ReaderCheckout` and `WriterCheckout` classes. 
The new format + supports handeling arbitrary arguments specific to retrieval of data + from any column type. + ([\#183](https://github.com/tensorwerk/hangar-py/pull/183)) + [\@rlizzo](https://github.com/rlizzo) + +### Removed + +- `metadata` container for `str` typed data has been completly + removed. It is replaced by a highly extensible and much more + user-friendly `str` typed column. + ([\#184](https://github.com/tensorwerk/hangar-py/pull/184)) + [\@rlizzo](https://github.com/rlizzo) +- `__setitem__()` method in `WriterCheckout` objects. Writing data to + columns via a checkout object is no longer supported. + ([\#183](https://github.com/tensorwerk/hangar-py/pull/183)) + [\@rlizzo](https://github.com/rlizzo) + +### Bug Fixes + +- Backend data stores no longer use file symlinks, improving + compatibility with some types file systems. + ([\#171](https://github.com/tensorwerk/hangar-py/pull/171)) + [\@rlizzo](https://github.com/rlizzo) +- All arrayset types (\"flat\" and \"nested subsamples\") and backend + readers can now be pickled \-- for parallel processing \-- in a + read-only checkout. + ([\#179](https://github.com/tensorwerk/hangar-py/pull/179)) + [\@rlizzo](https://github.com/rlizzo) + +### Breaking changes + +- New backend record serialization format is incompatible with + repositories written in version 0.4 or earlier. +- New arrayset API is incompatible with Hangar API in version 0.4 or + earlier. + +[v0.4.0]() (2019-11-21) +----------------------- + +### New Features + +- Added ability to delete branch names/pointers from a local + repository via both API and CLI. + ([\#128](https://github.com/tensorwerk/hangar-py/pull/128)) + [\@rlizzo](https://github.com/rlizzo) +- Added `local` keyword arg to arrayset key/value iterators to return + only locally available samples + ([\#131](https://github.com/tensorwerk/hangar-py/pull/131)) + [\@rlizzo](https://github.com/rlizzo) +- Ability to change the backend storage format and options applied to + an `arrayset` after initialization. + ([\#133](https://github.com/tensorwerk/hangar-py/pull/133)) + [\@rlizzo](https://github.com/rlizzo) +- Added blosc compression to HDF5 backend by default on PyPi + installations. + ([\#146](https://github.com/tensorwerk/hangar-py/pull/146)) + [\@rlizzo](https://github.com/rlizzo) +- Added Benchmarking Suite to Test for Performance Regressions in PRs. + ([\#155](https://github.com/tensorwerk/hangar-py/pull/155)) + [\@rlizzo](https://github.com/rlizzo) +- Added new backend optimized to increase speeds for fixed size + arrayset access. + ([\#160](https://github.com/tensorwerk/hangar-py/pull/160)) + [\@rlizzo](https://github.com/rlizzo) + +### Improvements + +- Removed `msgpack` and `pyyaml` dependencies. Cleaned up and improved + remote client/server code. + ([\#130](https://github.com/tensorwerk/hangar-py/pull/130)) + [\@rlizzo](https://github.com/rlizzo) +- Multiprocess Torch DataLoaders allowed on Linux and MacOS. + ([\#144](https://github.com/tensorwerk/hangar-py/pull/144)) + [\@rlizzo](https://github.com/rlizzo) +- Added CLI options `commit`, `checkout`, `arrayset create`, & + `arrayset remove`. + ([\#150](https://github.com/tensorwerk/hangar-py/pull/150)) + [\@rlizzo](https://github.com/rlizzo) +- Plugin system revamp. + ([\#134](https://github.com/tensorwerk/hangar-py/pull/134)) + [\@hhsecond](https://github.com/hhsecond) +- Documentation Improvements and Typo-Fixes. 
+ ([\#156](https://github.com/tensorwerk/hangar-py/pull/156)) + [\@alessiamarcolini](https://github.com/alessiamarcolini) +- Removed implicit removal of arrayset schema from checkout if every + sample was removed from arrayset. This could potentially result in + dangling accessors which may or may not self-destruct (as expected) + in certain edge-cases. + ([\#159](https://github.com/tensorwerk/hangar-py/pull/159)) + [\@rlizzo](https://github.com/rlizzo) +- Added type codes to hash digests so that calculation function can be + updated in the future without breaking repos written in previous + Hangar versions. + ([\#165](https://github.com/tensorwerk/hangar-py/pull/165)) + [\@rlizzo](https://github.com/rlizzo) + +### Bug Fixes + +- Programatic access to repository log contents now returns branch + heads alongside other log info. + ([\#125](https://github.com/tensorwerk/hangar-py/pull/125)) + [\@rlizzo](https://github.com/rlizzo) +- Fixed minor bug in types of values allowed for `Arrayset` names vs + `Sample` names. + ([\#151](https://github.com/tensorwerk/hangar-py/pull/151)) + [\@rlizzo](https://github.com/rlizzo) +- Fixed issue where using checkout object to access a sample in + multiple arraysets would try to create a `namedtuple` instance with + invalid field names. Now incompatible field names are automatically + renamed with their positional index. + ([\#161](https://github.com/tensorwerk/hangar-py/pull/161)) + [\@rlizzo](https://github.com/rlizzo) +- Explicitly raise error if `commit` argument is set while checking + out a repository with `write=True`. + ([\#166](https://github.com/tensorwerk/hangar-py/pull/166)) + [\@rlizzo](https://github.com/rlizzo) + +### Breaking changes + +- New commit reference serialization format is incompatible with + repositories written in version 0.3.0 or earlier. + +[v0.3.0]() (2019-09-10) +----------------------- + +### New Features + +- API addition allowing reading and writing arrayset data from a + checkout object directly. + ([\#115](https://github.com/tensorwerk/hangar-py/pull/115)) + [\@rlizzo](https://github.com/rlizzo) +- Data importer, exporters, and viewers via CLI for common file + formats. Includes plugin system for easy extensibility in the + future. ([\#103](https://github.com/tensorwerk/hangar-py/pull/103)) + ([\@rlizzo](https://github.com/rlizzo), + [\@hhsecond](https://github.com/hhsecond)) + +### Improvements + +- Added tutorial on working with remote data. + ([\#113](https://github.com/tensorwerk/hangar-py/pull/113)) + [\@rlizzo](https://github.com/rlizzo) +- Added Tutorial on Tensorflow and PyTorch Dataloaders. + ([\#117](https://github.com/tensorwerk/hangar-py/pull/117)) + [\@hhsecond](https://github.com/hhsecond) +- Large performance improvement to diff/merge algorithm (\~30x + previous). + ([\#112](https://github.com/tensorwerk/hangar-py/pull/112)) + [\@rlizzo](https://github.com/rlizzo) +- New commit hash algorithm which is much more reproducible in the + long term. + ([\#120](https://github.com/tensorwerk/hangar-py/pull/120)) + [\@rlizzo](https://github.com/rlizzo) +- HDF5 backend updated to increase speed of reading/writing variable + sized dataset compressed chunks + ([\#120](https://github.com/tensorwerk/hangar-py/pull/120)) + [\@rlizzo](https://github.com/rlizzo) + +### Bug Fixes + +- Fixed ML Dataloaders errors for a number of edge cases surrounding + partial-remote data and non-common keys. 
+ ([\#110](https://github.com/tensorwerk/hangar-py/pull/110)) ( + [\@hhsecond](https://github.com/hhsecond), + [\@rlizzo](https://github.com/rlizzo)) + +### Breaking changes + +- New commit hash algorithm is incompatible with repositories written + in version 0.2.0 or earlier + +[v0.2.0]() (2019-08-09) +----------------------- + +### New Features + +- Numpy memory-mapped array file backend added. + ([\#70](https://github.com/tensorwerk/hangar-py/pull/70)) + [\@rlizzo](https://github.com/rlizzo) +- Remote server data backend added. + ([\#70](https://github.com/tensorwerk/hangar-py/pull/70)) + [\@rlizzo](https://github.com/rlizzo) +- Selection heuristics to determine appropriate backend from arrayset + schema. ([\#70](https://github.com/tensorwerk/hangar-py/pull/70)) + [\@rlizzo](https://github.com/rlizzo) +- Partial remote clones and fetch operations now fully supported. + ([\#85](https://github.com/tensorwerk/hangar-py/pull/85)) + [\@rlizzo](https://github.com/rlizzo) +- CLI has been placed under test coverage, added interface usage to + docs. ([\#85](https://github.com/tensorwerk/hangar-py/pull/85)) + [\@rlizzo](https://github.com/rlizzo) +- TensorFlow and PyTorch Machine Learning Dataloader Methods + (*Experimental Release*). + ([\#91](https://github.com/tensorwerk/hangar-py/pull/91)) lead: + [\@hhsecond](https://github.com/hhsecond), co-author: + [\@rlizzo](https://github.com/rlizzo), reviewed by: + [\@elistevens](https://github.com/elistevens) + +### Improvements + +- Record format versioning and standardization so to not break + backwards compatibility in the future. + ([\#70](https://github.com/tensorwerk/hangar-py/pull/70)) + [\@rlizzo](https://github.com/rlizzo) +- Backend addition and update developer protocols and documentation. + ([\#70](https://github.com/tensorwerk/hangar-py/pull/70)) + [\@rlizzo](https://github.com/rlizzo) +- Read-only checkout arrayset sample `get` methods now are multithread + and multiprocess safe. + ([\#84](https://github.com/tensorwerk/hangar-py/pull/84)) + [\@rlizzo](https://github.com/rlizzo) +- Read-only checkout metadata sample `get` methods are thread safe if + used within a context manager. + ([\#101](https://github.com/tensorwerk/hangar-py/pull/101)) + [\@rlizzo](https://github.com/rlizzo) +- Samples can be assigned integer names in addition to `string` names. + ([\#89](https://github.com/tensorwerk/hangar-py/pull/89)) + [\@rlizzo](https://github.com/rlizzo) +- Forgetting to close a `write-enabled` checkout before terminating + the python process will close the checkout automatically for many + situations. + ([\#101](https://github.com/tensorwerk/hangar-py/pull/101)) + [\@rlizzo](https://github.com/rlizzo) +- Repository software version compatability methods added to ensure + upgrade paths in the future. + ([\#101](https://github.com/tensorwerk/hangar-py/pull/101)) + [\@rlizzo](https://github.com/rlizzo) +- Many tests added (including support for Mac OSX on Travis-CI). lead: + [\@rlizzo](https://github.com/rlizzo), co-author: + [\@hhsecond](https://github.com/hhsecond) + +### Bug Fixes + +- Diff results for fast forward merges now returns sensible results. + ([\#77](https://github.com/tensorwerk/hangar-py/pull/77)) + [\@rlizzo](https://github.com/rlizzo) +- Many type annotations added, and developer documentation improved. + [\@hhsecond](https://github.com/hhsecond) & + [\@rlizzo](https://github.com/rlizzo) + +### Breaking changes + +- Renamed all references to `datasets` in the API / world-view to + `arraysets`. 
+- These are backwards incompatible changes. For all versions \> 0.2, + repository upgrade utilities will be provided if breaking changes + occur. + +[v0.1.1]() (2019-05-24) +----------------------- + +### Bug Fixes + +- Fixed typo in README which was uploaded to PyPi + +[v0.1.0]() (2019-05-24) +----------------------- + +### New Features + +- Remote client-server config negotiation and administrator + permissions. + ([\#10](https://github.com/tensorwerk/hangar-py/pull/10)) + [\@rlizzo](https://github.com/rlizzo) +- Allow single python process to access multiple repositories + simultaneously. + ([\#20](https://github.com/tensorwerk/hangar-py/pull/20)) + [\@rlizzo](https://github.com/rlizzo) +- Fast-Forward and 3-Way Merge and Diff methods now fully supported + and behaving as expected. + ([\#32](https://github.com/tensorwerk/hangar-py/pull/32)) + [\@rlizzo](https://github.com/rlizzo) + +### Improvements + +- Initial test-case specification. + ([\#14](https://github.com/tensorwerk/hangar-py/pull/14)) + [\@hhsecond](https://github.com/hhsecond) +- Checkout test-case work. + ([\#25](https://github.com/tensorwerk/hangar-py/pull/25)) + [\@hhsecond](https://github.com/hhsecond) +- Metadata test-case work. + ([\#27](https://github.com/tensorwerk/hangar-py/pull/27)) + [\@hhsecond](https://github.com/hhsecond) +- Any potential failure cases raise exceptions instead of silently + returning. ([\#16](https://github.com/tensorwerk/hangar-py/pull/16)) + [\@rlizzo](https://github.com/rlizzo) +- Many usability improvements in a variety of commits. + +### Bug Fixes + +- Ensure references to checkout arrayset or metadata objects cannot + operate after the checkout is closed. + ([\#41](https://github.com/tensorwerk/hangar-py/pull/41)) + [\@rlizzo](https://github.com/rlizzo) +- Sensible exception classes and error messages raised on a variety of + situations (Many commits). [\@hhsecond](https://github.com/hhsecond) + & [\@rlizzo](https://github.com/rlizzo) +- Many minor issues addressed. + +### API Additions + +- Refer to API documentation + ([\#23](https://github.com/tensorwerk/hangar-py/pull/23)) + +### Breaking changes + +- All repositories written with previous versions of Hangar are liable + to break when using this version. Please upgrade versions + immediately. + +[v0.0.0]() (2019-04-15) +----------------------- + +- First Public Release of Hangar! diff --git a/docs/changelog.rst b/docs/changelog.rst deleted file mode 100644 index 565b0521..00000000 --- a/docs/changelog.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../CHANGELOG.rst diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 00000000..617d036d --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,12 @@ +Hangar CLI Documentation +======================== + +The CLI described below is automatically available after the Hangar +Python package has been installed (either through a package manager or +via source builds). In general, the commands require the terminals `cwd` +to be at the same level the repository was initially created in. + +Simply start by typing `$ hangar --help` in your terminal to get +started! + +::: hangar.cli diff --git a/docs/cli.rst b/docs/cli.rst deleted file mode 100644 index 6419cb86..00000000 --- a/docs/cli.rst +++ /dev/null @@ -1,13 +0,0 @@ -Hangar CLI Documentation -======================== - -The CLI described below is automatically available after the Hangar Python -package has been installed (either through a package manager or via source -builds). 
In general, the commands require the terminals ``cwd`` to be at the -same level the repository was initially created in. - -Simply start by typing ``$ hangar --help`` in your terminal to get started! - -.. click:: hangar.cli:main - :prog: hangar - :show-nested: diff --git a/docs/codeofconduct.md b/docs/codeofconduct.md new file mode 100644 index 00000000..73a0ffba --- /dev/null +++ b/docs/codeofconduct.md @@ -0,0 +1,87 @@ +Contributor Code of Conduct +=========================== + +Our Pledge +---------- + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our +project and our community a harassment-free experience for everyone, +regardless of age, body size, disability, ethnicity, sex +characteristics, gender identity and expression, level of experience, +education, socio-economic status, nationality, personal appearance, +race, religion, or sexual identity and orientation. + +Our Standards +------------- + +Examples of behavior that contributes to creating a positive environment +include: + +- Using welcoming and inclusive language +- Being respectful of differing viewpoints and experiences +- Gracefully accepting constructive criticism +- Focusing on what is best for the community +- Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +- The use of sexualized language or imagery and unwelcome sexual + attention or advances +- Trolling, insulting/derogatory comments, and personal or political + attacks +- Public or private harassment +- Publishing others\' private information, such as a physical or + electronic address, without explicit permission +- Other conduct which could reasonably be considered inappropriate in + a professional setting + +Our Responsibilities +-------------------- + +Project maintainers are responsible for clarifying the standards of +acceptable behavior and are expected to take appropriate and fair +corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, +or reject comments, commits, code, wiki edits, issues, and other +contributions that are not aligned to this Code of Conduct, or to ban +temporarily or permanently any contributor for other behaviors that they +deem inappropriate, threatening, offensive, or harmful. + +Scope +----- + +This Code of Conduct applies both within project spaces and in public +spaces when an individual is representing the project or its community. +Examples of representing a project or community include using an +official project e-mail address, posting via an official social media +account, or acting as an appointed representative at an online or +offline event. Representation of a project may be further defined and +clarified by project maintainers. + +Enforcement +----------- + +Instances of abusive, harassing, or otherwise unacceptable behavior may +be reported by contacting the project team at +`hangar.info@tensorwerk.com`. All +complaints will be reviewed and investigated and will result in a +response that is deemed necessary and appropriate to the circumstances. +The project team is obligated to maintain confidentiality with regard to +the reporter of an incident. Further details of specific enforcement +policies may be posted separately. 
+ +Project maintainers who do not follow or enforce the Code of Conduct in +good faith may face temporary or permanent repercussions as determined +by other members of the project\'s leadership. + +Attribution +----------- + +This Code of Conduct is adapted from the [Contributor Covenant]() +homepage, version 1.4, available at + + +For answers to common questions about this code of conduct, see + diff --git a/docs/codeofconduct.rst b/docs/codeofconduct.rst deleted file mode 100644 index 734c9663..00000000 --- a/docs/codeofconduct.rst +++ /dev/null @@ -1,3 +0,0 @@ -.. _ref-code-of-conduct: - -.. include:: ../CODE_OF_CONDUCT.rst \ No newline at end of file diff --git a/docs/concepts.md b/docs/concepts.md new file mode 100644 index 00000000..b0ff3890 --- /dev/null +++ b/docs/concepts.md @@ -0,0 +1,582 @@ +Hangar Core Concepts +==================== + +!!! warning + + The usage info displayed in the `latest` build of the project + documentation do not reflect recent changes to the API and internal + structure of the project. They should not be relied on at the current + moment; they will be updated over the next weeks, and will be in line + before the next release. + +This document provides a high level overview of the problems Hangar is +designed to solve and introduces the core concepts for beginning to use +Hangar. + +What Is Hangar? +--------------- + +At its core Hangar is designed to solve many of the same problems faced +by traditional code version control system (ie. `Git`), just adapted for +numerical data: + +- Time travel through the historical evolution of a dataset +- Zero-cost Branching to enable exploratory analysis and collaboration +- Cheap Merging to build datasets over time (with multiple + collaborators) +- Completely abstracted organization and management of data files on + disk +- Ability to only retrieve a small portion of the data (as needed) + while still maintaining complete historical record +- Ability to push and pull changes directly to collaborators or a + central server (ie. a truly distributed version control system) + +The ability of version control systems to perform these tasks for +codebases is largely taken for granted by almost every developer today; +however, we are in-fact standing on the shoulders of giants, with +decades of engineering which has resulted in these phenomenally useful +tools. Now that a new era of \"Data-Defined software\" is taking hold, +we find there is a strong need for analogous version control systems +which are designed to handle numerical data at large scale\... Welcome +to Hangar! + +Inspiration +----------- + +The design of Hangar was heavily influenced by the +[Git](https://git-scm.org) source-code version control system. As a +Hangar user, many of the fundamental building blocks and commands can be +thought of as interchangeable: + +- checkout +- commit +- branch +- merge +- diff +- push +- pull/fetch +- log + +Emulating the high level the git syntax has allowed us to create a user +experience which should be familiar in many ways to Hangar users; a goal +of the project is to enable many of the same VCS workflows developers +use for code while working with their data! + +There are, however, many fundamental differences in how humans/programs +interpret and use text in source files vs. numerical data which raise +many questions Hangar needs to uniquely solve: + +- How do we connect some piece of \"Data\" with a meaning in the real + world? +- How do we diff and merge large collections of data samples? 
+- How can we resolve conflicts? +- How do we make data access (reading and writing) convenient for both + user-driven exploratory analyses and high performance production + systems operating without supervision? +- How can we enable people to work on huge datasets in a local (laptop + grade) development environment? + +We will show how Hangar solves these questions in a high-level guide +below. For a deep dive into the Hangar internals, we invite you to check +out the `ref-hangar-under-the-hood`{.interpreted-text role="ref"} page. + +How Hangar Thinks About Data +---------------------------- + +### Abstraction 0: What is a Repository? + +A \"Repository\" consists of an historically ordered mapping of +\"Commits\" over time by various \"Committers\" across any number of +\"Branches\". Though there are many conceptual similarities in what a +Git repo and a Hangar Repository achieve, Hangar is designed with the +express purpose of dealing with numeric data. As such, when you +read/write to/from a Repository, the main way of interaction with +information will be through (an arbitrary number of) Columns in each +Commit. A simple key/value store is also included to store metadata, but +as it is a minor point is will largely be ignored for the rest of this +post. + +History exists at the Repository level, Information exists at the Commit +level. + +### Abstraction 1: What is a Dataset? + +Let\'s get philosophical and talk about what a \"Dataset\" is. The word +\"Dataset\" invokes some meaning to humans; a dataset may have a +canonical name (like \"MNIST\" or \"CoCo\"), it will have a source where +it comes from, (ideally) it has a purpose for some real-world task, it +will have people who build, aggregate, and nurture it, and most +importantly a Dataset always contains pieces of some type of information +type which describes \"something\". + +It\'s an abstract definition, but it is only us, the humans behind the +machine, which associate \"Data\" with some meaning in the real world; +it is in the same vein which we associate a group of Data in a +\"Dataset\" with some real world meaning. + +Our first abstraction is therefore the \"Dataset\": a collection of +(potentially groups of) data pieces observing a common form among +instances which act to describe something meaningful. \*To describe some +phenomenon, a dataset may require multiple pieces of information, each +of a particular format, for each instance/sample recorded in the +dataset.\* + +> **For Example** +> +> a Hospital will typically have a *Dataset* containing all of the CT +> scans performed over some period of time. A single CT scan is an +> instance, a single sample; however, once many are grouped together +> they form a *Dataset*. To expand on this simple view we realize that +> each CT scan consists of hundreds of pieces of information: +> +> > - Some large `numeric array` (the image data). +> > - Some smaller `numeric tuples` (describing image spacing, +> > dimension scale, capture time, machine parameters, etc). +> > - Many pieces of `string` data (the patient name, doctor name, +> > scan type, results found, etc). + +When thinking about the group of CT scans in aggregate, we realize that +though a single scan contains many disparate pieces of information stuck +together, when thinking about the aggregation of every scan in the +group, most of (if not all) of the same information fields are +duplicated within each samples. 
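+
+To make that pattern concrete, the sketch below lays the CT-scan example out
+as one container per shared field (the structures named "columns" just
+below). Everything in it is illustrative: the field names, shapes, and sample
+key are placeholders, the volume shape is downsized from a realistic
+(512, 320, 320) scan, and the creation methods shown (`add_ndarray_column`,
+`add_str_column`) should be checked against the Python API reference. It
+assumes `co` is an already-open write-enabled checkout.
+
+```python
+import numpy as np
+
+# One container per field that every scan in the group shares.
+co.add_ndarray_column("scan_volume", prototype=np.zeros((64, 64, 64), dtype=np.float32))
+co.add_ndarray_column("voxel_spacing", prototype=np.zeros((3,), dtype=np.float64))
+co.add_str_column("scan_type")
+co.add_str_column("findings")
+
+# A shared sample key ties one study's pieces together across containers.
+co.columns["scan_volume"]["study_0001"] = np.zeros((64, 64, 64), dtype=np.float32)
+co.columns["voxel_spacing"]["study_0001"] = np.array([1.0, 0.75, 0.75])
+co.columns["scan_type"]["study_0001"] = "chest CT"
+co.commit("add first CT study")
+```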
+ +\*A single scan is a bunch of disparate information stuck together, many +of those put together makes a Dataset, but looking down from the top, we +identify pattern of common fields across all items. We call these +groupings of similar typed information:\* **Columns**. + +### Abstraction 2: What Makes up a Column? + +A `Dataset` is made of one or more `Columns` (and optionally some +`Metadata`), with each item placed in some `Column` belonging to and +making up an individual `Sample`. It is important to remember that all +data needed to fully describe a single `sample` in a `Dataset` may +consist of information spread across any number of `Columns`. To define +a `Column` in Hangar, we only need to provide: + +- a name +- a type +- a shape + +The individual pieces of information (`Data`) which fully describe some +phenomenon via an aggregate mapping access across any number of +\"Columns\" are both individually and collectively referred to as +`Samples` in the Hangar vernacular. According to the specification +above, all samples contained in a `Column` must be numeric arrays with +each having: + +1) Same data type (standard `numpy` data types are supported). +2) A shape with each dimension size \<= the shape (`max shape`) set in + the `column` specification (more on this later). + +Additionally, samples in a `column` can either be named, or unnamed +(depending on how you interpret what the information contained in the +`column` actually represents). + +Effective use of Hangar relies on having an understanding of what +exactly a `"Sample"` is in a particular `Column`. The most effective way +to find out is to ask: \"What is the smallest piece of data which has a +useful meaning to \'me\' (or \'my\' downstream processes\"). In the +MNIST `column`, this would be a single digit image (a 28x28 array); for +a medical `column` it might be an entire (512x320x320) MRI volume scan +for a particular patient; while for the NASDAQ Stock Ticker it might be +an hours worth of price data points (or less, or more!) The point is +that \*\*when you think about what a `sample` is, it should typically be +the smallest atomic unit of useful information.\*\* + +### Abstraction 3: What is Data? + +From this point forward, \*\*when we talk about \"Data\" we are actually +talking about n-dimensional arrays of numeric information. To Hangar, +\"Data\" is just a collection of numbers being passed into and out of +it.\*\* Data does not have a file type, it does not have a +file-extension, it does not mean anything to Hangar itself - it is just +numbers. This theory of \"Data\" is nearly as simple as it gets, and +this simplicity is what enables us to be unconstrained as we build +abstractions and utilities to operate on it. + +### Summary + +``` {.sourceCode .text} +A Dataset is thought of as containing Samples, but is actually defined by +Columns, which store parts of fully defined Samples in structures common +across the full aggregation of Dataset Samples. 
+ +This can essentially be represented as a key -> tensor mapping, which can +(optionally) be Sparse depending on usage patterns + + Dataset + | + ----------------------------------------- + | | | | + Column 1 Column 2 Column 3 Column 4 + | | | | +------------------------------------------------------ + image | filename | label | annotation | +------------------------------------------------------ + S1 | S1 | | S1 | + S2 | S2 | S2 | S2 | + S3 | S3 | S3 | | + S4 | S4 | | | + +More techincally, a Dataset is just a view over the columns that gives you +sample tuples based on the cross product of keys and columns. Hangar doesn't +store or track the data set, just the underlying columns. + + S1 = (image[S1], filename[S1], annotation[S1]) + S2 = (image[S2], filename[S2], label[S2], annotation[S2]) + S3 = (image[S3], filename[S3], label[S3]) + S4 = (image[S4], filename[S4]) +``` + +!!! note + + The technical crowd among the readers should note: + + - Hangar preserves all sample data bit-exactly. + - Dense arrays are fully supported, Sparse array support is + currently under development and will be released soon. + - Integrity checks are built in by default (explained in more detail + in `ref-hangar-under-the-hood`{.interpreted-text role="ref"}.) + using cryptographically secure algorithms. + - Hangar is very much a young project, until penetration tests and + security reviews are performed, we will refrain from stating that + Hangar is fully \"cryptographically secure\". Security experts are + welcome to contact us privately at [hangar.info\@tensorwerk.com + \]{.title-ref}\_\_ to disclose any + security issues. + + +Implications of the Hangar Data Philosophy +------------------------------------------ + +### The Domain-Specific File Format Problem + +Though it may seem counterintuitive at first, there is an incredible +amount of freedom (and power) that is gained when \"you\" (the user) +start to decouple some information container from the data which it +actually holds. At the end of the day, the algorithms and systems you +use to produce insight from data are just mathematical operations; math +does not operate on a specific file type, math operates on numbers. + +#### Human & Computational Cost + +It seems strange that organizations & projects commonly rely on storing +data on disk in some domain-specific - or custom built - binary format +(ie. a `.jpg` image, `.nii` neuroimaging informatics study, `.cvs` +tabular data, etc.), and just deal with the hassle of maintaining all +the infrastructure around reading, writing, transforming, and +preprocessing these files into useable numerical data every time they +want to interact with their Columns. Even disregarding the computational +cost/overhead of preprocessing & transforming the data on every +read/write, these schemes require significant amounts of human capital +(developer time) to be spent on building, testing, and +upkeep/maintenance; all while adding significant complexity for users. +Oh, and they also have a strangely high inclination to degenerate into +horrible complexity which essentially becomes \"magic\" after the +original creators move on. + +The Hangar system is quite different in this regards. First, \*\*we +trust that you know what your data is and what it should be best +represented as\*\*. When writing to a Hangar repository, you process the +data into n-dimensional arrays once. 
Then, when you retrieve it, you are
provided with the same array, in the same shape and datatype (unless you
ask for a particular subarray-slice), already initialized in memory and
ready to compute on instantly.

#### High Performance From Simplicity

Because Hangar is designed to deal (almost exclusively) with numerical
arrays, we are able to "stand on the shoulders of giants" once again
by utilizing many of the well validated, highly optimized numerical
array data management utilities developed by the High Performance
Computing community over the past few decades.

In a sense, the backend of Hangar serves two functions:

1) Bookkeeping: recording information about columns, samples,
    commits, etc.
2) Data Storage: highly optimized interfaces which store and retrieve
    data from disk through its backend utility.

The details are explained much more thoroughly in
[Hangar Under The Hood](design.md).

Because Hangar only considers data to be numbers, the choice of backend
to store data is (in a sense) completely arbitrary so long as
`Data In == Data Out`. **This fact has massive implications for the
system**; instead of being tied to a single backend (each of which will
have significant performance tradeoffs for arrays of particular
datatypes, shapes, and access patterns), we simultaneously store
different data pieces in the backend which is most suited to it. A great
deal of care has been taken to optimize parameters in the backend
interface which affect performance and compression of data samples.

The choice of backend to store a piece of data is selected automatically
from heuristics based on the column specification, system details, and
context of the storage service internal to Hangar. **As a user, this
is completely transparent to you** in all steps of interacting with
the repository. It does not require (or even accept) user-specified
configuration.

At the time of writing, Hangar has the following backends implemented
(with plans to potentially support more as needs arise):

1) [HDF5](https://www.hdfgroup.org/solutions/hdf5/)
2) [Memmapped
    Arrays](https://docs.scipy.org/doc/numpy/reference/generated/numpy.memmap.html)
3) [TileDb](https://tiledb.io/) (in development)
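Whichever of these backends ends up holding a particular piece of data, the
only user-visible contract is `Data In == Data Out`. A minimal round-trip
check of that contract might look like the sketch below; the repository path,
column name, and the `add_ndarray_column()` / `checkout()` calls reflect the
0.5-era Python API as best understood, so treat this as an illustrative
sketch rather than canonical usage:

```python
import numpy as np
from hangar import Repository

repo = Repository(path='/path/to/existing/repo')

wco = repo.checkout(write=True)
col = wco.add_ndarray_column('measurements', shape=(1000,), dtype=np.float64)

original = np.random.default_rng(1).standard_normal(1000)
col['t0'] = original                  # Hangar picks the storage backend heuristically
wco.commit('add first measurement trace')
wco.close()

rco = repo.checkout()                 # read-only checkout of the new HEAD commit
restored = rco.columns['measurements']['t0']
assert np.array_equal(original, restored)   # bit-exact, whichever backend stored it
rco.close()
```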
### Open Source Software Style Collaboration in Dataset Curation

#### Specialized Domain Knowledge is a Scarce Resource

A common side effect of [The Domain-Specific File Format
Problem](#the-domain-specific-file-format-problem) is that anyone who
wants to work with an organization's/project's data needs to not only
have some domain expertise (so they can do useful things with the data),
but they also need to have a non-trivial understanding of the project's
dataset, file format, and access conventions / transformation pipelines.
*In a world where highly specialized talent is already scarce, this
phenomenon shrinks the pool of available collaborators dramatically.*

Given this situation, it's understandable why, when most organizations
spend massive amounts of money and time to build a team, collect &
annotate data, and build an infrastructure around that information, they
hold it for their private use with little regard for how the world
could use it together. Businesses rely on proprietary information to
stay ahead of their competitors, and because this information is so
difficult (and expensive) to generate, it's completely reasonable that
they should be the ones to benefit from all that work.

> **A Thought Experiment**
>
> Imagine that `Git` and `GitHub` didn't take over the world. Imagine
> that the `Diff` and `Patch` Unix tools never existed. Instead, imagine
> we were to live in a world where every software project had a very
> different version control system (largely homemade by non-VCS experts,
> & not validated by a community over many years of use). Even worse,
> most of these tools don't allow users to easily branch, make changes,
> and automatically merge them back. It shouldn't be difficult to
> imagine how dramatically such a world would contrast to ours today.
> Open source software as we know it would hardly exist, and any efforts
> would probably be massively fragmented across the web (if there would
> even be a 'web' that we would recognize in this strange world).
>
> Without a way to collaborate in the open, open source software would
> largely not exist, and we would all be worse off for it.
>
> Doesn't this hypothetical sound quite a bit like the state of open
> source data collaboration in today's world?

The impetus for developing a tool like Hangar is the belief that *if it
is simple for anyone with domain knowledge to collaboratively curate
columns containing information they care about, then they will.* Open
source software development benefits everyone, and we believe open source
column curation can do the same.

#### How To Overcome The "Size" Problem

Even if the greatest tool imaginable existed to version, branch, and
merge columns, it would face one massive problem which, left unsolved,
would kill the project: *the size of data can very easily exceed
what can fit on (most) contributors' laptops or personal workstations*.
This section explains how Hangar can handle working with columns which
are prohibitively large to download or store on a single machine.

As mentioned in [High Performance From
Simplicity](#high-performance-from-simplicity), under the hood Hangar
deals with "Data" and "Bookkeeping" completely separately. We've
previously covered what exactly we mean by Data in [How Hangar Thinks
About Data](#how-hangar-thinks-about-data), so we'll briefly cover the
second major component of Hangar here. In short, "Bookkeeping"
describes everything about the repository. By everything, we do mean
that the Bookkeeping records describe everything: all commits, parents,
branches, columns, samples, data descriptors, schemas, commit messages,
etc. Though complete, these records are fairly small (tens of MB in size
for decently sized repositories with decent history), and are highly
compressed for fast transfer between a Hangar client/server.

> **A brief technical interlude**
>
> There is one very important (and rather complex) property which gives
> Hangar Bookkeeping massive power: **Existence of some data piece is
> always known to Hangar and stored immutably once committed. However,
> the access pattern, backend, and locating information for this data
> piece may (and over time, will) be unique in every Hangar repository
> instance**.
>
> Though the details of how this works are well beyond the scope of this
> document, the following example may provide some insight into the
> implications of this property:
>
> > If you `clone` some hangar repository, Bookkeeping says that "some
> > number of data pieces exist" and they should be retrieved from the
> > server. However, the bookkeeping records transferred in a `fetch` /
> > `push` / `clone` operation do not include information about where
> > that piece of data existed on the client (or server) computer. Two
> > synced repositories can use completely different backends to store
> > the data, in completely different locations, and it does not matter -
> > Hangar only guarantees that when collaborators ask for a data sample
> > in some checkout, they will be provided with identical arrays, not
> > that the arrays will come from the same place or be stored in the
> > same way. Only when data is actually retrieved is the "locating
> > information" set for that repository instance.

Because Hangar makes no assumptions about how/where it should retrieve
some piece of data, or even an assumption that it exists on the local
machine, and because records are small and completely describe history,
once a machine has the Bookkeeping, it can decide what data it actually
wants to materialize on its local disk! These `partial fetch` /
`partial clone` operations can materialize any desired data, whether it
be for a few records at the head branch, for all data in a commit, or
for the entire data history. A future release will even include the
ability to stream data directly to a Hangar checkout and materialize the
data in memory without having to save it to disk at all!

More importantly: **since Bookkeeping describes all history, merging
can be performed between branches which may contain partial (or even no)
actual data.** In other words, **you don't need data on disk to merge
changes into it.** It's an odd concept which will be explained more in
depth in the future.

!!! note

    To try this out for yourself, please refer to the [API docs](api.md)
    on working with Remotes, especially the `fetch()` and `fetch-data()`
    methods. Otherwise, look through our tutorials & examples for more
    practical info!

#### What Does it Mean to "Merge" Data?

We'll start this section, once again, with a comparison to source code
version control systems. When dealing with source code text, merging is
performed in order to take a set of changes made to a document, and
logically insert the changes into some other version of the document.
The goal is to generate a new version of the document with all changes
made to it in a fashion which conforms to the "change author's"
intentions. Simply put: the new version is valid and is what is expected
by the authors.

This concept of what it means to merge text does not generally map well
to changes made in a column. We'll explore why throughout this section,
but look back to the philosophy of Data outlined in [How Hangar Thinks
About Data](#how-hangar-thinks-about-data) for inspiration as we begin.
Remember, in the Hangar design a Sample is the smallest array which
contains useful information. As any smaller selection of the sample
array is meaningless, Hangar does not support subarray-slicing or
per-index updates *when writing* data. (Subarray-slice queries are
permitted for read operations, though regular use is discouraged and may
indicate that your samples are larger than they should be.)
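In practice this means updates happen at whole-sample granularity: read the
sample, modify the in-memory array, and assign the complete array back under
the same key. The sketch below shows that pattern; the `scans` column, sample
key, and the `checkout()` / `columns` accessors are assumptions based on the
0.5-era Python API, not canonical usage:

```python
import numpy as np
from hangar import Repository

repo = Repository(path='/path/to/existing/repo')
co = repo.checkout(write=True)
scans = co.columns['scans']            # an existing ndarray column

# Read the whole sample; copy it into a writeable in-memory array.
vol = np.copy(scans['patient_001'])
vol[0:10, :, :] = 0                    # slicing the in-memory copy is fine

# Writing is whole-sample only: assign the complete array back to the key.
# This is recorded as a single mutation of 'patient_001'; there is no
# supported way to write a sub-slice of a stored sample in place.
scans['patient_001'] = vol

co.commit('zero out the first ten slices of patient_001')
co.close()
```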
##### Diffing Hangar Checkouts

To understand merge logic, we first need to understand diffing, and the
types of operations which can occur.

Addition
:   An operation which creates a column, sample, or some metadata which
    did not previously exist in the relevant branch history.

Removal
:   An operation which removes some column, a sample, or some metadata
    which existed in the parent of the commit under consideration. (Note:
    removing a column also removes all samples contained in it.)

Mutation
:   An operation which sets a sample's data, the value of some metadata
    key, or a column schema to a different value than what it had
    previously been created with. (Note: a column schema mutation is
    observed when a column is removed and a new column with the same name
    is created with a different dtype/shape, all in the same commit.)

##### Merging Changes

Merging diffs solely consisting of additions and removals between
branches is trivial, and performs exactly as one would expect from a
text diff. Where things diverge from text is when we consider how we
will merge diffs containing mutations.

Say we have some sample in commit A, a branch is created, the sample is
updated, and commit C is created. At the same time, someone else checks
out a branch whose HEAD is at commit A, and commits a change to the
sample as well. If these changes are identical, they are compatible, but
what if they are not? In the following example, we diff and merge each
element of the sample array like we would text:

                Merge ??
    commit A                commit B             Does combining mean anything?

    [[0, 1, 2],            [[0, 1, 2],                  [[1, 1, 1],
     [0, 1, 2],   ----->    [2, 2, 2],   ------------>   [2, 2, 2],
     [0, 1, 2]]             [3, 3, 3]]  /                 [3, 3, 3]]
           \                           /
            \        commit C         /
             \                       /
              \     [[1, 1, 1],     /
               -------> [0, 1, 2],
                        [0, 1, 2]]

We see that a result can be generated, and we can agree that if this were
a piece of text, the result would be correct. Don't be fooled; this is an
abomination and utterly wrong/meaningless. Remember we said earlier
`"the result of a merge should conform to the intentions of each author"`.
This merge result conforms to neither author's intention. The value of
an array element is not isolated; every value affects how the entire
sample is understood. The values at commit B or commit C may be fine on
their own, but if two samples are mutated independently with
non-identical updates, it is a conflict that needs to be handled by the
authors.

This is the actual behavior of Hangar:

    commit A                commit B

    [[0, 1, 2],            [[0, 1, 2],
     [0, 1, 2],   ----->    [2, 2, 2],   ----- MERGE CONFLICT
     [0, 1, 2]]             [3, 3, 3]]  /
           \                           /
            \        commit C         /
             \                       /
              \     [[1, 1, 1],     /
               -------> [0, 1, 2],
                        [0, 1, 2]]

When a conflict is detected, the merge author must either pick a sample
from one of the commits or make changes in one of the branches such that
the conflicting sample values are resolved.

##### How Are Conflicts Detected?

Any merge conflicts can be identified and addressed ahead of running a
`merge` command by using the built-in `diff` tools. When diffing
commits, Hangar will provide a list of conflicts which it identifies. In
general these fall into 4 categories:

1) **Additions** in both branches which created new keys (samples /
    columns / metadata) with non-compatible values.
For samples & + metadata, the hash of the data is compared, for columns, the schema + specification is checked for compatibility in a method custom to the + internal workings of Hangar. +2) **Removal** in `Master Commit / Branch` **& Mutation** in + `Dev Commit / Branch`. Applies for samples, columns, and metadata + identically. +3) **Mutation** in `Dev Commit / Branch` **& Removal** in + `Master Commit / Branch`. Applies for samples, columns, and metadata + identically. +4) **Mutations** on keys both branches to non-compatible values. For + samples & metadata, the hash of the data is compared, for columns, + the schema specification is checked for compatibility in a method + custom to the internal workings of Hangar. + +What\'s Next? +------------- + +- Get started using Hangar today: `ref_installation`{.interpreted-text + role="ref"}. +- Read the tutorials: `ref-tutorial`{.interpreted-text role="ref"}. +- Dive into the details: `ref-hangar-under-the-hood`{.interpreted-text + role="ref"}. diff --git a/docs/concepts.rst b/docs/concepts.rst deleted file mode 100644 index c6bc5516..00000000 --- a/docs/concepts.rst +++ /dev/null @@ -1,581 +0,0 @@ -.. _ref-concepts: - -#################### -Hangar Core Concepts -#################### - -.. warning:: - - The usage info displayed in the ``latest`` build of the project - documentation do not reflect recent changes to the API and internal - structure of the project. They should not be relied on at the current - moment; they will be updated over the next weeks, and will be in line before - the next release. - -This document provides a high level overview of the problems Hangar is designed -to solve and introduces the core concepts for beginning to use Hangar. - -*************** -What Is Hangar? -*************** - -At its core Hangar is designed to solve many of the same problems faced by -traditional code version control system (ie. ``Git``), just adapted for -numerical data: - -* Time travel through the historical evolution of a dataset -* Zero-cost Branching to enable exploratory analysis and collaboration -* Cheap Merging to build datasets over time (with multiple collaborators) -* Completely abstracted organization and management of data files on disk -* Ability to only retrieve a small portion of the data (as needed) while still - maintaining complete historical record -* Ability to push and pull changes directly to collaborators or a central - server (ie. a truly distributed version control system) - -The ability of version control systems to perform these tasks for codebases is -largely taken for granted by almost every developer today; however, we are -in-fact standing on the shoulders of giants, with decades of engineering which -has resulted in these phenomenally useful tools. Now that a new era of -"Data-Defined software" is taking hold, we find there is a strong need for -analogous version control systems which are designed to handle numerical data -at large scale... Welcome to Hangar! - -*********** -Inspiration -*********** - -The design of Hangar was heavily influenced by the `Git `_ -source-code version control system. 
As a Hangar user, many of the fundamental -building blocks and commands can be thought of as interchangeable: - -* checkout -* commit -* branch -* merge -* diff -* push -* pull/fetch -* log - -Emulating the high level the git syntax has allowed us to create a user -experience which should be familiar in many ways to Hangar users; a goal of the -project is to enable many of the same VCS workflows developers use for code -while working with their data! - -There are, however, many fundamental differences in how humans/programs -interpret and use text in source files vs. numerical data which raise many -questions Hangar needs to uniquely solve: - -* How do we connect some piece of "Data" with a meaning in the real world? -* How do we diff and merge large collections of data samples? -* How can we resolve conflicts? -* How do we make data access (reading and writing) convenient for both - user-driven exploratory analyses and high performance production systems - operating without supervision? -* How can we enable people to work on huge datasets in a local (laptop grade) - development environment? - -We will show how Hangar solves these questions in a high-level guide below. -For a deep dive into the Hangar internals, we invite you to check out the -:ref:`ref-hangar-under-the-hood` page. - -**************************** -How Hangar Thinks About Data -**************************** - -Abstraction 0: What is a Repository? -==================================== - -A "Repository" consists of an historically ordered mapping of "Commits" over -time by various "Committers" across any number of "Branches". Though there are -many conceptual similarities in what a Git repo and a Hangar Repository -achieve, Hangar is designed with the express purpose of dealing with numeric -data. As such, when you read/write to/from a Repository, the main way of -interaction with information will be through (an arbitrary number of) Columns -in each Commit. A simple key/value store is also included to store metadata, -but as it is a minor point is will largely be ignored for the rest of this -post. - -History exists at the Repository level, Information exists at the Commit level. - -Abstraction 1: What is a Dataset? -================================= - -Let's get philosophical and talk about what a "Dataset" is. The word "Dataset" -invokes some meaning to humans; a dataset may have a canonical name (like -"MNIST" or "CoCo"), it will have a source where it comes from, (ideally) it has -a purpose for some real-world task, it will have people who build, aggregate, -and nurture it, and most importantly a Dataset always contains pieces of some -type of information type which describes "something". - -It's an abstract definition, but it is only us, the humans behind the machine, -which associate "Data" with some meaning in the real world; it is in the same -vein which we associate a group of Data in a "Dataset" with some real world -meaning. - -Our first abstraction is therefore the "Dataset": a collection of (potentially -groups of) data pieces observing a common form among instances which act to -describe something meaningful. *To describe some phenomenon, a dataset may -require multiple pieces of information, each of a particular format, for each -instance/sample recorded in the dataset.* - - **For Example** - - a Hospital will typically have a *Dataset* containing all of the CT scans - performed over some period of time. 
A single CT scan is an instance, a - single sample; however, once many are grouped together they form a - *Dataset*. To expand on this simple view we realize that each CT scan - consists of hundreds of pieces of information: - - * Some large ``numeric array`` (the image data). - * Some smaller ``numeric tuples`` (describing image spacing, dimension - scale, capture time, machine parameters, etc). - * Many pieces of ``string`` data (the patient name, doctor name, scan - type, results found, etc). - -When thinking about the group of CT scans in aggregate, we realize that -though a single scan contains many disparate pieces of information stuck -together, when thinking about the aggregation of every scan in the group, -most of (if not all) of the same information fields are duplicated within -each samples. - -*A single scan is a bunch of disparate information stuck together, many of -those put together makes a Dataset, but looking down from the top, we identify -pattern of common fields across all items. We call these groupings of similar -typed information:* **Columns**. - -Abstraction 2: What Makes up a Column? -====================================== - -A ``Dataset`` is made of one or more ``Columns`` (and optionally some -``Metadata``), with each item placed in some ``Column`` belonging to and -making up an individual ``Sample``. It is important to remember that all data -needed to fully describe a single ``sample`` in a ``Dataset`` may consist of -information spread across any number of ``Columns``. To define a ``Column`` -in Hangar, we only need to provide: - -* a name -* a type -* a shape - -The individual pieces of information (``Data``) which fully describe some -phenomenon via an aggregate mapping access across any number of "Columns" are -both individually and collectively referred to as ``Samples`` in the Hangar -vernacular. According to the specification above, all samples contained in a -``Column`` must be numeric arrays with each having: - -1) Same data type (standard ``numpy`` data types are supported). -2) A shape with each dimension size <= the shape (``max shape``) set in the - ``column`` specification (more on this later). - -Additionally, samples in a ``column`` can either be named, or unnamed -(depending on how you interpret what the information contained in the -``column`` actually represents). - - - -Effective use of Hangar relies on having an understanding of what exactly a -``"Sample"`` is in a particular ``Column``. The most effective way to find -out is to ask: "What is the smallest piece of data which has a useful meaning -to 'me' (or 'my' downstream processes"). In the MNIST ``column``, this would -be a single digit image (a 28x28 array); for a medical ``column`` it might be -an entire (512x320x320) MRI volume scan for a particular patient; while for the -NASDAQ Stock Ticker it might be an hours worth of price data points (or less, -or more!) The point is that **when you think about what a ``sample`` is, it -should typically be the smallest atomic unit of useful information.** - -Abstraction 3: What is Data? -============================ - -From this point forward, **when we talk about "Data" we are actually talking -about n-dimensional arrays of numeric information. To Hangar, "Data" is just a -collection of numbers being passed into and out of it.** Data does not have a -file type, it does not have a file-extension, it does not mean anything to -Hangar itself - it is just numbers. 
This theory of "Data" is nearly as simple -as it gets, and this simplicity is what enables us to be unconstrained as we -build abstractions and utilities to operate on it. - -Summary -======= - -.. code-block:: text - - A Dataset is thought of as containing Samples, but is actually defined by - Columns, which store parts of fully defined Samples in structures common - across the full aggregation of Dataset Samples. - - This can essentially be represented as a key -> tensor mapping, which can - (optionally) be Sparse depending on usage patterns - - Dataset - | - ----------------------------------------- - | | | | - Column 1 Column 2 Column 3 Column 4 - | | | | - ------------------------------------------------------ - image | filename | label | annotation | - ------------------------------------------------------ - S1 | S1 | | S1 | - S2 | S2 | S2 | S2 | - S3 | S3 | S3 | | - S4 | S4 | | | - - More techincally, a Dataset is just a view over the columns that gives you - sample tuples based on the cross product of keys and columns. Hangar doesn't - store or track the data set, just the underlying columns. - - S1 = (image[S1], filename[S1], annotation[S1]) - S2 = (image[S2], filename[S2], label[S2], annotation[S2]) - S3 = (image[S3], filename[S3], label[S3]) - S4 = (image[S4], filename[S4]) - - -.. note:: - - The technical crowd among the readers should note: - - * Hangar preserves all sample data bit-exactly. - * Dense arrays are fully supported, Sparse array support is currently - under development and will be released soon. - * Integrity checks are built in by default (explained in more detail in - :ref:`ref-hangar-under-the-hood`.) using cryptographically secure - algorithms. - * Hangar is very much a young project, until penetration tests and - security reviews are performed, we will refrain from stating that Hangar - is fully "cryptographically secure". Security experts are welcome to - contact us privately at `hangar.info@tensorwerk.com - `__ to disclose any security issues. - - -****************************************** -Implications of the Hangar Data Philosophy -****************************************** - -The Domain-Specific File Format Problem -======================================= - -Though it may seem counterintuitive at first, there is an incredible -amount of freedom (and power) that is gained when "you" (the user) start to -decouple some information container from the data which it actually holds. At -the end of the day, the algorithms and systems you use to produce insight from -data are just mathematical operations; math does not operate on a specific file -type, math operates on numbers. - -Human & Computational Cost --------------------------- - -It seems strange that organizations & projects commonly rely on storing data on -disk in some domain-specific - or custom built - binary format (ie. a ``.jpg`` -image, ``.nii`` neuroimaging informatics study, ``.cvs`` tabular data, etc.), -and just deal with the hassle of maintaining all the infrastructure around -reading, writing, transforming, and preprocessing these files into useable -numerical data every time they want to interact with their Columns. Even -disregarding the computational cost/overhead of preprocessing & transforming -the data on every read/write, these schemes require significant amounts of -human capital (developer time) to be spent on building, testing, and -upkeep/maintenance; all while adding significant complexity for users. 
Oh, and -they also have a strangely high inclination to degenerate into horrible -complexity which essentially becomes "magic" after the original creators move -on. - -The Hangar system is quite different in this regards. First, **we trust that -you know what your data is and what it should be best represented as**. When -writing to a Hangar repository, you process the data into n-dimensional arrays -once. Then when you retrieve it you are provided with the same array, in the -same shape and datatype (unless you ask for a particular subarray-slice), -already initialized in memory and ready to compute on instantly. - -High Performance From Simplicity --------------------------------- - -Because Hangar is designed to deal (almost exclusively) with numerical arrays, -we are able to "stand on the shoulders of giants" once again by utilizing many -of the well validated, highly optimized, and community validated numerical -array data management utilities developed by the High Performance Computing -community over the past few decades. - -In a sense, the backend of Hangar serves two functions: - -1) Bookkeeping: recording information about about columns, samples, commits, - etc. -2) Data Storage: highly optimized interfaces which store and retrieve data from - from disk through its backend utility. - -The details are explained much more thoroughly in -:ref:`ref-hangar-under-the-hood`. - -Because Hangar only considers data to be numbers, the choice of backend to -store data is (in a sense) completely arbitrary so long as ``Data In == Data -Out``. **This fact has massive implications for the system**; instead of being -tied to a single backend (each of which will have significant performance -tradeoffs for arrays of particular datatypes, shapes, and access patterns), we -simultaneously store different data pieces in the backend which is most suited -to it. A great deal of care has been taken to optimize parameters in the -backend interface which affects performance and compression of data samples. - -The choice of backend to store a piece of data is selected automatically from -heuristics based on the column specification, system details, and context of -the storage service internal to Hangar. **As a user, this is completely -transparent to you** in all steps of interacting with the repository. It does -not require (or even accept) user specified configuration. - -At the time of writing, Hangar has the following backends implemented (with -plans to potentially support more as needs arise): - -1) `HDF5 `_ -2) `Memmapped Arrays `_ -3) `TileDb `_ (in development) - - -Open Source Software Style Collaboration in Dataset Curation -============================================================= - -Specialized Domain Knowledge is A Scarce Resource -------------------------------------------------- - -A common side effect of the `The Domain-Specific File Format Problem`_ is that -anyone who wants to work with an organization's/project's data needs to not -only have some domain expertise (so they can do useful things with the data), -but they also need to have a non-trivial understanding of the projects -dataset, file format, and access conventions / transformation pipelines. 
*In a -world where highly specialized talent is already scarce, this phenomenon -shrinks the pool of available collaborators dramatically.* - -Given this situation, it's understandable why when most organizations spend -massive amounts of money and time to build a team, collect & annotate data, and -build an infrastructure around that information, they hold it for their private -use with little regards for how the world could use it together. Businesses -rely on proprietary information to stay ahead of their competitors, and because -this information is so difficult (and expensive) to generate, it's completely -reasonable that they should be the ones to benefit from all that work. - - **A Thought Experiment** - - Imagine that ``Git`` and ``GitHub`` didn't take over the world. Imagine - that the ``Diff`` and ``Patch`` Unix tools never existed. Instead, imagine - we were to live in a world where every software project had very different - version control systems (largely homeade by non VCS experts, & not - validated by a community over many years of use). Even worse, most of these - tools don't allow users to easily branch, make changes, and automatically - merge them back. It shouldn't be difficult to imagine how dramatically such - a world would contrast to ours today. Open source software as we know it - would hardly exist, and any efforts would probably be massively fragmented - across the web (if there would even be a 'web' that we would recognize in - this strange world). - - Without a way to collaborate in the open, open source software would - largely not exist, and we would all be worse off for it. - - Doesn't this hypothetical sound quite a bit like the state of open source - data collaboration in todays world? - -The impetus for developing a tool like Hangar is the belief that if it is -simple for anyone with domain knowledge to collaboratively curate columns -containing information they care about, then they will.* Open source software -development benefits everyone, we believe open source column curation can do -the same. - -How To Overcome The "Size" Problem ----------------------------------- - -Even if the greatest tool imaginable existed to version, branch, and merge -columns, it would face one massive problem which if it didn't solve would -kill the project: *The size of data can very easily exceeds what can fit on -(most) contributors laptops or personal workstations*. This section explains -how Hangar can handle working with columns which are prohibitively large to -download or store on a single machine. - -As mentioned in `High Performance From Simplicity`_, under the hood Hangar -deals with "Data" and "Bookkeeping" completely separately. We've previously -covered what exactly we mean by Data in `How Hangar Thinks About Data`_, so -we'll briefly cover the second major component of Hangar here. In short -"Bookkeeping" describes everything about the repository. By everything, we do -mean that the Bookkeeping records describe everything: all commits, parents, -branches, columns, samples, data descriptors, schemas, commit message, etc. -Though complete, these records are fairly small (tens of MB in size for -decently sized repositories with decent history), and are highly compressed for -fast transfer between a Hangar client/server. - - **A brief technical interlude** - - There is one very important (and rather complex) property which gives - Hangar Bookeeping massive power: **Existence of some data piece is always - known to Hangar and stored immutably once committed. 
However, the access - pattern, backend, and locating information for this data piece may (and - over time, will) be unique in every hangar repository instance**. - - Though the details of how this works is well beyond the scope of this - document, the following example may provide some insight into the - implications of this property: - - If you ``clone`` some hangar repository, Bookeeping says that "some - number of data pieces exist" and they should retrieved from the server. - However, the bookeeping records transfered in a ``fetch`` / ``push`` / - ``clone`` operation do not include information about where that piece - of data existed on the client (or server) computer. Two synced - repositories can use completely different backends to store the data, in - completly different locations, and it does not matter - Hangar only - guarantees that when collaborators ask for a data sample in some - checkout, that they will be provided with identical arrays, not that - they will come from the same place or be stored in the same way. Only - when data is actually retrieved the "locating information" is set for - that repository instance. - -Because Hangar makes no assumptions about how/where it should retrieve some -piece of data, or even an assumption that it exists on the local machine, and -because records are small and completely describe history, once a machine has -the Bookkeeping, it can decide what data it actually wants to materialize on -it's local disk! These ``partial fetch`` / ``partial clone`` operations can -materialize any desired data, whether it be for a few records at the head -branch, for all data in a commit, or for the entire historical data. A future -release will even include the ability to stream data directly to a Hangar -checkout and materialize the data in memory without having to save it to disk -at all! - -More importantly: **Since Bookkeeping describes all history, merging can be -performed between branches which may contain partial (or even no) actual -data.** Aka **you don't need data on disk to merge changes into it.** It's an odd -concept which will be explained more in depth in the future. - -..note :: - - To try this out for yourself, please refer to the the API Docs - (:ref:`ref-api`) on working with Remotes, especially the ``fetch()`` and - ``fetch-data()`` methods. Otherwise look for through our tutorials & - examples for more practical info! - -What Does it Mean to "Merge" Data? ----------------------------------- - -We'll start this section, once again, with a comparison to source code version -control systems. When dealing with source code text, merging is performed in -order to take a set of changes made to a document, and logically insert the -changes into some other version of the document. The goal is to generate a new -version of the document with all changes made to it in a fashion which conforms -to the "change author's" intentions. Simply put: the new version is valid and -what is expected by the authors. - -This concept of what it means to merge text does not generally map well to -changes made in a column we'll explore why through this section, but look -back to the philosophy of Data outlined in `How Hangar Thinks About Data`_ for -inspiration as we begin. Remember, in the Hangar design a Sample is the -smallest array which contains useful information. As any smaller selection of -the sample array is meaningless, Hangar does not support subarray-slicing or -per-index updates *when writing* data. 
(subarray-slice queries are permitted -for read operations, though regular use is discouraged and may indicate that -your samples are larger than they should be). - -Diffing Hangar Checkouts -^^^^^^^^^^^^^^^^^^^^^^^^ - -To understand merge logic, we first need to understand diffing, and the actors -operations which can occur. - -:Addition: - - An operation which creates a column, sample, or some metadata which - did not previously exist in the relevant branch history. - -:Removal: - - An operation which removes some column, a sample, or some metadata which - existed in the parent of the commit under consideration. (Note: removing a - column also removes all samples contained in it). - -:Mutation: - - An operation which sets: data to a sample, the value of some metadata key, - or a column schema, to a different value than what it had previously been - created with (Note: a column schema mutation is observed when a column - is removed, and a new column with the same name is created with a - different dtype/shape, all in the same commit). - -Merging Changes -^^^^^^^^^^^^^^^ - -Merging diffs solely consisting of additions and removals between branches is -trivial, and performs exactly as one would expect from a text diff. Where -things diverge from text is when we consider how we will merge diffs containing -mutations. - -Say we have some sample in commit A, a branch is created, the sample is -updated, and commit C is created. At the same time, someone else checks out -branch whose HEAD is at commit A, and commits a change to the sample as well. -If these changes are identical, they are compatible, but what if they are not? -In the following example, we diff and merge each element of the sample array -like we would text: - -:: - - Merge ?? - commit A commit B Does combining mean anything? - - [[0, 1, 2], [[0, 1, 2], [[1, 1, 1], - [0, 1, 2], -----> [2, 2, 2], ------------> [2, 2, 2], - [0, 1, 2]] [3, 3, 3]] / [3, 3, 3]] - \ / - \ commit C / - \ / - \ [[1, 1, 1], / - -------> [0, 1, 2], - [0, 1, 2]] - -We see that a result can be generated, and can agree if this was a piece of -text, the result would be correct. Don't be fooled, this is an abomination and -utterly wrong/meaningless. Remember we said earlier ``"the result of a merge -should conform to the intentions of each author"``. This merge result conforms -to neither author's intention. The value of an array element is not isolated, -every value affects how the entire sample is understood. The values at Commit B -or commit C may be fine on their own, but if two samples are mutated -independently with non-identical updates, it is a conflict that needs to be -handled by the authors. - -This is the actual behavior of Hangar. - -:: - - commit A commit B - - [[0, 1, 2], [[0, 1, 2], - [0, 1, 2], -----> [2, 2, 2], ----- MERGE CONFLICT - [0, 1, 2]] [3, 3, 3]] / - \ / - \ commit C / - \ / - \ [[1, 1, 1], / - -------> [0, 1, 2], - [0, 1, 2]] - -When a conflict is detected, the merge author must either pick a sample from -one of the commits or make changes in one of the branches such that the -conflicting sample values are resolved. - -How Are Conflicts Detected? -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Any merge conflicts can be identified and addressed ahead of running a -``merge`` command by using the built in ``diff`` tools. When diffing commits, -Hangar will provide a list of conflicts which it identifies. 
In general these -fall into 4 categories: - -1) **Additions** in both branches which created new keys (samples / columns / - metadata) with non-compatible values. For samples & metadata, the hash of - the data is compared, for columns, the schema specification is checked for - compatibility in a method custom to the internal workings of Hangar. -2) **Removal** in ``Master Commit / Branch`` **& Mutation** in ``Dev Commit / - Branch``. Applies for samples, columns, and metadata identically. -3) **Mutation** in ``Dev Commit / Branch`` **& Removal** in ``Master Commit / - Branch``. Applies for samples, columns, and metadata identically. -4) **Mutations** on keys both branches to non-compatible values. For samples & - metadata, the hash of the data is compared, for columns, the schema - specification is checked for compatibility in a method custom to the - internal workings of Hangar. - -************ -What's Next? -************ - -* Get started using Hangar today: :ref:`ref_installation`. -* Read the tutorials: :ref:`ref-tutorial`. -* Dive into the details: :ref:`ref-hangar-under-the-hood`. diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 52a545c9..00000000 --- a/docs/conf.py +++ /dev/null @@ -1,126 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -import os - - -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.coverage', - 'sphinx.ext.doctest', - 'sphinx.ext.extlinks', - 'sphinx.ext.ifconfig', - 'sphinx.ext.napoleon', - 'sphinx.ext.todo', - 'sphinx.ext.intersphinx', - 'sphinx_click.ext', - 'nbsphinx', - 'sphinx_copybutton', - 'sphinx.ext.mathjax', - 'recommonmark', - 'IPython.sphinxext.ipython_console_highlighting', -] - - -if os.getenv('SPELLCHECK'): - extensions += 'sphinxcontrib.spelling', - spelling_show_suggestions = True - spelling_lang = 'en_US' - -# to exclude traditional Python prompts from your copied code -copybutton_prompt_text = ">>> " -# All lines of the code blocks will be copied after the prompts are stripped. 
-copybutton_only_copy_prompt_lines = False - - -nbsphinx_execute = 'never' - -autodoc_mock_imports = ['torch', 'tensorflow'] -autosummary_generate = True - -source_suffix = { - '.rst': 'restructuredtext', - '.txt': 'markdown', - '.md': 'markdown', -} -master_doc = 'index' -project = 'Hangar' -year = '2019-2020' -author = 'Richard Izzo' -copyright = '{0}, {1}'.format(year, author) -version = release = '0.5.2' - -pygments_style = 'default' -pygments_lexer = 'PythonConsoleLexer' -highlight_options = { - 'python3': True -} -templates_path = ['.'] -exclude_patterns = ['_build', '**.ipynb_checkpoints'] -extlinks = { - 'issue': ('https://github.com/tensorwerk/hangar-py/issues/%s', '#'), - 'pr': ('https://github.com/tensorwerk/hangar-py/pull/%s', 'PR #'), -} -intersphinx_mapping = { - 'python': ('https://docs.python.org/3', None), - 'torch': ('https://pytorch.org/docs/master', None), - 'numpy': ('http://docs.scipy.org/doc/numpy', None), -} - -# Regular expressions that match URIs that should not be checked -# when doing a linkcheck build -linkcheck_ignore = [ - r'http://localhost:\d+/?', 'http://localhost/', - 'https://github.com/tensorwerk/hangar-py', - r'https://github.com/tensorwerk/hangar-py/.*', - r'http://tensorwerk.com/hangar-benchmarks/', - r'https://tensorwerk.com/hangar-benchmarks', -] -linkcheck_retries = 3 - -# on_rtd is whether we are on readthedocs.org -on_rtd = os.environ.get('READTHEDOCS', None) == 'True' - -# if not on_rtd: # only set the theme if we're building docs locally -# html_theme = 'sphinx_rtd_theme' -html_theme = 'sphinx_material' - -html_sidebars = { - "**": ["logo-text.html", "globaltoc.html", "localtoc.html", "searchbox.html"] -} - -html_short_title = '%s-%s' % (project, version) - -napoleon_use_ivar = True -napoleon_use_rtype = True -napoleon_use_param = True -napoleon_include_init_with_doc = True - -add_module_names = False -doctest_test_doctest_blocks = None -autoclass_content = 'class' - -# Material theme options (see theme.conf for more information) -html_theme_options = { - - # Set the name of the project to appear in the navigation. - 'nav_title': 'Hangar', - - # Set the color and the accent color - 'color_primary': 'deep-purple', - 'color_accent': 'blue', - - # Set the repo location to get a badge with stats - 'repo_url': 'https://github.com/tensorwerk/hangar-py/', - 'repo_name': 'Hangar', - 'repo_type': 'github', - - - # Visible levels of the global TOC; -1 means unlimited - 'globaltoc_depth': -1, - # If False, expand all TOC entries - 'globaltoc_collapse': True, - # If True, show hidden TOC entries - 'globaltoc_includehidden': True, -} diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 00000000..38e9ec49 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,94 @@ +Contributing +============ + +Contributions are welcome, and they are greatly appreciated! Every +little bit helps, and credit will always be given. + +All community members should read and abide by our +`ref-code-of-conduct`{.interpreted-text role="ref"}. + +Bug reports +----------- + +When [reporting a bug](https://github.com/tensorwerk/hangar-py/issues) +please include: + +> - Your operating system name and version. +> - Any details about your local setup that might be helpful in +> troubleshooting. +> - Detailed steps to reproduce the bug. + +Documentation improvements +-------------------------- + +Hangar could always use more documentation, whether as part of the +official Hangar docs, in docstrings, or even on the web in blog posts, +articles, and such. 
+ +Feature requests and feedback +----------------------------- + +The best way to send feedback is to file an issue at +. + +If you are proposing a feature: + +- Explain in detail how it would work. +- Keep the scope as narrow as possible, to make it easier to + implement. +- Remember that this is a volunteer-driven project, and that code + contributions are welcome :) + +Development +----------- + +To set up [hangar-py]{.title-ref} for local development: + +1. Fork [hangar-py](https://github.com/tensorwerk/hangar-py) (look for + the \"Fork\" button). +2. Clone your fork locally: + + git clone git@github.com:your_name_here/hangar-py.git + +3. Create a branch for local development: + + git checkout -b name-of-your-bugfix-or-feature + + Now you can make your changes locally. + +4. When you\'re done making changes, run all the checks, doc builder + and spell checker with + [tox](http://tox.readthedocs.io/en/latest/install.html) one command: + + tox + +5. Commit your changes and push your branch to GitHub: + + git add . + git commit -m "Your detailed description of your changes." + git push origin name-of-your-bugfix-or-feature + +6. Submit a pull request through the GitHub website. + +### Pull Request Guidelines + +If you need some code review or feedback while you\'re developing the +code just make the pull request. + +For merging, you should: + +1. Include passing tests (run `tox`). +2. Update documentation when there\'s new API, functionality etc. +3. Add a note to `CHANGELOG.rst` about the changes. +4. Add yourself to `AUTHORS.rst`. + +### Tips + +To run a subset of tests: + + tox -e envname -- pytest -k test_myfeature + +To run all the test environments in *parallel* (you need to +`pip install detox`): + + detox diff --git a/docs/contributing.rst b/docs/contributing.rst deleted file mode 100644 index 3bdd7dc2..00000000 --- a/docs/contributing.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../CONTRIBUTING.rst \ No newline at end of file diff --git a/docs/contributingindex.rst b/docs/contributingindex.rst deleted file mode 100644 index 88a92def..00000000 --- a/docs/contributingindex.rst +++ /dev/null @@ -1,12 +0,0 @@ -.. _ref-contributing: - -###################### -Contributing to Hangar -###################### - -.. toctree:: - :maxdepth: 2 - - contributing - codeofconduct - benchmarking diff --git a/docs/design.md b/docs/design.md new file mode 100644 index 00000000..1464fa60 --- /dev/null +++ b/docs/design.md @@ -0,0 +1,317 @@ +Hangar Under The Hood +===================== + +At its core, Hangar is a content addressable data store whose design +requirements were inspired by the Git version control system. + +Things In Life Change, Your Data Shouldn't +------------------------------------------ + +When designing a high performance data version control system, achieving +performance goals while ensuring consistency is incredibly difficult. +Memory is fast, disk is slow; not much we can do about it. But since +Hangar should deal with any numeric data in an array of any size (with +an enforced limit of 31 dimensions in a sample\...) we have to find ways +to work *with* the disk, not against it. + +Upon coming to terms with this face, we are actually presented with a +problem once we realize that we live in the real world, and real world +is ugly. Computers crash, processes get killed, and people do \* +*interesting* \* things. 
Because of this, it is a foundational design
principle for us to *guarantee that once Hangar says data has been
successfully added to the repository, it is actually persisted.* This
essentially means that any process which interacts with data records on
disk must be stateless. If (for example) we were to keep a record of all
data added to the staging area in an in-memory list, and the process
gets killed, we may have just lost references to all of the array data,
and may not even be sure that the arrays were flushed to disk properly.
These situations are a NO-GO from the start, and will always remain so.

So, we come to the first design choice: **read and write actions are
atomic**. Once data is added to a Hangar repository, the numeric array,
along with the necessary book-keeping records, will *always* be written
transactionally, ensuring that when something unexpected happens, the
data and records are committed to disk.

!!! note

    The atomicity of interactions is completely hidden from a normal user;
    they shouldn't have to care about this or even know this exists.
    However, this is also why using the context-manager style column
    interaction scheme can result in a ~2x speedup on writes/reads. We
    can just pass on most of the work to the Python `contextlib` package
    instead of having to begin and commit/abort (depending on interaction
    mode) transactions with every call to an `add` or `get` method.

Data Is Large, We Don't Waste Space
-----------------------------------

From the very beginning we knew that while it would be easy to just
store all data in every commit as independent arrays on disk, such a
naive implementation would just absolutely eat up disk space for any
repository with a non-trivial history. Hangar commits should be fast and
use minimal disk space; duplicating data just doesn't make sense for
such a system. And so we decided on implementing a content addressable
data store backend.

When a user requests to add data to a Hangar repository, one of the
first operations which occurs is to generate a hash of the array
contents. If the hash does not match a piece of data already placed in
the Hangar repository, the data is sent to the appropriate storage
backend methods. On success, the backend sends back some arbitrary
specification which can be used to retrieve that same piece of data from
that particular backend. The record backend then stores a key/value pair
of (`hash`, `backend_specification`).

!!! note

    The record backend stores hash information in a separate location from
    the commit references (which associate a `(columnname, sample name/id)`
    to a `sample_hash`). This lets us separate the historical repository
    information from a particular computer's location of a data piece. All
    we need in the public history is to know that some data with a
    particular hash is associated with a commit. No one but the system
    which actually needs to access the data needs to know where it can be
    found.

On the other hand, if a data sample is added to a repository which
already has a record of some hash, we don't even involve the storage
backend. All we need to do is just record that a new sample in a column
was added with that hash. It makes no sense to write the same data
twice.

This method can actually result in massive space savings for some common
use cases. For the MNIST column, the training label data is typically a
1D-array of size 50,000. Because there are only 10 labels, we only need
to store 10 ints on disk, and just keep references to the rest.
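This space saving falls directly out of the content addressing. The toy
sketch below (an illustration of the idea only; it is not Hangar's real
hashing or record code, and the digest and "backend specification" formats
are made up) shows why 50,000 label samples drawn from just 10 values touch
the storage backend only 10 times:

```python
import hashlib
import numpy as np

backend_store = {}   # digest -> "backend specification" (toy stand-in)
sample_records = {}  # (column, sample name) -> digest

def add_sample(column, name, arr):
    digest = hashlib.blake2b(arr.tobytes(), digest_size=20).hexdigest()
    if digest not in backend_store:              # unseen content: store it once
        backend_store[digest] = f'spec:{digest[:8]}'
    sample_records[(column, name)] = digest      # every sample still gets a record

labels = np.random.default_rng(0).integers(0, 10, size=50_000)
for i, label in enumerate(labels):
    add_sample('train_labels', str(i), np.array([label], dtype=np.int64))

print(len(sample_records))   # 50000 sample records ...
print(len(backend_store))    # ... but only 10 distinct arrays ever reach "disk"
```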
The Basics of Collaboration: Branching and Merging
--------------------------------------------------

Up to this point, we haven't actually discussed much about how data and
records are treated on disk. We'll leave an entire walkthrough of the
backend record structure for another tutorial, but let's introduce the
basics here, and see how we enable the types of branching and merging
operations you might be used to with source code (at largely the same
speed!).

Here are a few core principles to keep in mind:

### Numbers == Numbers

Hangar has no concept of what a piece of data is outside of a string of
bytes / numerical array, and most importantly, *Hangar does not care*;
Hangar is a tool, and we leave it up to you to know what your data
actually means!

At the end of the day when the data is placed into *some* collection on
disk, the storage backend we use won't care either. In fact, this is
the entire reason why Hangar can do what it can; we don't attempt to
treat data as anything other than a series of bytes on disk!

The fact that *Hangar does not care about what your data represents* is
a fundamental underpinning of how the system works under the hood. It is
the *designed and intended behavior* of Hangar to dump arrays to disk in
what would seem like completely arbitrary buffers/locations to an
outside observer. And for the most part, they would be essentially
correct in their observation that data samples on disk are in strange
locations.

While there is almost no organization or hierarchy for the actual data
samples when they are stored on disk, that is not to say that they are
stored without care! We may not care about global trends, but we do care
a great deal about the byte order/layout, sequentiality,
chunking/compression, and validation operations which are applied across
the bytes which make up a data sample.

In other words, we optimize for utility and performance on the backend,
not so that a human can understand the file format without a computer!
After the array has been saved to disk, all we care about is that the
bookkeeper can record some unique information about where some piece of
content is, and how we can read it. *None of that information is
stored alongside the data itself - remember: numbers are just numbers -
they don't have any concept of what they are.*

### Records != Numbers

*The form numerical data takes once dumped on disk is completely
irrelevant to the specifications of records in the repository history.*

Now, let's unpack this a bit. We know from [Numbers ==
Numbers](#numbers-numbers) that data is saved to disk in some arbitrary
locations with some arbitrary backend. We also know from [Data Is Large,
We Don't Waste Space](#data-is-large-we-dont-waste-space) that the
permanent repository information only contains a record which links a
sample name to a hash. We also assert that there is a mapping of
hash to storage backend specification kept somewhere (it doesn't matter
what that mapping is for the moment). With those 3 pieces of
information, it's obvious that once data is placed in the repository,
we don't actually need to interact with it to understand the accounting
of what was added when!
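A toy model may make this concrete. Given only the
`(column, sample name) -> hash` mappings of two commits (the plain dicts
below stand in for Hangar's actual record layout, which lives in its own
key/value store), additions, removals, and mutations can all be computed
without ever opening an array on disk:

```python
# Commit references as plain dicts: (column, sample name) -> content hash.
commit_a = {
    ('train_images', '0'): 'hash_1',
    ('train_images', '1'): 'hash_2',
    ('train_labels', '0'): 'hash_9',
}
commit_b = {
    ('train_images', '0'): 'hash_1',   # unchanged
    ('train_images', '1'): 'hash_7',   # mutated: same key, new content hash
    ('train_images', '2'): 'hash_3',   # added
    # ('train_labels', '0') was removed
}

added = commit_b.keys() - commit_a.keys()
removed = commit_a.keys() - commit_b.keys()
mutated = {k for k in commit_a.keys() & commit_b.keys()
           if commit_a[k] != commit_b[k]}

print(added)     # {('train_images', '2')}
print(removed)   # {('train_labels', '0')}
print(mutated)   # {('train_images', '1')}
```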
+ +In order to make a commit, we just pack up all the records which existed +in the staging area, create a hash of the records (including the hash of +any parent commits), and then store the commit hash mapping alongside +details such as the commit user/email and commit message, and a +compressed version of the full commit records as they existed at that +point in time. + +!!! note + + That last point "storing a compressed version of the full commit + records", is semi inefficient, and will be changed in the future so + that unchanged records are note duplicated across commits. + +An example is given below of the keys -\> values mapping which stores +each of the staged records, and which are packed up / compressed on +commit (and subsequently unpacked on checkout!). + + Num asets 'a.' -> '2' + --------------------------------------------------------------------------- + Name of aset -> num samples || 'a.train_images' -> '10' + Name of data -> hash || 'a.train_images.0' -> BAR_HASH_1' + Name of data -> hash || 'a.train_images.1' -> BAR_HASH_2' + Name of data -> hash || 'a.train_images.2' -> BAR_HASH_3' + Name of data -> hash || 'a.train_images.3' -> BAR_HASH_4' + Name of data -> hash || 'a.train_images.4' -> BAR_HASH_5' + Name of data -> hash || 'a.train_images.5' -> BAR_HASH_6' + Name of data -> hash || 'a.train_images.6' -> BAR_HASH_7' + Name of data -> hash || 'a.train_images.7' -> BAR_HASH_8' + Name of data -> hash || 'a.train_images.8' -> BAR_HASH_9' + Name of data -> hash || 'a.train_images.9' -> BAR_HASH_0' + --------------------------------------------------------------------------- + Name of aset -> num samples || 'a.train_labels' -> '10' + Name of data -> hash || 'a.train_labels.0' -> BAR_HASH_11' + Name of data -> hash || 'a.train_labels.1' -> BAR_HASH_12' + Name of data -> hash || 'a.train_labels.2' -> BAR_HASH_13' + Name of data -> hash || 'a.train_labels.3' -> BAR_HASH_14' + Name of data -> hash || 'a.train_labels.4' -> BAR_HASH_15' + Name of data -> hash || 'a.train_labels.5' -> BAR_HASH_16' + Name of data -> hash || 'a.train_labels.6' -> BAR_HASH_17' + Name of data -> hash || 'a.train_labels.7' -> BAR_HASH_18' + Name of data -> hash || 'a.train_labels.8' -> BAR_HASH_19' + Name of data -> hash || 'a.train_labels.9' -> BAR_HASH_10' + --------------------------------------------------------------------------- + 's.train_images' -> '{"schema_hash": "RM4DefFsjRs=", + "schema_dtype": 2, + "schema_is_var": false, + "schema_max_shape": [784], + "schema_is_named": true}' + 's.train_labels' -> '{"schema_hash": + "ncbHqE6Xldg=", + "schema_dtype": 7, + "schema_is_var": false, + "schema_max_shape": [1], + "schema_is_named": true}' + +### History is Relative + +Though it may be a bit obvious to state, it is of critical importance to +realize that it is only because we store the full contents of the +repository staging area as it existed in the instant just prior to a +commit, that the integrity of full repository history can be verified +from a single commit's contents and expected hash value. More so, any +single commit has only a topical relationship to a commit at any other +point in time. 
 It is only our imposition of a commit's ancestry tree
+which actualizes any subsequent insights or interactivity.
+
+While the general process of topological ordering: create branch,
+checkout branch, commit a few times, and merge, follows the `git` model
+fairly well at a conceptual level, there are some important differences
+we want to highlight which stem from how Hangar is implemented:
+
+1) Multiple commits can be simultaneously checked out in "read-only"
+    mode on a single machine. Checking out a commit for reading does not
+    touch the staging area status.
+2) Only one process can interact with a write-enabled checkout at a
+    time.
+3) A detached head CANNOT exist for write-enabled checkouts. A staging
+    area must begin with an identical state to the most recent commit of
+    a branch.
+4) A staging area which has had changes made in it cannot switch its
+    base branch without either a commit, hard-reset, or (soon to be
+    developed) stash operation.
+
+When a repository is initialized, a record is created which indicates
+the staging area's `HEAD` branch. In addition, a branch is created with
+the name `master`; its initial commit will be the only commit in the
+entire repository which has no parent. The record key/value pairs
+resemble the following:
+
+    'branch.master' -> ''         # No parent commit.
+    'head' -> 'branch.master'     # Staging area head branch
+
+    # Commit Hash | Parent Commit
+    -------------------------------------
+
+!!! warning
+
+    Much like git, odd things can happen before the `'initial commit'` is
+    made. We recommend creating the initial commit as quickly as possible
+    to prevent undefined behavior during repository setup. In the future,
+    we may decide to create the "initial commit" automatically upon
+    repository initialization.
+
+Once the initial commit is made, a permanent commit record is made which
+specifies the records (not shown below) and the parent commit. The
+branch head pointer is then updated to point to that commit as its base.
+
+    'branch.master' -> '479b4cfff6219e3d'
+    'head' -> 'branch.master'
+
+    # Commit Hash | Parent Commit
+    -------------------------------------
+    '479b4cfff6219e3d' -> ''
+
+Branches can be created as cheaply as a single line of text can be
+written, and they simply require a "root" commit hash (or a branch
+name, in which case the branch's current HEAD commit will be used as
+the root HEAD). Likewise a branch can be merged with just a single write
+operation (once the merge logic has completed - a process which is
+explained separately from this section; just trust that it happens for
+now).
+
+A more complex example which creates 4 different branches and merges
+them in a complicated order can be seen below. Please note that the `<<`
+symbol is used to indicate a merge commit, where `X << Y` reads:
+`'merging dev branch Y into master branch X'`.
+ + 'branch.large_branch' -> '8eabd22a51c5818c' + 'branch.master' -> '2cd30b98d34f28f0' + 'branch.test_branch' -> '1241a36e89201f88' + 'branch.trydelete' -> '51bec9f355627596' + 'head' -> 'branch.master' + + # Commit Hash | Parent Commit + ------------------------------------- + '1241a36e89201f88' -> '8a6004f205fd7169' + '2cd30b98d34f28f0' -> '9ec29571d67fa95f << 51bec9f355627596' + '51bec9f355627596' -> 'd683cbeded0c8a89' + '69a09d87ea946f43' -> 'd683cbeded0c8a89' + '8a6004f205fd7169' -> 'a320ae935fc3b91b' + '8eabd22a51c5818c' -> 'c1d596ed78f95f8f' + '9ec29571d67fa95f' -> '69a09d87ea946f43 << 8eabd22a51c5818c' + 'a320ae935fc3b91b' -> 'e3e79dd897c3b120' + 'c1d596ed78f95f8f' -> '' + 'd683cbeded0c8a89' -> 'fe0bcc6a427d5950 << 1241a36e89201f88' + 'e3e79dd897c3b120' -> 'c1d596ed78f95f8f' + 'fe0bcc6a427d5950' -> 'e3e79dd897c3b120' + +Because the raw commit hash logs can be quite dense to parse, a +graphical logging utility is included as part of the repository. Running +the `Repository.log()` method will pretty print a graph representation +of the commit history: + +``` {.sourceCode .python} +>>> from hangar import Repository +>>> repo = Repository(path='/foo/bar/path/') + +... # make some commits + +>>> repo.log() +``` + +![image](./img/repo_graph_log.png) diff --git a/docs/design.rst b/docs/design.rst deleted file mode 100644 index 8bf5dfc7..00000000 --- a/docs/design.rst +++ /dev/null @@ -1,317 +0,0 @@ -.. _ref-hangar-under-the-hood: - -===================== -Hangar Under The Hood -===================== - -At its core, Hangar is a content addressable data store whose design -requirements were inspired by the Git version control system. - - -Things In Life Change, Your Data Shouldn't -========================================== - -When designing a high performance data version control system, achieving -performance goals while ensuring consistency is incredibly difficult. Memory is -fast, disk is slow; not much we can do about it. But since Hangar should -deal with any numeric data in an array of any size (with an enforced limit of -31 dimensions in a sample...) we have to find ways to work *with* the disk, -not against it. - -Upon coming to terms with this face, we are actually presented with a problem -once we realize that we live in the real world, and real world is ugly. -Computers crash, processes get killed, and people do * *interesting* * things. -Because of this, It is a foundational design principle for us to **guarantee -that once Hangar says data has been successfully added to the repository, it is -actually persisted.** This essentially means that any process which interacts -with data records on disk must be stateless. If (for example) we were to keep a -record of all data added to the staging area in an in-memory list, and the -process gets killed, we may have just lost references to all of the array data, -and may not even be sure that the arrays were flushed to disk properly. These -situations are a NO-GO from the start, and will always remain so. - -So, we come to the first design choice: **read and write actions are atomic**. -Once data is added to a Hangar repository, the numeric array along with the -necessary book-keeping records will *always* occur transactionally, ensuring -that when something unexpected happens, the data and records are committed to -disk. - -.. note:: - - The atomicity of interactions is completely hidden from a normal user; they - shouldn't have to care about this or even know this exists. 
However, this - is also why using the context-manager style column interaction scheme can - result in ~2x times speedup on writes/reads. We can just pass on most of the - work to the Python ``contextlib`` package instead of having to begin and - commit/abort (depending on interaction mode) transactions with every call to - an `add` or `get` method. - - -Data Is Large, We Don't Waste Space -=================================== - -From the very beginning we knew that while it would be easy to just store all -data in every commit as independent arrays on disk, such a naive implementation -would just absolutely eat up disk space for any repository with a non-trivial -history. Hangar commits should be fast and use minimal disk space, duplicating -data just doesn't make sense for such a system. And so we decided on -implementing a content addressable data store backend. - -When a user requests to add data to a Hangar repository, one of the first -operations which occur is to generate a hash of the array contents. If the hash -does not match a piece of data already placed in the Hangar repository, the -data is sent to the appropriate storage backend methods. On success, the -backend sends back some arbitrary specification which can be used to retrieve -that same piece of data from that particular backend. The record backend then -stores a key/value pair of (`hash`, `backend_specification`). - -.. note:: - - The record backend stores hash information in a separate location from the - commit references (which associate a `(columnname, sample name/id)` to a - `sample_hash`). This let's us separate the historical repository - information from a particular computer's location of a data piece. All we need in - the public history is to know that some data with a particular hash is - associated with a commit. No one but the system which actually needs to access - the data needs to know where it can be found. - -On the other hand, if a data sample is added to a repository which already has -a record of some hash, we don't even involve the storage backend. All we need -to do is just record that a new sample in a column was added with that hash. -It makes no sense to write the same data twice. - -This method can actually result in massive space savings for some common use -cases. For the MNIST column, the training label data is typically a 1D-array -of size 50,000. Because there are only 10 labels, we only need to store 10 ints -on disk, and just keep references to the rest. - - -The Basics of Collaboration: Branching and Merging -================================================== - -Up to this point, we haven't actually discussed much about how data and records -are treated on disk. We'll leave an entire walkthrough of the backend record -structure for another tutorial, but let's introduce the basics here, and see -how we enable the types of branching and merging operations you might be used -to with source code (at largely the same speed!). - -Here's a few core principles to keep in mind: - -Numbers == Numbers ------------------- - -Hangar has no concept of what a piece of data is outside of a string of bytes / -numerical array, and most importantly, *hangar does not care*; Hangar is a -tool, and we leave it up to you to know what your data actually means)! - -At the end of the day when the data is placed into *some* collection on disk, -the storage backend we use won't care either. 
In fact, this is the entire -reason why Hangar can do what it can; we don't attempt to treat data as -anything other then a series of bytes on disk! - -The fact that *Hangar does not care about what your data represents* is a -fundamental underpinning of how the system works under the hood. It is the -*designed and intended behavior* of Hangar to dump arrays to disk in what would -seem like completely arbitrary buffers/locations to an outside observer. And -for the most part, they would be essentially correct in their observation that -data samples on disk are in strange locations. - -While there is almost no organization or hierarchy for the actual data samples -when they are stored on disk, that is not to say that they are stored without -care! We may not care about global trends, but we do care a great deal about -the byte order/layout, sequentiality, chunking/compression and validations -operations which are applied across the bytes which make up a data sample. - -In other words, we optimize for utility and performance on the backend, not so -that a human can understand the file format without a computer! After the array -has been saved to disk, all we care about is that bookkeeper can record some -unique information about where some piece of content is, and how we can read -it. **None of that information is stored alongside the data itself - Remember: -numbers are just numbers - they don't have any concept of what they are**. - - -Records != Numbers ------------------- - -*The form numerical data takes once dumped on disk is completely irrelevant to -the specifications of records in the repository history.* - -Now, let's unpack this for a bit. We know from `Numbers == Numbers`_ that data -is saved to disk in some arbitrary locations with some arbitrary backend. We -also know from `Data Is Large, We Don't Waste Space`_ that the permanent -repository information only contains a record which links a sample name to a -hash. We also assert that there is also a mapping of hash to storage backend -specification kept somewhere (doesn't matter what that mapping is for the -moment). With those 3 pieces of information, it's obvious that once data is -placed in the repository, we don't actually need to interact with it to -understand the accounting of what was added when! - -In order to make a commit, we just pack up all the records which existed in the -staging area, create a hash of the records (including the hash of any parent -commits), and then store the commit hash mapping alongside details such as the -commit user/email and commit message, and a compressed version of the full -commit records as they existed at that point in time. - -.. note:: - - That last point "storing a compressed version of the full commit records", is - semi inefficient, and will be changed in the future so that unchanged records - are note duplicated across commits. - -An example is given below of the keys -> values mapping which stores each of -the staged records, and which are packed up / compressed on commit (and -subsequently unpacked on checkout!). - -:: - - Num asets 'a.' 
-> '2' - --------------------------------------------------------------------------- - Name of aset -> num samples || 'a.train_images' -> '10' - Name of data -> hash || 'a.train_images.0' -> BAR_HASH_1' - Name of data -> hash || 'a.train_images.1' -> BAR_HASH_2' - Name of data -> hash || 'a.train_images.2' -> BAR_HASH_3' - Name of data -> hash || 'a.train_images.3' -> BAR_HASH_4' - Name of data -> hash || 'a.train_images.4' -> BAR_HASH_5' - Name of data -> hash || 'a.train_images.5' -> BAR_HASH_6' - Name of data -> hash || 'a.train_images.6' -> BAR_HASH_7' - Name of data -> hash || 'a.train_images.7' -> BAR_HASH_8' - Name of data -> hash || 'a.train_images.8' -> BAR_HASH_9' - Name of data -> hash || 'a.train_images.9' -> BAR_HASH_0' - --------------------------------------------------------------------------- - Name of aset -> num samples || 'a.train_labels' -> '10' - Name of data -> hash || 'a.train_labels.0' -> BAR_HASH_11' - Name of data -> hash || 'a.train_labels.1' -> BAR_HASH_12' - Name of data -> hash || 'a.train_labels.2' -> BAR_HASH_13' - Name of data -> hash || 'a.train_labels.3' -> BAR_HASH_14' - Name of data -> hash || 'a.train_labels.4' -> BAR_HASH_15' - Name of data -> hash || 'a.train_labels.5' -> BAR_HASH_16' - Name of data -> hash || 'a.train_labels.6' -> BAR_HASH_17' - Name of data -> hash || 'a.train_labels.7' -> BAR_HASH_18' - Name of data -> hash || 'a.train_labels.8' -> BAR_HASH_19' - Name of data -> hash || 'a.train_labels.9' -> BAR_HASH_10' - --------------------------------------------------------------------------- - 's.train_images' -> '{"schema_hash": "RM4DefFsjRs=", - "schema_dtype": 2, - "schema_is_var": false, - "schema_max_shape": [784], - "schema_is_named": true}' - 's.train_labels' -> '{"schema_hash": - "ncbHqE6Xldg=", - "schema_dtype": 7, - "schema_is_var": false, - "schema_max_shape": [1], - "schema_is_named": true}' - -History is Relative -------------------- - -Though it may be a bit obvious to state, it is of critical importance to -realize that it is only because we store the full contents of the repository -staging area as it existed in the instant just prior to a commit, that the -integrity of full repository history can be verified from a single commit's -contents and expected hash value. More so, any single commit has only a topical -relationship to a commit at any other point in time. It is only our imposition -of a commit's ancestry tree which actualizes any subsequent insights or -interactivity - -While the general process of topological ordering: create branch, checkout -branch, commit a few times, and merge, follows the `git` model fairly well at a -conceptual level, there are some important -differences we want to highlight due to their implementation differences: - -1) Multiple commits can simultaneously checked out in "read-only" mode on a - single machine. Checking out a commit for reading does not touch the staging - area status. -2) Only one process can interact with the a write-enabled checkout at a time. -3) A detached head CANNOT exist for write enabled checkouts. A staging area must - begin with an identical state to the most recent commit of a/any branch. -4) A staging area which has had changes made in it cannot switch base branch - without either a commit, hard-reset, or (soon to be developed) stash - operation. - -When a repository is initialized, a record is created which indicates the -staging area's `HEAD` branch. 
in addition, a branch is created with the name -`master`, and which is the only commit in the entire repository which will have -no parent. The record key/value pairs resemble the following: - -:: - - 'branch.master' -> '' # No parent commit. - 'head' -> 'branch.master' # Staging area head branch - - # Commit Hash | Parent Commit - ------------------------------------- - - -.. warning:: - - Much like git, odd things can happen before the `'initial commit'` is made. We - recommend creating the initial commit as quickly as possible to prevent - undefined behavior during repository setup. In the future, we may decide to - create the "initial commit" automatically upon repository initialization. - - -Once the initial commit is made, a permanent commit record in made which -specifies the records (not shown below) and the parent commit. The branch head -pointer is then updated to point to that commit as it's base. - -:: - - 'branch.master' -> '479b4cfff6219e3d' - 'head' -> 'branch.master' - - # Commit Hash | Parent Commit - ------------------------------------- - '479b4cfff6219e3d' -> '' - -Branches can be created as cheaply as a single line of text can be written, and -they simply require a "root" commit hash (or a branch name, in which case the -branch's current HEAD commit will be used as the root HEAD). Likewise a branch -can be merged with just a single write operation (once the merge logic has -completed - a process which is explained separately from this section; just -trust that it happens for now). - -A more complex example which creates 4 different branches and merges them in a -complicated order can be seen below. Please note that the `` << `` symbol is -used to indicate a merge commit where `X << Y` reads: ``'merging dev branch Y -into master branch X'``. - -:: - - 'branch.large_branch' -> '8eabd22a51c5818c' - 'branch.master' -> '2cd30b98d34f28f0' - 'branch.test_branch' -> '1241a36e89201f88' - 'branch.trydelete' -> '51bec9f355627596' - 'head' -> 'branch.master' - - # Commit Hash | Parent Commit - ------------------------------------- - '1241a36e89201f88' -> '8a6004f205fd7169' - '2cd30b98d34f28f0' -> '9ec29571d67fa95f << 51bec9f355627596' - '51bec9f355627596' -> 'd683cbeded0c8a89' - '69a09d87ea946f43' -> 'd683cbeded0c8a89' - '8a6004f205fd7169' -> 'a320ae935fc3b91b' - '8eabd22a51c5818c' -> 'c1d596ed78f95f8f' - '9ec29571d67fa95f' -> '69a09d87ea946f43 << 8eabd22a51c5818c' - 'a320ae935fc3b91b' -> 'e3e79dd897c3b120' - 'c1d596ed78f95f8f' -> '' - 'd683cbeded0c8a89' -> 'fe0bcc6a427d5950 << 1241a36e89201f88' - 'e3e79dd897c3b120' -> 'c1d596ed78f95f8f' - 'fe0bcc6a427d5950' -> 'e3e79dd897c3b120' - - -Because the raw commit hash logs can be quite dense to parse, a graphical -logging utility is included as part of the repository. Running the -``Repository.log()`` method will pretty print a graph representation of the -commit history: - -.. code:: python - - >>> from hangar import Repository - >>> repo = Repository(path='/foo/bar/path/') - - ... # make some commits - - >>> repo.log() - -.. image:: ./img/repo_graph_log.png diff --git a/docs/externals.md b/docs/externals.md new file mode 100644 index 00000000..8ce4f86a --- /dev/null +++ b/docs/externals.md @@ -0,0 +1,14 @@ +Hangar External +=============== + +High level interaction interface between hangar and everything external. 
+ +High Level Methods +------------------ + +::: hangar.external._external + +Plugin System +------------- + +::: hangar.external.base_plugin diff --git a/docs/externals.rst b/docs/externals.rst deleted file mode 100644 index d5f3f53b..00000000 --- a/docs/externals.rst +++ /dev/null @@ -1,17 +0,0 @@ -.. _ref-external: - -=============== -Hangar External -=============== - -High level interaction interface between hangar and everything external. - -High Level Methods -================== -.. automodule:: hangar.external._external - :members: - -Plugin System -============= -.. automodule:: hangar.external.base_plugin - :members: diff --git a/docs/faq.md b/docs/faq.md new file mode 100644 index 00000000..9e697272 --- /dev/null +++ b/docs/faq.md @@ -0,0 +1,221 @@ +Frequently Asked Questions +========================== + +The following documentation are taken from questions and comments on the +[Hangar User Group Slack Channel](https://hangarusergroup.slack.com) and +over various Github issues. + +How can I get an Invite to the Hangar User Group? +------------------------------------------------- + +Just click on [This Signup Link +\]() +to get started. + +Data Integrity +-------------- + +> Being a young project did you encounter some situations where the +> disaster was not a compilation error but dataset corruption? This is +> the most fearing aspect of using young projects but every project will +> start from a phase before becoming mature and production ready. + +An absolute requirement of a system right this is to protect user data +at all costs (I'll refer to this as preserving data \"integrity\" from +here). During our initial design of the system, we made the decision +that preserving integrity comes above all other system parameters: +including performance, disk size, complexity of the Hangar core, and +even features should we not be able to make them absolutely safe for the +user. And to be honest, the very first versions of Hangar were quite +slow and difficult to use as a result of this. + +The initial versions of Hangar (which we put together in \~2 weeks) had +essentially most of the features we have today. We've improved the API, +made things clearer, and added some visualization/reporting utilities, +but not much has changed. Essentially the entire development effort has +been addressing issues stemming from a fundamental need to protect user +data at all costs. That work has been very successful, and performance +is extremely promising (and improving all the time). + +To get into the details here: There have been only 3 instances in the +entire time I've developed Hangar where we lost data irrecoverably: + +1. We used to move data around between folders with some regularity (as + a convenient way to mark some files as containing data which have + been "committed", and can no longer be opened in anything but + read-only mode). There was a bug (which never made it past a local + dev version) at one point where I accidentally called + `shutil.rmtree(path)` with a directory one level too high... that + wasn't great. + + Just to be clear, we don't do this anymore (since disk IO costs are + way too high), but remnants of it's intention are still very much + alive and well. Once data has been added to the repository, and is + "committed", the file containing that data will never be opened in + anything but read-only mode again. This reduces the chance of disk + corruption massively from the start. + +------------------------------------------------------------------------ + +2. 
When I was implementing the numpy memmap array storage backend, I + was totally surprised during an early test when I: + + ``` {.sourceCode .text} + - opened a write-enabled checkout + - added some data + - without committing, retrieved the same data again via the user facing API + - overwrote some slice of the return array with new data and did some processing + - asked Hangar for that same array key again, and instead of returning + the contents got a fatal RuntimeError raised by Hangar with the + code/message indicating "'DATA CORRUPTION ERROR: Checksum {cksum} != + recorded for {hashVal}" + ``` + + What had happened was that when opening a `numpy.memmap` array on + disk in `w+` mode, the default behavior when returning a subarray is + to return a subclass of `np.ndarray` of type `np.memmap`. Though the + numpy docs state: \"The memmap object can be used anywhere an + ndarray is accepted. Given a `memmap fp`, + `isinstance(fp, numpy.ndarray)` returns `True`\". I did not + anticipate that updates to the subarray slice would also update the + memmap on disk. A simple mistake to make; this has since been + remedied by manually instantiating a new `np.ndarray` instance from + the `np.memmap` subarray slice buffer. + + However, the nice part is that this was a real world proof that our + system design worked (and not just in tests). When you add data to a + Hangar checkout (or receive it on a fetch/clone operation) we + calculate a hash digest of the data via `blake2b` (a + cryptographically secure algorithm in the python standard library). + While this allows us to cryptographically verify full integrity + checks and history immutability, cryptographic hashes are slow by + design. When we want to read local data (which we've already ensured + was correct when it was placed on disk) it would be prohibitively + slow to do a full cryptographic verification on every read. However, + since its NOT acceptable to provide no integrity verification (even + for local writes) we compromise with a much faster (though non + cryptographic) hash digest/checksum. This operation occurs on EVERY + read of data from disk. + + The theory here is that even though Hangar makes every effort to + guarantee safe operations itself, in the real world we have to deal + with systems which break. We've planned for cases where some OS + induced disk corruption occurs, or where some malicious actor + modifies the file contents manually; we can't stop that from + happening, but Hangar can make sure that you will know about it when + it happens! + +------------------------------------------------------------------------ + +3. Before we got smart with the HDF5 backend low level details, it was + an issue for us to have a write-enabled checkout attempt to write an + array to disk and immediately read it back in. I'll gloss over the + details for the sake of simplicity here, but basically I was + presented with an CRC32 Checksum Verification Failed error in some + edge cases. The interesting bit was that if I closed the checkout, + and reopened it, it data was secure and intact on disk, but for + immediate reads after writes, we weren't propagating changes to the + HDF5 chunk metadata cache to `rw` operations appropriately. + + This was fixed very early on by taking advantage of a new feature in + HDF5 1.10.4 referred to as Single Writer Multiple Reader (SWMR). 
The + long and short is that by being careful to handle the order in which + a new HDF5 file is created on disk and opened in w and r mode with + SWMR enabled, the HDF5 core guarantees the integrity of the metadata + chunk cache at all times. Even if a fatal system crash occurs in the + middle of a write, the data will be preserved. This solved this + issue completely for us + + There are many many many more details which I could cover here, but + the long and short of it is that in order to ensure data integrity, + Hangar is designed to not let the user do anything they aren't + allowed to at any time + + > - Read checkouts have no ability to modify contents on disk via + > any method. It's not possible for them to actually delete or + > overwrite anything in any way. + > - Write checkouts can only ever write data. The only way to + > remove the actual contents of written data from disk is if + > changes have been made in the staging area (but not committed) + > and the `reset_staging_area()` method is called. And even this + > has no ability to remove any data which had previously existed + > in some commit in the repo's history + + In addition, a Hangar checkout object is not what it appears to be + (at first glance, use, or even during common introspection + operations). If you try to operate on it after closing the checkout, + or holding it while another checkout is started, you won't be able + to (there's a whole lot of invisible "magic" going on with + `weakrefs`, `objectproxies`, and instance attributes). I would + encourage you to do the following: + + ``` {.sourceCode .pycon} + >>> co = repo.checkout(write=True) + >>> co.metadata['hello'] = 'world' + >>> # try to hold a reference to the metadata object: + >>> mRef = co.metadata + >>> mRef['hello'] + 'world' + >>> co.commit('first commit') + >>> co.close() + >>> # what happens when you try to access the `co` or `mRef` object? + >>> mRef['hello'] + ReferenceError: weakly-referenced object no longer exists + >>> print(co) # or any other operation + PermissionError: Unable to operate on past checkout objects which have been closed. No operation occurred. Please use a new checkout. + ``` + + The last bit I'll leave you with is a note on context managers and + performance (how we handle record data safety and effectively + + +How Can a Hangar Repository be Backed Up? +----------------------------------------- + +Two strategies exist: + +1. Use a remote server and Hangar's built in ability to just push data + to a remote! (tutorial coming soon, see `ref-api`{.interpreted-text + role="ref"} for more details. +2. A Hangar repository is self contained in it's .hangar directory. To + back up the data, just copy/paste or rsync it to another machine! + (edited) + +On Determining `Column` Schema Sizes +------------------------------------ + +> Say I have a data group that specifies a data array with one +> dimension, three elements (say height, width, num channels) and later +> on I want to add bit depth. Can I do that, or do I need to make a new +> data group? Should it have been three scalar data groups from the +> start? + +So right now it's not possible to change the schema (shape, dtype) of a +column. I've thought about such a feature for a while now, and while it +will require a new user facing API option, its (almost) trivial to make +it work in the core. It just hasn't seemed like a priority yet\... 
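+
+In practice that means a column's shape and dtype are declared once, when
+the column is created; to also track the extra element today you would
+create a new (wider) column rather than altering the old one. A rough
+sketch of what that looks like (the column names here are made up, and
+the exact keyword arguments of `add_ndarray_column` may differ slightly
+from this):
+
+```pycon
+>>> import numpy as np
+>>> co = repo.checkout(write=True)
+>>> # schema is fixed at creation: 1D, three elements (height, width, channels)
+>>> co.add_ndarray_column('img_dims', shape=(3,), dtype=np.uint32)
+>>> # to additionally record bit depth, define a new column with the wider schema
+>>> co.add_ndarray_column('img_dims_v2', shape=(4,), dtype=np.uint32)
+>>> co.commit('track image dimensions and bit depth')
+>>> co.close()
+```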
+ +And no, I wouldn't specify each of those as scalar data groups, they are +a related piece of information, and generally would want to be accessed +together + +Access patterns should generally dictate how much info is placed in a +column + +### Is there a performance/space penalty for having lots of small data groups? + +As far as a performance / space penalty, this is where it gets good :) + +- Using fewer columns means that there are fewer records (the internal + locating info, kind-of like a git tree) to store, since each record + points to a sample containing more information. +- Using more columns means that the likelihood of samples having the + same value increases, meaning fewer pieces of data are actually + stored on disk (remember it's a content addressable file store) + +However, since the size of a record (40 bytes or so before compression, +and we generally see compression ratios around 15-30% of the original +size once the records are committed) is generally negligible compared to +the size of data on disk, optimizing for number of records is just way +overkill. For this case, it really doesn't matter. **Optimize for ease +of use** diff --git a/docs/faq.rst b/docs/faq.rst deleted file mode 100644 index f25d12a6..00000000 --- a/docs/faq.rst +++ /dev/null @@ -1,219 +0,0 @@ -.. _ref-faq: - -========================== -Frequently Asked Questions -========================== - -The following documentation are taken from questions and comments on the -`Hangar User Group Slack Channel `_ -and over various Github issues. - - -How can I get an Invite to the Hangar User Group? -================================================== - -Just click on `This Signup Link -`_ -to get started. - - -Data Integrity -============== - - Being a young project did you encounter some situations where the disaster - was not a compilation error but dataset corruption? This is the most fearing - aspect of using young projects but every project will start from a phase - before becoming mature and production ready. - -An absolute requirement of a system right this is to protect user data at all -costs (I’ll refer to this as preserving data "integrity" from here). During our -initial design of the system, we made the decision that preserving integrity -comes above all other system parameters: including performance, disk size, -complexity of the Hangar core, and even features should we not be able to make -them absolutely safe for the user. And to be honest, the very first versions of -Hangar were quite slow and difficult to use as a result of this. - -The initial versions of Hangar (which we put together in ~2 weeks) had -essentially most of the features we have today. We’ve improved the API, made -things clearer, and added some visualization/reporting utilities, but not much -has changed. Essentially the entire development effort has been addressing -issues stemming from a fundamental need to protect user data at all costs. That -work has been very successful, and performance is extremely promising (and -improving all the time). - -To get into the details here: There have been only 3 instances in the entire -time I’ve developed Hangar where we lost data irrecoverably: - -1. We used to move data around between folders with some regularity (as a - convenient way to mark some files as containing data which have been - “committed”, and can no longer be opened in anything but read-only mode). 
- There was a bug (which never made it past a local dev version) at one point - where I accidentally called ``shutil.rmtree(path)`` with a directory one - level too high… that wasn’t great. - - Just to be clear, we don’t do this anymore (since disk IO costs are way too - high), but remnants of it’s intention are still very much alive and well. - Once data has been added to the repository, and is “committed”, the file - containing that data will never be opened in anything but read-only mode - again. This reduces the chance of disk corruption massively from the start. - ----- - -2. When I was implementing the numpy memmap array storage backend, I was - totally surprised during an early test when I: - - .. code:: text - - - opened a write-enabled checkout - - added some data - - without committing, retrieved the same data again via the user facing API - - overwrote some slice of the return array with new data and did some processing - - asked Hangar for that same array key again, and instead of returning - the contents got a fatal RuntimeError raised by Hangar with the - code/message indicating "'DATA CORRUPTION ERROR: Checksum {cksum} != - recorded for {hashVal}" - - What had happened was that when opening a ``numpy.memmap`` array on disk in - ``w+`` mode, the default behavior when returning a subarray is to return a - subclass of ``np.ndarray`` of type ``np.memmap``. Though the numpy docs - state: "The memmap object can be used anywhere an ndarray is accepted. Given - a ``memmap fp``, ``isinstance(fp, numpy.ndarray)`` returns ``True``". I did - not anticipate that updates to the subarray slice would also update the - memmap on disk. A simple mistake to make; this has since been remedied by - manually instantiating a new ``np.ndarray`` instance from the ``np.memmap`` - subarray slice buffer. - - However, the nice part is that this was a real world proof that our system - design worked (and not just in tests). When you add data to a Hangar - checkout (or receive it on a fetch/clone operation) we calculate a hash - digest of the data via ``blake2b`` (a cryptographically secure algorithm in the - python standard library). While this allows us to cryptographically verify full - integrity checks and history immutability, cryptographic hashes are slow by - design. When we want to read local data (which we’ve already ensured was - correct when it was placed on disk) it would be prohibitively slow to do a - full cryptographic verification on every read. However, since its NOT - acceptable to provide no integrity verification (even for local writes) we - compromise with a much faster (though non cryptographic) hash - digest/checksum. This operation occurs on EVERY read of data from disk. - - The theory here is that even though Hangar makes every effort to guarantee - safe operations itself, in the real world we have to deal with systems which - break. We’ve planned for cases where some OS induced disk corruption occurs, - or where some malicious actor modifies the file contents manually; we can’t - stop that from happening, but Hangar can make sure that you will know about - it when it happens! - ----- - -3. Before we got smart with the HDF5 backend low level details, it was an issue - for us to have a write-enabled checkout attempt to write an array to disk - and immediately read it back in. I’ll gloss over the details for the sake of - simplicity here, but basically I was presented with an CRC32 Checksum - Verification Failed error in some edge cases. 
The interesting bit was that - if I closed the checkout, and reopened it, it data was secure and intact on - disk, but for immediate reads after writes, we weren’t propagating changes - to the HDF5 chunk metadata cache to ``rw`` operations appropriately. - - This was fixed very early on by taking advantage of a new feature in HDF5 - 1.10.4 referred to as Single Writer Multiple Reader (SWMR). The long and - short is that by being careful to handle the order in which a new HDF5 file - is created on disk and opened in w and r mode with SWMR enabled, the HDF5 - core guarantees the integrity of the metadata chunk cache at all times. Even - if a fatal system crash occurs in the middle of a write, the data will be - preserved. This solved this issue completely for us - - There are many many many more details which I could cover here, but the long - and short of it is that in order to ensure data integrity, Hangar is - designed to not let the user do anything they aren’t allowed to at any time - - - Read checkouts have no ability to modify contents on disk via any - method. It’s not possible for them to actually delete or overwrite - anything in any way. - - Write checkouts can only ever write data. The only way to remove the - actual contents of written data from disk is if changes have been made - in the staging area (but not committed) and the - ``reset_staging_area()`` method is called. And even this has no - ability to remove any data which had previously existed in some commit - in the repo’s history - - In addition, a Hangar checkout object is not what it appears to be (at first - glance, use, or even during common introspection operations). If you try to - operate on it after closing the checkout, or holding it while another - checkout is started, you won’t be able to (there’s a whole lot of invisible - “magic” going on with ``weakrefs``, ``objectproxies``, and instance - attributes). I would encourage you to do the following: - - .. code:: pycon - - >>> co = repo.checkout(write=True) - >>> co.metadata['hello'] = 'world' - >>> # try to hold a reference to the metadata object: - >>> mRef = co.metadata - >>> mRef['hello'] - 'world' - >>> co.commit('first commit') - >>> co.close() - >>> # what happens when you try to access the `co` or `mRef` object? - >>> mRef['hello'] - ReferenceError: weakly-referenced object no longer exists - >>> print(co) # or any other operation - PermissionError: Unable to operate on past checkout objects which have been closed. No operation occurred. Please use a new checkout. - - The last bit I’ll leave you with is a note on context managers and performance - (how we handle record data safety and effectively - - .. seealso:: - - - :ref:`ref-tutorial` (Part 1, In section: "performance") - - :ref:`ref-hangar-under-the-hood` - - -How Can a Hangar Repository be Backed Up? -========================================= - -Two strategies exist: - -1. Use a remote server and Hangar’s built in ability to just push data to a - remote! (tutorial coming soon, see :ref:`ref-api` for more details. - -2. A Hangar repository is self contained in it’s .hangar directory. To back - up the data, just copy/paste or rsync it to another machine! (edited) - - -On Determining ``Column`` Schema Sizes -======================================= - - Say I have a data group that specifies a data array with one dimension, - three elements (say height, width, num channels) and later on I want to add - bit depth. Can I do that, or do I need to make a new data group? 
Should it - have been three scalar data groups from the start? - -So right now it’s not possible to change the schema (shape, dtype) of a -column. I’ve thought about such a feature for a while now, and while it will -require a new user facing API option, its (almost) trivial to make it work in -the core. It just hasn’t seemed like a priority yet... - -And no, I wouldn’t specify each of those as scalar data groups, they are a -related piece of information, and generally would want to be accessed together - -Access patterns should generally dictate how much info is placed in a column - - -Is there a performance/space penalty for having lots of small data groups? --------------------------------------------------------------------------- - -As far as a performance / space penalty, this is where it gets good :) - -- Using fewer columns means that there are fewer records (the internal - locating info, kind-of like a git tree) to store, since each record points to - a sample containing more information. - -- Using more columns means that the likelihood of samples having the same - value increases, meaning fewer pieces of data are actually stored on disk - (remember it’s a content addressable file store) - -However, since the size of a record (40 bytes or so before compression, and we -generally see compression ratios around 15-30% of the original size once the -records are committed) is generally negligible compared to the size of data on -disk, optimizing for number of records is just way overkill. For this case, it -really doesn’t matter. **Optimize for ease of use** diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index e9bd370c..00000000 --- a/docs/index.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. include:: ../README.rst - -.. toctree:: - :maxdepth: 3 - - readme - quickstart - installation - concepts - api - tutorial - design - cli - externals - faq - backends - contributingindex - authors - changelog - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 00000000..a39610b9 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,56 @@ +Installation +============ + +For general usage it is recommended that you use a pre-built version of +Hangar, either from a Python Distribution, or a pre-built wheel from +PyPi. + +Pre-Built Installation +---------------------- + +### Python Distributions + +If you do not already use a Python Distribution, we recommend the +[Anaconda \]() (or [Miniconda +\]()) distribution, +which supports all major operating systems (Windows, MacOSX, & the +typical Linux variations). Detailed usage instructions are available [on +the anaconda website \](). + +To install Hangar via the Anaconda Distribution (from the [conda-forge +conda channel \]()): + + conda install -c conda-forge hangar + +### Wheels (PyPi) + +If you have an existing python installation on your computer, pre-built +Hangar Wheels can be installed via pip from the Python Package Index +(PyPi): + + pip install hangar + +Source Installation +------------------- + +To install Hangar from source, clone the repository from [Github +\](): + + git clone https://github.com/tensorwerk/hangar-py.git + cd hangar-py + python setup.py install + +Or use pip on the local package if you want to install all dependencies +automatically in a development environment: + + pip install -e . 
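+
+Whichever installation method you use, a quick way to confirm everything
+is wired up is to import the package. This sketch assumes the package
+exposes a `__version__` attribute; `Repository` is the main entry point
+used throughout these docs:
+
+```python
+import hangar
+from hangar import Repository  # should import without error
+
+print(hangar.__version__)      # prints the installed hangar version
+```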
+ +### Source installation in Google colab + +Google colab comes with an older version of `h5py` pre-installed which +is not compatible with hangar. If you need to install hangar from the +source in google colab, make sure to uninstall the existing `h5py` : + + !pip uninstall h5py + +Then follow the Source Installation steps given above. diff --git a/docs/installation.rst b/docs/installation.rst deleted file mode 100644 index 380dcba1..00000000 --- a/docs/installation.rst +++ /dev/null @@ -1,65 +0,0 @@ -.. _ref_installation: - -============ -Installation -============ - -For general usage it is recommended that you use a pre-built version of Hangar, -either from a Python Distribution, or a pre-built wheel from PyPi. - - -Pre-Built Installation -====================== - - -Python Distributions --------------------- - -If you do not already use a Python Distribution, we recommend the `Anaconda -`_ (or `Miniconda -`_) distribution, which supports -all major operating systems (Windows, MacOSX, & the typical Linux variations). -Detailed usage instructions are available `on the anaconda website -`_. - -To install Hangar via the Anaconda Distribution (from the `conda-forge conda -channel `_):: - - conda install -c conda-forge hangar - - -Wheels (PyPi) -------------- - -If you have an existing python installation on your computer, pre-built Hangar Wheels -can be installed via pip from the Python Package Index (PyPi):: - - pip install hangar - - -Source Installation -=================== - - -To install Hangar from source, clone the repository from `Github -`_:: - - git clone https://github.com/tensorwerk/hangar-py.git - cd hangar-py - python setup.py install - -Or use pip on the local package if you want to install all dependencies -automatically in a development environment:: - - pip install -e . - - -Source installation in Google colab ------------------------------------ -Google colab comes with an older version of ``h5py`` pre-installed which is not -compatible with hangar. If you need to install hangar from the source in -google colab, make sure to uninstall the existing ``h5py`` :: - - !pip uninstall h5py - -Then follow the Source Installation steps given above. diff --git a/docs/js/custom.js b/docs/js/custom.js new file mode 100644 index 00000000..fe2e09dc --- /dev/null +++ b/docs/js/custom.js @@ -0,0 +1,114 @@ +/** + * Code from fastapi github repository + */ + +function setupTermynal() { + document.querySelectorAll(".use-termynal").forEach(node => { + node.style.display = "block"; + new Termynal(node, { + lineDelay: 500 + }); + }); + const progressLiteralStart = "---> 100%"; + const promptLiteralStart = "$ "; + const customPromptLiteralStart = "# "; + const termynalActivateClass = "termy"; + let termynals = []; + + function createTermynals() { + document + .querySelectorAll(`.${termynalActivateClass}`) + .forEach(node => { + const text = node.textContent; + const lines = text.split("\n"); + const useLines = []; + let buffer = []; + function saveBuffer() { + if (buffer.length) { + let isBlankSpace = true; + buffer.forEach(line => { + if (line) { + isBlankSpace = false; + } + }); + dataValue = {}; + if (isBlankSpace) { + dataValue["delay"] = 0; + } + if (buffer[buffer.length - 1] === "") { + // A last single
 <br> won't have effect
+                        // so put an additional one
+                        buffer.push("");
+                    }
+                    const bufferValue = buffer.join("<br>
"); + dataValue["value"] = bufferValue; + useLines.push(dataValue); + buffer = []; + } + } + for (let line of lines) { + if (line === progressLiteralStart) { + saveBuffer(); + useLines.push({ + type: "progress" + }); + } else if (line.startsWith(promptLiteralStart)) { + saveBuffer(); + const value = line.replace(promptLiteralStart, "").trimEnd(); + useLines.push({ + type: "input", + value: value + }); + } else if (line.startsWith("// ")) { + saveBuffer(); + const value = "💬 " + line.replace("// ", "").trimEnd(); + useLines.push({ + value: value, + class: "termynal-comment", + delay: 0 + }); + } else if (line.startsWith(customPromptLiteralStart)) { + saveBuffer(); + const promptStart = line.indexOf(promptLiteralStart); + if (promptStart === -1) { + console.error("Custom prompt found but no end delimiter", line) + } + const prompt = line.slice(0, promptStart).replace(customPromptLiteralStart, "") + let value = line.slice(promptStart + promptLiteralStart.length); + useLines.push({ + type: "input", + value: value, + prompt: prompt + }); + } else { + buffer.push(line); + } + } + saveBuffer(); + const div = document.createElement("div"); + node.replaceWith(div); + const termynal = new Termynal(div, { + lineData: useLines, + noInit: true, + lineDelay: 500 + }); + termynals.push(termynal); + }); + } + + function loadVisibleTermynals() { + termynals = termynals.filter(termynal => { + if (termynal.container.getBoundingClientRect().top - innerHeight <= 0) { + termynal.init(); + return false; + } + return true; + }); + } + window.addEventListener("scroll", loadVisibleTermynals); + createTermynals(); + loadVisibleTermynals(); +} + +setupTermynal() +document.getElementsByClassName('gitter-open-chat-button')[0].style.backgroundColor="#7f85c0" diff --git a/docs/js/termynal.js b/docs/js/termynal.js new file mode 100644 index 00000000..4ac32708 --- /dev/null +++ b/docs/js/termynal.js @@ -0,0 +1,264 @@ +/** + * termynal.js + * A lightweight, modern and extensible animated terminal window, using + * async/await. + * + * @author Ines Montani + * @version 0.0.1 + * @license MIT + */ + +'use strict'; + +/** Generate a terminal widget. */ +class Termynal { + /** + * Construct the widget's settings. + * @param {(string|Node)=} container - Query selector or container element. + * @param {Object=} options - Custom settings. + * @param {string} options.prefix - Prefix to use for data attributes. + * @param {number} options.startDelay - Delay before animation, in ms. + * @param {number} options.typeDelay - Delay between each typed character, in ms. + * @param {number} options.lineDelay - Delay between each line, in ms. + * @param {number} options.progressLength - Number of characters displayed as progress bar. + * @param {string} options.progressChar – Character to use for progress bar, defaults to █. + * @param {number} options.progressPercent - Max percent of progress. + * @param {string} options.cursor – Character to use for cursor, defaults to ▋. + * @param {Object[]} lineData - Dynamically loaded line data objects. + * @param {boolean} options.noInit - Don't initialise the animation. + */ + constructor(container = '#termynal', options = {}) { + this.container = (typeof container === 'string') ? 
document.querySelector(container) : container; + this.pfx = `data-${options.prefix || 'ty'}`; + this.originalStartDelay = this.startDelay = options.startDelay + || parseFloat(this.container.getAttribute(`${this.pfx}-startDelay`)) || 600; + this.originalTypeDelay = this.typeDelay = options.typeDelay + || parseFloat(this.container.getAttribute(`${this.pfx}-typeDelay`)) || 90; + this.originalLineDelay = this.lineDelay = options.lineDelay + || parseFloat(this.container.getAttribute(`${this.pfx}-lineDelay`)) || 1500; + this.progressLength = options.progressLength + || parseFloat(this.container.getAttribute(`${this.pfx}-progressLength`)) || 40; + this.progressChar = options.progressChar + || this.container.getAttribute(`${this.pfx}-progressChar`) || '█'; + this.progressPercent = options.progressPercent + || parseFloat(this.container.getAttribute(`${this.pfx}-progressPercent`)) || 100; + this.cursor = options.cursor + || this.container.getAttribute(`${this.pfx}-cursor`) || '▋'; + this.lineData = this.lineDataToElements(options.lineData || []); + this.loadLines() + if (!options.noInit) this.init() + } + + loadLines() { + // Load all the lines and create the container so that the size is fixed + // Otherwise it would be changing and the user viewport would be constantly + // moving as she/he scrolls + const finish = this.generateFinish() + finish.style.visibility = 'hidden' + this.container.appendChild(finish) + // Appends dynamically loaded lines to existing line elements. + this.lines = [...this.container.querySelectorAll(`[${this.pfx}]`)].concat(this.lineData); + for (let line of this.lines) { + line.style.visibility = 'hidden' + this.container.appendChild(line) + } + const restart = this.generateRestart() + restart.style.visibility = 'hidden' + this.container.appendChild(restart) + this.container.setAttribute('data-termynal', ''); + } + + /** + * Initialise the widget, get lines, clear container and start animation. + */ + init() { + /** + * Calculates width and height of Termynal container. + * If container is empty and lines are dynamically loaded, defaults to browser `auto` or CSS. + */ + const containerStyle = getComputedStyle(this.container); + this.container.style.width = containerStyle.width !== '0px' ? + containerStyle.width : undefined; + this.container.style.minHeight = containerStyle.height !== '0px' ? + containerStyle.height : undefined; + + this.container.setAttribute('data-termynal', ''); + this.container.innerHTML = ''; + for (let line of this.lines) { + line.style.visibility = 'visible' + } + this.start(); + } + + /** + * Start the animation and rener the lines depending on their data attributes. 
+ */ + async start() { + this.addFinish() + await this._wait(this.startDelay); + + for (let line of this.lines) { + const type = line.getAttribute(this.pfx); + const delay = line.getAttribute(`${this.pfx}-delay`) || this.lineDelay; + + if (type == 'input') { + line.setAttribute(`${this.pfx}-cursor`, this.cursor); + await this.type(line); + await this._wait(delay); + } + + else if (type == 'progress') { + await this.progress(line); + await this._wait(delay); + } + + else { + this.container.appendChild(line); + await this._wait(delay); + } + + line.removeAttribute(`${this.pfx}-cursor`); + } + this.addRestart() + this.finishElement.style.visibility = 'hidden' + this.lineDelay = this.originalLineDelay + this.typeDelay = this.originalTypeDelay + this.startDelay = this.originalStartDelay + } + + generateRestart() { + const restart = document.createElement('a') + restart.onclick = (e) => { + e.preventDefault() + this.container.innerHTML = '' + this.init() + } + restart.href = '#' + restart.setAttribute('data-terminal-control', '') + restart.innerHTML = "restart ↻" + return restart + } + + generateFinish() { + const finish = document.createElement('a') + finish.onclick = (e) => { + e.preventDefault() + this.lineDelay = 0 + this.typeDelay = 0 + this.startDelay = 0 + } + finish.href = '#' + finish.setAttribute('data-terminal-control', '') + finish.innerHTML = "fast →" + this.finishElement = finish + return finish + } + + addRestart() { + const restart = this.generateRestart() + this.container.appendChild(restart) + } + + addFinish() { + const finish = this.generateFinish() + this.container.appendChild(finish) + } + + /** + * Animate a typed line. + * @param {Node} line - The line element to render. + */ + async type(line) { + const chars = [...line.textContent]; + line.textContent = ''; + this.container.appendChild(line); + + for (let char of chars) { + const delay = line.getAttribute(`${this.pfx}-typeDelay`) || this.typeDelay; + await this._wait(delay); + line.textContent += char; + } + } + + /** + * Animate a progress bar. + * @param {Node} line - The line element to render. + */ + async progress(line) { + const progressLength = line.getAttribute(`${this.pfx}-progressLength`) + || this.progressLength; + const progressChar = line.getAttribute(`${this.pfx}-progressChar`) + || this.progressChar; + const chars = progressChar.repeat(progressLength); + const progressPercent = line.getAttribute(`${this.pfx}-progressPercent`) + || this.progressPercent; + line.textContent = ''; + this.container.appendChild(line); + + for (let i = 1; i < chars.length + 1; i++) { + await this._wait(this.typeDelay); + const percent = Math.round(i / chars.length * 100); + line.textContent = `${chars.slice(0, i)} ${percent}%`; + if (percent>progressPercent) { + break; + } + } + } + + /** + * Helper function for animation delays, called with `await`. + * @param {number} time - Timeout, in ms. + */ + _wait(time) { + return new Promise(resolve => setTimeout(resolve, time)); + } + + /** + * Converts line data objects into line elements. + * + * @param {Object[]} lineData - Dynamically loaded lines. + * @param {Object} line - Line data object. + * @returns {Element[]} - Array of line elements. + */ + lineDataToElements(lineData) { + return lineData.map(line => { + let div = document.createElement('div'); + div.innerHTML = `${line.value || ''}`; + + return div.firstElementChild; + }); + } + + /** + * Helper function for generating attributes string. + * + * @param {Object} line - Line data object. 
+ * @returns {string} - String of attributes. + */ + _attributes(line) { + let attrs = ''; + for (let prop in line) { + // Custom add class + if (prop === 'class') { + attrs += ` class=${line[prop]} ` + continue + } + if (prop === 'type') { + attrs += `${this.pfx}="${line[prop]}" ` + } else if (prop !== 'value') { + attrs += `${this.pfx}-${prop}="${line[prop]}" ` + } + } + + return attrs; + } +} + +/** +* HTML API: If current script has container(s) specified, initialise Termynal. +*/ +if (document.currentScript.hasAttribute('data-termynal-container')) { + const containers = document.currentScript.getAttribute('data-termynal-container'); + containers.split('|') + .forEach(container => new Termynal(container)) +} diff --git a/docs/noindexapi/apiinit.rst b/docs/noindexapi/apiinit.rst deleted file mode 100644 index 57f08113..00000000 --- a/docs/noindexapi/apiinit.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. automethod:: hangar.checkout.WriterCheckout.add_ndarray_column - :noindex: - -.. automethod:: hangar.checkout.WriterCheckout.add_str_column - :noindex: - -.. automethod:: hangar.checkout.WriterCheckout.add_bytes_column - :noindex: diff --git a/docs/noindexapi/apiremotefetchdata.rst b/docs/noindexapi/apiremotefetchdata.rst deleted file mode 100644 index 4563b206..00000000 --- a/docs/noindexapi/apiremotefetchdata.rst +++ /dev/null @@ -1,2 +0,0 @@ -.. automethod:: hangar.repository.Remotes.fetch_data - :noindex: \ No newline at end of file diff --git a/docs/quickstart.md b/docs/quickstart.md new file mode 100644 index 00000000..0d3912dc --- /dev/null +++ b/docs/quickstart.md @@ -0,0 +1,10 @@ +Usage +===== + +To use Hangar in a project: + + from hangar import Repository + +Please refer to the `ref-tutorial`{.interpreted-text role="ref"} for +examples, or `ref-concepts`{.interpreted-text role="ref"} to review the +core concepts of the Hangar system. diff --git a/docs/quickstart.rst b/docs/quickstart.rst deleted file mode 100644 index f133d0d4..00000000 --- a/docs/quickstart.rst +++ /dev/null @@ -1,11 +0,0 @@ -===== -Usage -===== - -To use Hangar in a project:: - - from hangar import Repository - - -Please refer to the :ref:`ref-tutorial` for examples, or :ref:`ref-concepts` to -review the core concepts of the Hangar system. \ No newline at end of file diff --git a/docs/readme.rst b/docs/readme.rst deleted file mode 100644 index 72a33558..00000000 --- a/docs/readme.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../README.rst diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 62cd681e..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -sphinx>=2.0 -sphinx-material -sphinx-click -nbsphinx -sphinx-copybutton -recommonmark -IPython -Cython diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt deleted file mode 100644 index f95eb78d..00000000 --- a/docs/spelling_wordlist.txt +++ /dev/null @@ -1,11 +0,0 @@ -builtin -builtins -classmethod -staticmethod -classmethods -staticmethods -args -kwargs -callstack -Changelog -Indices diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 00000000..5cf33b6d --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,38 @@ +/* Indentation. */ +div.doc-contents:not(.first) { + padding-left: 25px; + border-left: 4px solid rgba(230, 230, 230); + margin-bottom: 80px; +} + +/* Don't capitalize names. */ +h5.doc-heading { + text-transform: none !important; +} + +/* Don't use vertical space on hidden ToC entries. 
*/ +h6.hidden-toc { + margin: 0 !important; + position: relative; + top: -70px; +} +h6.hidden-toc::before { + margin-top: 0 !important; + padding-top: 0 !important; +} + +/* Don't show permalink of hidden ToC entries. */ +h6.hidden-toc a.headerlink { + display: none; +} + +/* Avoid breaking parameters name, etc. in table cells. */ +td code { + word-break: normal !important; +} + +/* For pieces of Markdown rendered in table cells. */ +td p { + margin-top: 0 !important; + margin-bottom: 0 !important; +} diff --git a/docs/stylesheets/termynal.css b/docs/stylesheets/termynal.css new file mode 100644 index 00000000..188ad8a3 --- /dev/null +++ b/docs/stylesheets/termynal.css @@ -0,0 +1,108 @@ +/** + * termynal.js + * + * @author Ines Montani + * @version 0.0.1 + * @license MIT + */ + +:root { + --color-bg: #252a33; + --color-text: #eee; + --color-text-subtle: #a2a2a2; +} + +[data-termynal] { + width: 750px; + max-width: 100%; + background: var(--color-bg); + color: var(--color-text); + font-size: 18px; + /* font-family: 'Fira Mono', Consolas, Menlo, Monaco, 'Courier New', Courier, monospace; */ + font-family: 'Roboto Mono', 'Fira Mono', Consolas, Menlo, Monaco, 'Courier New', Courier, monospace; + border-radius: 4px; + padding: 1px 45px 35px; + position: relative; + -webkit-box-sizing: border-box; + box-sizing: border-box; +} + +[data-termynal]:before { + content: ''; + position: absolute; + top: 15px; + left: 15px; + display: inline-block; + width: 15px; + height: 15px; + border-radius: 50%; + /* A little hack to display the window buttons in one pseudo element. */ + background: #d9515d; + -webkit-box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930; + box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930; +} + +[data-termynal]:after { + content: 'bash'; + position: absolute; + color: var(--color-text-subtle); + top: 5px; + left: 0; + width: 100%; + text-align: center; +} + +a[data-terminal-control] { + text-align: right; + display: block; + color: #aebbff; +} + +[data-ty] { + display: block; + line-height: 2; +} + +[data-ty]:before { + /* Set up defaults and ensure empty lines are displayed. */ + content: ''; + display: inline-block; + vertical-align: text-top; +} + +[data-ty="input"]:before, +[data-ty-prompt]:before { + margin-right: 0.75em; + color: var(--color-text-subtle); +} + +[data-ty="input"]:before { + content: '$'; +} + +[data-ty][data-ty-prompt]:before { + content: attr(data-ty-prompt); +} + +[data-ty-cursor]:after { + content: attr(data-ty-cursor); + font-family: monospace; + margin-left: 0.5em; + -webkit-animation: blink 1s infinite; + animation: blink 1s infinite; +} + + +/* Cursor animation */ + +@-webkit-keyframes blink { + 50% { + opacity: 0; + } +} + +@keyframes blink { + 50% { + opacity: 0; + } +} diff --git a/docs/tutorial.rst b/docs/tutorial.rst deleted file mode 100644 index 7eb24e7c..00000000 --- a/docs/tutorial.rst +++ /dev/null @@ -1,16 +0,0 @@ -.. _ref-tutorial: - -############### -Hangar Tutorial -############### - -.. 
toctree:: - :maxdepth: 2 - :titlesonly: - - Tutorial-QuickStart - Tutorial-001 - Tutorial-002 - Tutorial-003 - Tutorial-Dataloader - Tutorial-RealQuickStart diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..dce17132 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,83 @@ +site_name: hangar +site_description: Version control for software 2.0 +site_url: https://hangar-py.readthedocs.io +repo_name: tensorwerk/hangar-py +repo_url: https://github.com/tensorwerk/hangar-py +edit_uri: '' + +extra_css: + - stylesheets/termynal.css + - stylesheets/extra.css +extra_javascript: + - js/termynal.js + - js/custom.js + +theme: + name: material + language: en + +plugins: + - search + - mkdocstrings: + default_handler: python + handlers: + python: + docstring_style: Numpy # this is the default + rendering: + show_source: false + watch: + - src/hangar + - markdownextradata: + data: data + - mkdocs-jupyter: + execute: False + +markdown_extensions: + - markdown.extensions.codehilite: + guess_lang: false + - codehilite + - admonition + - pymdownx.details + - pymdownx.superfences + +extra: + social: + - icon: fontawesome/brands/github-alt + link: https://github.com/tensorwerk/hangar-py + - icon: fontawesome/brands/twitter + link: https://twitter.com/hangardata + - icon: fontawesome/brands/medium + link: https://medium.com/@tensorwerk + - icon: fontawesome/solid/globe + link: https://tensorwerk.com + +nav: + - Home: README.md + - Quickstart: quickstart.md + - Installation: installation.md + - Concepts: concepts.md + - API: api.md + - Tutorials: + - Tutorial-001.ipynb + - Tutorial-002.ipynb + - Tutorial-003.ipynb + - Tutorial-Dataloader.ipynb + - Tutorial-QuickStart.ipynb + - Tutorial-RealQuickStart.ipynb + - Design: design.md + #- CLI: cli.md + - Externals: externals.md + - FAQ: faq.md + - Backends: + - Backend Selection: backends/backends.md + - HDF5_00: backends/hdf5_00.md + - HDF5_01: backends/hdf5_01.md + - NUMPY_10: backends/numpy_10.md + - LMDB_30: backends/lmdb_30.md + - REMOTE_50: backends/remote_50.md + - Contributing: + Contributing: contributing.md + Codeofconduct: codeofconduct.md + Benchmarking: benchmarking.md + - Authors: authors.md + - Changelog: changelog.md diff --git a/src/hangar/bulk_importer.py b/src/hangar/bulk_importer.py index 1369824c..4de8df72 100644 --- a/src/hangar/bulk_importer.py +++ b/src/hangar/bulk_importer.py @@ -161,15 +161,15 @@ def run_bulk_import( Define some arbitrary function (ie "user-defined function" / "UDF") which accepts some arguments and yields data. The UDF must be a generator function, - yielding only values which are of :class:`~.UDF_Return` type. The results + yielding only values which are of :class:`UDF_Return` type. The results yielded by the UDF must be deterministic for a given set of inputs. This - includes all values of the :class:`~.UDF_Return` (``columns`` and ``keys``, + includes all values of the :class:`UDF_Return` (``columns`` and ``keys``, as well as ``data``). A list of input arguments to the UDF must be provided, this is formatted as a sequence (list / tuple) of keyword-arg dictionaries, each of which must be valid when unpacked and bound to the UDF signature. Additionally, all columns - must be specified up front. If any columns are named a :class:`~.UDF_Return` + must be specified up front. If any columns are named a :class:`UDF_Return` which were not pre-specified, the entire operation will fail. 
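+
+    For instance, a minimal (purely illustrative) UDF and kwarg sequence
+    satisfying these requirements might look like the sketch below; the
+    column name ``'arr_col'`` and sample keys are placeholders only, and
+    the ``Examples`` section further down shows complete usage::
+
+        import numpy as np
+        from hangar.bulk_importer import UDF_Return
+
+        def my_udf(scale):
+            # deterministic: the same ``scale`` always yields the same array
+            arr = np.arange(10, dtype=np.float32) * scale
+            yield UDF_Return(column='arr_col', key=f'sample_{scale}', data=arr)
+
+        udf_kwargs = [{'scale': 1}, {'scale': 2}, {'scale': 3}]
+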
Notes @@ -196,20 +196,19 @@ def run_bulk_import( processing time, we recomend trying to yield data pieces which are likely to be unique first from the UDF. - Warnings - -------- + .. warning:: - * Please be aware that these methods should not be executed within a - Jupyter Notebook / Jupyter Lab when running the bulk importer at scale. - The internal implemenation makes significant use of multiprocess Queues - for work distribution and recording. The heavy loads placed on the system - have been observed to place strain on Jupyters ZeroMQ implementation, - resulting in random failures which may or may not even display a traceback - to indicate failure mode. + * Please be aware that these methods should not be executed within a + Jupyter Notebook / Jupyter Lab when running the bulk importer at scale. + The internal implemenation makes significant use of multiprocess Queues + for work distribution and recording. The heavy loads placed on the system + have been observed to place strain on Jupyters ZeroMQ implementation, + resulting in random failures which may or may not even display a traceback + to indicate failure mode. - A small sample set of data can be used within jupyter to test an - implementation without problems, but for full scale operations it is best - run in a script with the operations protected by a ``__main__`` block. + A small sample set of data can be used within jupyter to test an + implementation without problems, but for full scale operations it is best + run in a script with the operations protected by a ``__main__`` block. Examples -------- @@ -217,8 +216,7 @@ def run_bulk_import( >>> import os >>> import numpy as np >>> from PIL import Image - >>> from hangar.bulk_importer import UDF_Return - + >>> from hangar.bulk_importer import UDF_Return, run_bulk_import >>> def image_loader(file_path): ... im = Image.open(file_name) ... arr = np.array(im.resize(512, 512)) @@ -239,13 +237,13 @@ def run_bulk_import( ... {'file_path': '/foo/bird/image_003.jpeg'} ... ] >>> repo = Repository('foo/path/to/repo') - >>> from hangar.bulk_importer import run_bulk_import >>> run_bulk_import( ... repo, branch_name='master', column_names=['file_str', 'image'], ... udf=image_loader, udf_kwargs=udf_kwargs) However, the following will not work, since the output is non-deterministic. + >>> from hangar.bulk_importer import UDF_Return, run_bulk_import >>> def nondeterminstic(x, y): ... first = str(x * y) ... yield UDF_Return(column='valstr', key=f'{x}_{y}', data=first) @@ -262,7 +260,7 @@ def run_bulk_import( ... repo, branch_name='master', column_names=['valstr'], ... udf=image_loader, udf_kwargs=udf_kwargs) Traceback (most recent call last): - File "", line 1, in + `File "", line 1, in ` TypeError: contents returned in subbsequent calls to UDF with identical kwargs yielded different results. UDFs MUST generate deterministic results for the given inputs. Input kwargs generating this result: @@ -272,6 +270,8 @@ def run_bulk_import( data pieces yielded can also vary arbitrarily (so long as the results are deterministic for a particular set of inputs) + >>> import numpy as np + >>> from hangar.bulk_importer import UDF_Return, run_bulk_import >>> def maybe_load(x_arr, y_arr, sample_name, columns=['default']): ... for column in columns: ... 
arr = np.multiply(x_arr, y_arr) From 8cffb218b24390ca3fa244675b2b68d626c8a699 Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Mon, 10 Aug 2020 03:40:14 -0400 Subject: [PATCH 2/7] updates to mkapi for numpydoc parsing --- README.md | 67 ++----- docs/README.md | 67 ++----- docs/api.md | 32 ++-- docs/backends/backends.md | 2 +- docs/backends/hdf5_00.md | 2 +- docs/backends/hdf5_01.md | 2 +- docs/backends/lmdb_30.md | 2 +- docs/backends/numpy_10.md | 2 +- docs/backends/remote_50.md | 2 +- docs/cli.md | 2 +- docs/concepts.md | 260 +++++++++++++------------- docs/externals.md | 4 +- mkdocs.yml | 10 +- src/hangar/backends/hdf5_00.py | 85 ++++----- src/hangar/bulk_importer.py | 234 ++++++++++++----------- src/hangar/checkout.py | 92 ++++----- src/hangar/columns/column.py | 38 ++-- src/hangar/columns/layout_flat.py | 22 +-- src/hangar/columns/layout_nested.py | 34 ++-- src/hangar/dataloaders/tfloader.py | 6 +- src/hangar/dataloaders/torchloader.py | 6 +- src/hangar/diff.py | 26 +-- src/hangar/mixins/datasetget.py | 8 +- src/hangar/remotes.py | 51 +++-- src/hangar/repository.py | 56 +++--- 25 files changed, 522 insertions(+), 590 deletions(-) diff --git a/README.md b/README.md index ebc76a63..91899d5c 100644 --- a/README.md +++ b/README.md @@ -1,54 +1,25 @@ Overview ======== -+-----------------------------------+-----------------------------------+ -| docs | | -+-----------------------------------+-----------------------------------+ -| tests | | [![Build Status](https://github | -| | .com/tensorwerk/hangar-py/workflo | -| | ws/Run%20Test%20Suite/badge.svg?b | -| | ranch=master)](https://github.com | -| | /tensorwerk/hangar-py/actions?que | -| | ry=workflow%3A%22Run+Test+Suite%2 | -| | 2+branch%3Amaster+event%3Apush+is | -| | %3Acompleted) | -| | [![Code Coverage](https://codec | -| | ov.io/gh/tensorwerk/hangar-py/bra | -| | nch/master/graph/badge.svg)](http | -| | s://codecov.io/gh/tensorwerk/hang | -| | ar-py) | -| | | [![Language grade: Python](http | -| | s://img.shields.io/lgtm/grade/pyt | -| | hon/g/tensorwerk/hangar-py.svg?lo | -| | go=lgtm&logoWidth=18)](https://lg | -| | tm.com/projects/g/tensorwerk/hang | -| | ar-py/context:python) | -+-----------------------------------+-----------------------------------+ -| package | | [![PyPI Package latest release] | -| | (https://img.shields.io/pypi/v/ha | -| | ngar.svg)](https://pypi.org/proje | -| | ct/hangar) | -| | [![PyPI Wheel](https://img.shie | -| | lds.io/pypi/wheel/hangar.svg)](ht | -| | tps://pypi.org/project/hangar) | -| | [![Conda-Forge Latest Version]( | -| | https://img.shields.io/conda/vn/c | -| | onda-forge/hangar.svg)](https://a | -| | naconda.org/conda-forge/hangar) | -| | | [![Supported versions](https:// | -| | img.shields.io/pypi/pyversions/ha | -| | ngar.svg)](https://pypi.org/proje | -| | ct/hangar) | -| | [![Supported implementations](h | -| | ttps://img.shields.io/pypi/implem | -| | entation/hangar.svg)](https://pyp | -| | i.org/project/hangar) | -| | | [![GitHub license](https://img. 
| -| | shields.io/github/license/tensorw | -| | erk/hangar-py)](https://github.co | -| | m/tensorwerk/hangar-py/blob/maste | -| | r/LICENSE) | -+-----------------------------------+-----------------------------------+ +Build +----- + +[![Build Status](https://github.com/tensorwerk/hangar-py/workflows/Run%20Test%20Suite/badge.svg?branch=master)](https://github.com/tensorwerk/hangar-py/actions?query=workflow%3A%22Run+Test+Suite%22+branch%3Amaster+event%3Apush+is%3Acompleted) +[![Code Coverage](https://codecov.io/gh/tensorwerk/hangar-py/branch/master/graph/badge.svg)](https://codecov.io/gh/tensorwerk/hangar-py) +[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/tensorwerk/hangar-py.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/tensorwerk/hangar-py/context:python) + +Package +------- + +[![PyPI Package latest release](https://img.shields.io/pypi/v/hangar.svg)](https://pypi.org/project/hangar) +[![PyPI Wheel](https://img.shields.io/pypi/wheel/hangar.svg)](https://pypi.org/project/hangar) +[![Conda-Forge Latest Version](https://img.shields.io/conda/vn/conda-forge/hangar.svg)](https://anaconda.org/conda-forge/hangar) +[![Supported versions](https://img.shields.io/pypi/pyversions/hangar.svg)](https://pypi.org/project/hangar) +[![Supported implementations](https://img.shields.io/pypi/implementation/hangar.svg)](https://pypi.org/project/hangar) +[![GitHub license](https://img.shields.io/github/license/tensorwerk/hangar-py)](https://github.com/tensorwerk/hangar-py/blob/master/LICENSE) + +About +----- Hangar is version control for tensor data. Commit, branch, merge, revert, and collaborate in the data-defined software era. diff --git a/docs/README.md b/docs/README.md index ebc76a63..91899d5c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,54 +1,25 @@ Overview ======== -+-----------------------------------+-----------------------------------+ -| docs | | -+-----------------------------------+-----------------------------------+ -| tests | | [![Build Status](https://github | -| | .com/tensorwerk/hangar-py/workflo | -| | ws/Run%20Test%20Suite/badge.svg?b | -| | ranch=master)](https://github.com | -| | /tensorwerk/hangar-py/actions?que | -| | ry=workflow%3A%22Run+Test+Suite%2 | -| | 2+branch%3Amaster+event%3Apush+is | -| | %3Acompleted) | -| | [![Code Coverage](https://codec | -| | ov.io/gh/tensorwerk/hangar-py/bra | -| | nch/master/graph/badge.svg)](http | -| | s://codecov.io/gh/tensorwerk/hang | -| | ar-py) | -| | | [![Language grade: Python](http | -| | s://img.shields.io/lgtm/grade/pyt | -| | hon/g/tensorwerk/hangar-py.svg?lo | -| | go=lgtm&logoWidth=18)](https://lg | -| | tm.com/projects/g/tensorwerk/hang | -| | ar-py/context:python) | -+-----------------------------------+-----------------------------------+ -| package | | [![PyPI Package latest release] | -| | (https://img.shields.io/pypi/v/ha | -| | ngar.svg)](https://pypi.org/proje | -| | ct/hangar) | -| | [![PyPI Wheel](https://img.shie | -| | lds.io/pypi/wheel/hangar.svg)](ht | -| | tps://pypi.org/project/hangar) | -| | [![Conda-Forge Latest Version]( | -| | https://img.shields.io/conda/vn/c | -| | onda-forge/hangar.svg)](https://a | -| | naconda.org/conda-forge/hangar) | -| | | [![Supported versions](https:// | -| | img.shields.io/pypi/pyversions/ha | -| | ngar.svg)](https://pypi.org/proje | -| | ct/hangar) | -| | [![Supported implementations](h | -| | ttps://img.shields.io/pypi/implem | -| | entation/hangar.svg)](https://pyp | -| | i.org/project/hangar) | -| | | [![GitHub license](https://img. 
| -| | shields.io/github/license/tensorw | -| | erk/hangar-py)](https://github.co | -| | m/tensorwerk/hangar-py/blob/maste | -| | r/LICENSE) | -+-----------------------------------+-----------------------------------+ +Build +----- + +[![Build Status](https://github.com/tensorwerk/hangar-py/workflows/Run%20Test%20Suite/badge.svg?branch=master)](https://github.com/tensorwerk/hangar-py/actions?query=workflow%3A%22Run+Test+Suite%22+branch%3Amaster+event%3Apush+is%3Acompleted) +[![Code Coverage](https://codecov.io/gh/tensorwerk/hangar-py/branch/master/graph/badge.svg)](https://codecov.io/gh/tensorwerk/hangar-py) +[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/tensorwerk/hangar-py.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/tensorwerk/hangar-py/context:python) + +Package +------- + +[![PyPI Package latest release](https://img.shields.io/pypi/v/hangar.svg)](https://pypi.org/project/hangar) +[![PyPI Wheel](https://img.shields.io/pypi/wheel/hangar.svg)](https://pypi.org/project/hangar) +[![Conda-Forge Latest Version](https://img.shields.io/conda/vn/conda-forge/hangar.svg)](https://anaconda.org/conda-forge/hangar) +[![Supported versions](https://img.shields.io/pypi/pyversions/hangar.svg)](https://pypi.org/project/hangar) +[![Supported implementations](https://img.shields.io/pypi/implementation/hangar.svg)](https://pypi.org/project/hangar) +[![GitHub license](https://img.shields.io/github/license/tensorwerk/hangar-py)](https://github.com/tensorwerk/hangar-py/blob/master/LICENSE) + +About +----- Hangar is version control for tensor data. Commit, branch, merge, revert, and collaborate in the data-defined software era. diff --git a/docs/api.md b/docs/api.md index 933cdb4b..cd6d5dc6 100644 --- a/docs/api.md +++ b/docs/api.md @@ -6,70 +6,70 @@ This is the python API for the Hangar project. 
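+A minimal sketch of how the classes documented below typically fit
+together (the repository path, column name, and sample key are purely
+illustrative, and an already-initialized repository is assumed; see the
+Quickstart and Tutorials for complete, runnable walkthroughs):
+
+    >>> import numpy as np
+    >>> from hangar import Repository
+    >>> repo = Repository('path/to/repo')
+    >>> co = repo.checkout(write=True)
+    >>> col = co.add_ndarray_column('images', prototype=np.zeros((28, 28), dtype=np.uint8))
+    >>> col['sample_0'] = np.ones((28, 28), dtype=np.uint8)
+    >>> co.commit('add first image sample')
+    >>> co.close()
+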
Repository ---------- -::: hangar.repository +![mkapi](hangar.repository.Repository) Remotes ------- -::: hangar.remotes.Remotes +![mkapi](hangar.remotes.Remotes) Write Enabled Checkout ---------------------- ### Checkout -::: hangar.checkout.WriterCheckout +![mkapi](hangar.checkout.WriterCheckout) ### Columns -::: hangar.columns.column.Columns +![mkapi](hangar.columns.column.Columns) ### Flat Column Layout Container -::: hangar.columns.layout_flat.FlatSampleWriter +![mkapi](hangar.columns.layout_flat.FlatSampleWriter) ### Nested Column Layout Container -::: hangar.columns.layout_nested.NestedSampleWriter +![mkapi](hangar.columns.layout_nested.NestedSampleWriter) -::: hangar.columns.layout_nested.FlatSubsampleWriter +![mkapi](hangar.columns.layout_nested.FlatSubsampleWriter) ### Differ -::: hangar.diff.WriterUserDiff +![mkapi](hangar.diff.WriterUserDiff) ### Bulk Importer -::: hangar.bulk_importer.run_bulk_import +![mkapi](hangar.bulk_importer.run_bulk_import) Read Only Checkout ------------------ ### Checkout -::: hangar.checkout.ReaderCheckout +![mkapi](hangar.checkout.ReaderCheckout) ### Flat Column Layout Container -::: hangar.columns.layout_flat.FlatSampleReader +![mkapi](hangar.columns.layout_flat.FlatSampleReader) ### Nested Column Layout Container -::: hangar.columns.layout_nested.NestedSampleReader +![mkapi](hangar.columns.layout_nested.NestedSampleReader) -:::hangar.columns.layout_nested.FlatSubsampleReader +![mkapi](hangar.columns.layout_nested.FlatSubsampleReader) ### Differ -::: hangar.diff.ReaderUserDiff +![mkapi](hangar.diff.ReaderUserDiff) ML Framework Dataloaders ------------------------ ### Tensorflow -::: hangar.make_tf_dataset +![mkapi](hangar.make_tf_dataset) ### Pytorch -::: hangar.make_torch_dataset +![mkapi](hangar.make_torch_dataset) diff --git a/docs/backends/backends.md b/docs/backends/backends.md index 1ea73ac9..795787df 100644 --- a/docs/backends/backends.md +++ b/docs/backends/backends.md @@ -11,5 +11,5 @@ Backend selection Any questions or comments can be directed to the [Hangar Github Issues Page](https://github.com/tensorwerk/hangar-py/issues) -::: hangar.backends.__init__ +![mkapi](hangar.backends.__init__) diff --git a/docs/backends/hdf5_00.md b/docs/backends/hdf5_00.md index ecaa0d3a..bc68bda9 100644 --- a/docs/backends/hdf5_00.md +++ b/docs/backends/hdf5_00.md @@ -1,4 +1,4 @@ Local HDF5 Backend ================== -::: hangar.backends.hdf5_00 +![mkapi](hangar.backends.hdf5_00) diff --git a/docs/backends/hdf5_01.md b/docs/backends/hdf5_01.md index 95e68cfe..42d6a02f 100644 --- a/docs/backends/hdf5_01.md +++ b/docs/backends/hdf5_01.md @@ -1,4 +1,4 @@ Fixed Shape Optimized Local HDF5 ================================ -::: hangar.backends.hdf5_01 +![mkapi](hangar.backends.hdf5_01) diff --git a/docs/backends/lmdb_30.md b/docs/backends/lmdb_30.md index 0fe28d29..884fe0ab 100644 --- a/docs/backends/lmdb_30.md +++ b/docs/backends/lmdb_30.md @@ -1,4 +1,4 @@ Variable Shape LMDB String Data Store ===================================== -::: hangar.backends.lmdb_30 +![mkapi](hangar.backends.lmdb_30) diff --git a/docs/backends/numpy_10.md b/docs/backends/numpy_10.md index b98a45f2..2874cb2b 100644 --- a/docs/backends/numpy_10.md +++ b/docs/backends/numpy_10.md @@ -1,4 +1,4 @@ Local NP Memmap Backend ======================= -::: hangar.backends.numpy_10 +![mkapi](hangar.backends.numpy_10) diff --git a/docs/backends/remote_50.md b/docs/backends/remote_50.md index 2fb22095..7de4f5f1 100644 --- a/docs/backends/remote_50.md +++ b/docs/backends/remote_50.md @@ -1,4 +1,4 @@ 
Remote Server Unknown Backend ============================= -::: hangar.backends.remote_50 +![mkapi](hangar.backends.remote_50) diff --git a/docs/cli.md b/docs/cli.md index 617d036d..57f128d9 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -9,4 +9,4 @@ to be at the same level the repository was initially created in. Simply start by typing `$ hangar --help` in your terminal to get started! -::: hangar.cli +![mkapi](hangar.cli) diff --git a/docs/concepts.md b/docs/concepts.md index b0ff3890..19ba4851 100644 --- a/docs/concepts.md +++ b/docs/concepts.md @@ -35,9 +35,9 @@ The ability of version control systems to perform these tasks for codebases is largely taken for granted by almost every developer today; however, we are in-fact standing on the shoulders of giants, with decades of engineering which has resulted in these phenomenally useful -tools. Now that a new era of \"Data-Defined software\" is taking hold, +tools. Now that a new era of "Data-Defined software" is taking hold, we find there is a strong need for analogous version control systems -which are designed to handle numerical data at large scale\... Welcome +which are designed to handle numerical data at large scale... Welcome to Hangar! Inspiration @@ -66,7 +66,7 @@ There are, however, many fundamental differences in how humans/programs interpret and use text in source files vs. numerical data which raise many questions Hangar needs to uniquely solve: -- How do we connect some piece of \"Data\" with a meaning in the real +- How do we connect some piece of "Data" with a meaning in the real world? - How do we diff and merge large collections of data samples? - How can we resolve conflicts? @@ -85,9 +85,9 @@ How Hangar Thinks About Data ### Abstraction 0: What is a Repository? -A \"Repository\" consists of an historically ordered mapping of -\"Commits\" over time by various \"Committers\" across any number of -\"Branches\". Though there are many conceptual similarities in what a +A "Repository" consists of an historically ordered mapping of +"Commits" over time by various "Committers" across any number of +"Branches". Though there are many conceptual similarities in what a Git repo and a Hangar Repository achieve, Hangar is designed with the express purpose of dealing with numeric data. As such, when you read/write to/from a Repository, the main way of interaction with @@ -101,25 +101,25 @@ level. ### Abstraction 1: What is a Dataset? -Let\'s get philosophical and talk about what a \"Dataset\" is. The word -\"Dataset\" invokes some meaning to humans; a dataset may have a -canonical name (like \"MNIST\" or \"CoCo\"), it will have a source where +Let's get philosophical and talk about what a "Dataset" is. The word +"Dataset" invokes some meaning to humans; a dataset may have a +canonical name (like "MNIST" or "CoCo"), it will have a source where it comes from, (ideally) it has a purpose for some real-world task, it will have people who build, aggregate, and nurture it, and most importantly a Dataset always contains pieces of some type of information -type which describes \"something\". +type which describes "something". -It\'s an abstract definition, but it is only us, the humans behind the -machine, which associate \"Data\" with some meaning in the real world; +It's an abstract definition, but it is only us, the humans behind the +machine, which associate "Data" with some meaning in the real world; it is in the same vein which we associate a group of Data in a -\"Dataset\" with some real world meaning. 
+"Dataset" with some real world meaning. -Our first abstraction is therefore the \"Dataset\": a collection of +Our first abstraction is therefore the "Dataset": a collection of (potentially groups of) data pieces observing a common form among -instances which act to describe something meaningful. \*To describe some +instances which act to describe something meaningful. *To describe some phenomenon, a dataset may require multiple pieces of information, each of a particular format, for each instance/sample recorded in the -dataset.\* +dataset.* > **For Example** > @@ -141,10 +141,10 @@ together, when thinking about the aggregation of every scan in the group, most of (if not all) of the same information fields are duplicated within each samples. -\*A single scan is a bunch of disparate information stuck together, many +*A single scan is a bunch of disparate information stuck together, many of those put together makes a Dataset, but looking down from the top, we identify pattern of common fields across all items. We call these -groupings of similar typed information:\* **Columns**. +groupings of similar typed information:* **Columns**. ### Abstraction 2: What Makes up a Column? @@ -161,13 +161,13 @@ a `Column` in Hangar, we only need to provide: The individual pieces of information (`Data`) which fully describe some phenomenon via an aggregate mapping access across any number of -\"Columns\" are both individually and collectively referred to as +"Columns" are both individually and collectively referred to as `Samples` in the Hangar vernacular. According to the specification above, all samples contained in a `Column` must be numeric arrays with each having: 1) Same data type (standard `numpy` data types are supported). -2) A shape with each dimension size \<= the shape (`max shape`) set in +2) A shape with each dimension size <= the shape (`max shape`) set in the `column` specification (more on this later). Additionally, samples in a `column` can either be named, or unnamed @@ -176,23 +176,23 @@ Additionally, samples in a `column` can either be named, or unnamed Effective use of Hangar relies on having an understanding of what exactly a `"Sample"` is in a particular `Column`. The most effective way -to find out is to ask: \"What is the smallest piece of data which has a -useful meaning to \'me\' (or \'my\' downstream processes\"). In the +to find out is to ask: "What is the smallest piece of data which has a +useful meaning to 'me' (or 'my' downstream processes"). In the MNIST `column`, this would be a single digit image (a 28x28 array); for a medical `column` it might be an entire (512x320x320) MRI volume scan for a particular patient; while for the NASDAQ Stock Ticker it might be an hours worth of price data points (or less, or more!) The point is -that \*\*when you think about what a `sample` is, it should typically be -the smallest atomic unit of useful information.\*\* +that **when you think about what a `sample` is, it should typically be +the smallest atomic unit of useful information.** ### Abstraction 3: What is Data? -From this point forward, \*\*when we talk about \"Data\" we are actually +From this point forward, **when we talk about "Data" we are actually talking about n-dimensional arrays of numeric information. 
To Hangar, -\"Data\" is just a collection of numbers being passed into and out of -it.\*\* Data does not have a file type, it does not have a +"Data" is just a collection of numbers being passed into and out of +it.** Data does not have a file type, it does not have a file-extension, it does not mean anything to Hangar itself - it is just -numbers. This theory of \"Data\" is nearly as simple as it gets, and +numbers. This theory of "Data" is nearly as simple as it gets, and this simplicity is what enables us to be unconstrained as we build abstractions and utilities to operate on it. @@ -242,9 +242,9 @@ store or track the data set, just the underlying columns. using cryptographically secure algorithms. - Hangar is very much a young project, until penetration tests and security reviews are performed, we will refrain from stating that - Hangar is fully \"cryptographically secure\". Security experts are - welcome to contact us privately at [hangar.info\@tensorwerk.com - \]{.title-ref}\_\_ to disclose any + Hangar is fully "cryptographically secure". Security experts are + welcome to contact us privately at [hangar.info@tensorwerk.com + ]{.title-ref}__ to disclose any security issues. @@ -254,7 +254,7 @@ Implications of the Hangar Data Philosophy ### The Domain-Specific File Format Problem Though it may seem counterintuitive at first, there is an incredible -amount of freedom (and power) that is gained when \"you\" (the user) +amount of freedom (and power) that is gained when "you" (the user) start to decouple some information container from the data which it actually holds. At the end of the day, the algorithms and systems you use to produce insight from data are just mathematical operations; math @@ -274,12 +274,12 @@ read/write, these schemes require significant amounts of human capital (developer time) to be spent on building, testing, and upkeep/maintenance; all while adding significant complexity for users. Oh, and they also have a strangely high inclination to degenerate into -horrible complexity which essentially becomes \"magic\" after the +horrible complexity which essentially becomes "magic" after the original creators move on. -The Hangar system is quite different in this regards. First, \*\*we +The Hangar system is quite different in this regards. First, **we trust that you know what your data is and what it should be best -represented as\*\*. When writing to a Hangar repository, you process the +represented as**. When writing to a Hangar repository, you process the data into n-dimensional arrays once. Then when you retrieve it you are provided with the same array, in the same shape and datatype (unless you ask for a particular subarray-slice), already initialized in memory and @@ -288,7 +288,7 @@ ready to compute on instantly. #### High Performance From Simplicity Because Hangar is designed to deal (almost exclusively) with numerical -arrays, we are able to \"stand on the shoulders of giants\" once again +arrays, we are able to "stand on the shoulders of giants" once again by utilizing many of the well validated, highly optimized, and community validated numerical array data management utilities developed by the High Performance Computing community over the past few decades. @@ -315,8 +315,8 @@ interface which affects performance and compression of data samples. The choice of backend to store a piece of data is selected automatically from heuristics based on the column specification, system details, and -context of the storage service internal to Hangar. 
\*\*As a user, this -is completely transparent to you\*\* in all steps of interacting with +context of the storage service internal to Hangar. **As a user, this +is completely transparent to you** in all steps of interacting with the repository. It does not require (or even accept) user specified configuration. @@ -334,63 +334,63 @@ At the time of writing, Hangar has the following backends implemented A common side effect of the [The Domain-Specific File Format Problem](#the-domain-specific-file-format-problem) is that anyone who -wants to work with an organization\'s/project\'s data needs to not only +wants to work with an organization's/project's data needs to not only have some domain expertise (so they can do useful things with the data), but they also need to have a non-trivial understanding of the projects dataset, file format, and access conventions / transformation pipelines. -\*In a world where highly specialized talent is already scarce, this -phenomenon shrinks the pool of available collaborators dramatically.\* +*In a world where highly specialized talent is already scarce, this +phenomenon shrinks the pool of available collaborators dramatically.* -Given this situation, it\'s understandable why when most organizations +Given this situation, it's understandable why when most organizations spend massive amounts of money and time to build a team, collect & annotate data, and build an infrastructure around that information, they hold it for their private use with little regards for how the world could use it together. Businesses rely on proprietary information to stay ahead of their competitors, and because this information is so -difficult (and expensive) to generate, it\'s completely reasonable that +difficult (and expensive) to generate, it's completely reasonable that they should be the ones to benefit from all that work. > **A Thought Experiment** -> -> Imagine that `Git` and `GitHub` didn\'t take over the world. Imagine +> +> Imagine that `Git` and `GitHub` didn't take over the world. Imagine > that the `Diff` and `Patch` Unix tools never existed. Instead, imagine > we were to live in a world where every software project had very > different version control systems (largely homeade by non VCS experts, > & not validated by a community over many years of use). Even worse, -> most of these tools don\'t allow users to easily branch, make changes, -> and automatically merge them back. It shouldn\'t be difficult to +> most of these tools don't allow users to easily branch, make changes, +> and automatically merge them back. It shouldn't be difficult to > imagine how dramatically such a world would contrast to ours today. > Open source software as we know it would hardly exist, and any efforts > would probably be massively fragmented across the web (if there would -> even be a \'web\' that we would recognize in this strange world). +> even be a 'web' that we would recognize in this strange world). > > Without a way to collaborate in the open, open source software would > largely not exist, and we would all be worse off for it. > -> Doesn\'t this hypothetical sound quite a bit like the state of open +> Doesn't this hypothetical sound quite a bit like the state of open > source data collaboration in todays world? 
The impetus for developing a tool like Hangar is the belief that if it is simple for anyone with domain knowledge to collaboratively curate -columns containing information they care about, then they will.\* Open +columns containing information they care about, then they will.* Open source software development benefits everyone, we believe open source column curation can do the same. -#### How To Overcome The \"Size\" Problem +#### How To Overcome The "Size" Problem Even if the greatest tool imaginable existed to version, branch, and -merge columns, it would face one massive problem which if it didn\'t -solve would kill the project: \*The size of data can very easily exceeds -what can fit on (most) contributors laptops or personal workstations\*. +merge columns, it would face one massive problem which if it didn't +solve would kill the project: *The size of data can very easily exceeds +what can fit on (most) contributors laptops or personal workstations*. This section explains how Hangar can handle working with columns which are prohibitively large to download or store on a single machine. As mentioned in [High Performance From Simplicity](#high-performance-from-simplicity), under the hood Hangar -deals with \"Data\" and \"Bookkeeping\" completely separately. We\'ve +deals with "Data" and "Bookkeeping" completely separately. We've previously covered what exactly we mean by Data in [How Hangar Thinks -About Data](#how-hangar-thinks-about-data), so we\'ll briefly cover the -second major component of Hangar here. In short \"Bookkeeping\" +About Data](#how-hangar-thinks-about-data), so we'll briefly cover the +second major component of Hangar here. In short "Bookkeeping" describes everything about the repository. By everything, we do mean that the Bookkeeping records describe everything: all commits, parents, branches, columns, samples, data descriptors, schemas, commit message, @@ -398,69 +398,72 @@ etc. Though complete, these records are fairly small (tens of MB in size for decently sized repositories with decent history), and are highly compressed for fast transfer between a Hangar client/server. -> **A brief technical interlude** -> -> There is one very important (and rather complex) property which gives -> Hangar Bookeeping massive power: \*\*Existence of some data piece is -> always known to Hangar and stored immutably once committed. However, -> the access pattern, backend, and locating information for this data -> piece may (and over time, will) be unique in every hangar repository -> instance\*\*. -> -> Though the details of how this works is well beyond the scope of this -> document, the following example may provide some insight into the -> implications of this property: -> -> > If you `clone` some hangar repository, Bookeeping says that \"some -> > number of data pieces exist\" and they should retrieved from the -> > server. However, the bookeeping records transfered in a `fetch` / -> > `push` / `clone` operation do not include information about where -> > that piece of data existed on the client (or server) computer. Two -> > synced repositories can use completely different backends to store -> > the data, in completly different locations, and it does not matter - -> > Hangar only guarantees that when collaborators ask for a data sample -> > in some checkout, that they will be provided with identical arrays, -> > not that they will come from the same place or be stored in the same -> > way. 
Only when data is actually retrieved the \"locating -> > information\" is set for that repository instance. + +!!! info + + **A brief technical interlude** + + There is one very important (and rather complex) property which gives + Hangar Bookeeping massive power: **Existence of some data piece is + always known to Hangar and stored immutably once committed. However, + the access pattern, backend, and locating information for this data + piece may (and over time, will) be unique in every hangar repository + instance**. + + Though the details of how this works is well beyond the scope of this + document, the following example may provide some insight into the + implications of this property: + + If you `clone` some hangar repository, Bookeeping says that "some + number of data pieces exist" and they should retrieved from the + server. However, the bookeeping records transfered in a `fetch` / + `push` / `clone` operation do not include information about where + that piece of data existed on the client (or server) computer. Two + synced repositories can use completely different backends to store + the data, in completly different locations, and it does not matter - + Hangar only guarantees that when collaborators ask for a data sample + in some checkout, that they will be provided with identical arrays, + not that they will come from the same place or be stored in the same + way. Only when data is actually retrieved the "locating + information" is set for that repository instance. Because Hangar makes no assumptions about how/where it should retrieve some piece of data, or even an assumption that it exists on the local machine, and because records are small and completely describe history, once a machine has the Bookkeeping, it can decide what data it actually -wants to materialize on it\'s local disk! These `partial fetch` / +wants to materialize on it's local disk! These `partial fetch` / `partial clone` operations can materialize any desired data, whether it be for a few records at the head branch, for all data in a commit, or for the entire historical data. A future release will even include the ability to stream data directly to a Hangar checkout and materialize the data in memory without having to save it to disk at all! -More importantly: \*\*Since Bookkeeping describes all history, merging +More importantly: **Since Bookkeeping describes all history, merging can be performed between branches which may contain partial (or even no) -actual data.\*\* Aka **you don\'t need data on disk to merge changes -into it.** It\'s an odd concept which will be explained more in depth in +actual data.** Aka **you don't need data on disk to merge changes +into it.** It's an odd concept which will be explained more in depth in the future. -..note : +!!! note To try this out for yourself, please refer to the the API Docs (:ref:`ref-api`) on working with Remotes, especially the ``fetch()`` and ``fetch-data()`` methods. Otherwise look for through our tutorials & examples for more practical info! -#### What Does it Mean to \"Merge\" Data? +#### What Does it Mean to "Merge" Data? -We\'ll start this section, once again, with a comparison to source code +We'll start this section, once again, with a comparison to source code version control systems. When dealing with source code text, merging is performed in order to take a set of changes made to a document, and logically insert the changes into some other version of the document. 
The goal is to generate a new version of the document with all changes -made to it in a fashion which conforms to the \"change author\'s\" +made to it in a fashion which conforms to the "change author's" intentions. Simply put: the new version is valid and what is expected by the authors. This concept of what it means to merge text does not generally map well -to changes made in a column we\'ll explore why through this section, but +to changes made in a column we'll explore why through this section, but look back to the philosophy of Data outlined in [How Hangar Thinks About Data](#how-hangar-thinks-about-data) for inspiration as we begin. Remember, in the Hangar design a Sample is the smallest array which @@ -477,28 +480,22 @@ actors operations which can occur. Addition -: - -> An operation which creates a column, sample, or some metadata which -> did not previously exist in the relevant branch history. + An operation which creates a column, sample, or some metadata which + did not previously exist in the relevant branch history. Removal -: - -> An operation which removes some column, a sample, or some metadata -> which existed in the parent of the commit under consideration. (Note: -> removing a column also removes all samples contained in it). + An operation which removes some column, a sample, or some metadata + which existed in the parent of the commit under consideration. (Note: + removing a column also removes all samples contained in it). Mutation -: - -> An operation which sets: data to a sample, the value of some metadata -> key, or a column schema, to a different value than what it had -> previously been created with (Note: a column schema mutation is -> observed when a column is removed, and a new column with the same name -> is created with a different dtype/shape, all in the same commit). + An operation which sets: data to a sample, the value of some metadata + key, or a column schema, to a different value than what it had + previously been created with (Note: a column schema mutation is + observed when a column is removed, and a new column with the same name + is created with a different dtype/shape, all in the same commit). ##### Merging Changes @@ -514,21 +511,24 @@ as well. If these changes are identical, they are compatible, but what if they are not? In the following example, we diff and merge each element of the sample array like we would text: - Merge ?? - -> commit A commit B Does combining mean anything? - -> \[\[0, 1, 2\], \[\[0, 1, 2\], \[\[1, 1, 1\], -> -> : \[0, 1, 2\], \-\-\-\--\> \[2, 2, 2\], \-\-\-\-\-\-\-\-\-\-\--\> -> \[2, 2, 2\], \[0, 1, 2\]\] \[3, 3, 3\]\] / \[3, 3, 3\]\] / commit -> C / / \[\[1, 1, 1\], / \-\-\-\-\-\--\> \[0, 1, 2\], \[0, 1, 2\]\] -> + Merge ?? + commit A commit B Does combining mean anything? + + [[0, 1, 2], [[0, 1, 2], [[1, 1, 1], + [0, 1, 2], -----> [2, 2, 2], ------------> [2, 2, 2], + [0, 1, 2]] [3, 3, 3]] / [3, 3, 3]] + \ / + \ commit C / + \ / + \ [[1, 1, 1], / + -------> [0, 1, 2], + [0, 1, 2]] + We see that a result can be generated, and can agree if this was a piece -of text, the result would be correct. Don\'t be fooled, this is an +of text, the result would be correct. Don't be fooled, this is an abomination and utterly wrong/meaningless. Remember we said earlier `"the result of a merge should conform to the intentions of each author"`. -This merge result conforms to neither author\'s intention. The value of +This merge result conforms to neither author's intention. 
The value of an array element is not isolated, every value affects how the entire sample is understood. The values at Commit B or commit C may be fine on their own, but if two samples are mutated independently with @@ -537,14 +537,18 @@ authors. This is the actual behavior of Hangar. - commit A commit B - -> \[\[0, 1, 2\], \[\[0, 1, 2\], -> -> : \[0, 1, 2\], \-\-\-\--\> \[2, 2, 2\], \-\-\-\-- MERGE CONFLICT -> \[0, 1, 2\]\] \[3, 3, 3\]\] / / commit C / / \[\[1, 1, 1\], / -> \-\-\-\-\-\--\> \[0, 1, 2\], \[0, 1, 2\]\] -> + commit A commit B + + [[0, 1, 2], [[0, 1, 2], + [0, 1, 2], -----> [2, 2, 2], ----- MERGE CONFLICT + [0, 1, 2]] [3, 3, 3]] / + \ / + \ commit C / + \ / + \ [[1, 1, 1], / + -------> [0, 1, 2], + [0, 1, 2]] + When a conflict is detected, the merge author must either pick a sample from one of the commits or make changes in one of the branches such that the conflicting sample values are resolved. @@ -572,7 +576,7 @@ general these fall into 4 categories: the schema specification is checked for compatibility in a method custom to the internal workings of Hangar. -What\'s Next? +What's Next? ------------- - Get started using Hangar today: `ref_installation`{.interpreted-text diff --git a/docs/externals.md b/docs/externals.md index 8ce4f86a..4c520c34 100644 --- a/docs/externals.md +++ b/docs/externals.md @@ -6,9 +6,9 @@ High level interaction interface between hangar and everything external. High Level Methods ------------------ -::: hangar.external._external +![mkapi](hangar.external._external) Plugin System ------------- -::: hangar.external.base_plugin +![mkapi](hangar.external.base_plugin) diff --git a/mkdocs.yml b/mkdocs.yml index dce17132..60b3d162 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -18,19 +18,11 @@ theme: plugins: - search - - mkdocstrings: - default_handler: python - handlers: - python: - docstring_style: Numpy # this is the default - rendering: - show_source: false - watch: - - src/hangar - markdownextradata: data: data - mkdocs-jupyter: execute: False + - mkapi markdown_extensions: - markdown.extensions.codehilite: diff --git a/src/hangar/backends/hdf5_00.py b/src/hangar/backends/hdf5_00.py index 2f7e49b1..55d79bd1 100644 --- a/src/hangar/backends/hdf5_00.py +++ b/src/hangar/backends/hdf5_00.py @@ -80,6 +80,46 @@ "most compression" +Technical Details +----------------- +- Files are read only after initial creation/writes. Only a write-enabled + checkout can open a HDF5 file in ``"w"`` or ``"a"`` mode, and writer + checkouts create new files on every checkout, and make no attempt to fill in + unset locations in previous files. This is not an issue as no disk space is + used until data is written to the initially created "zero-initialized" + collection datasets + +- On write: Single Writer Multiple Reader (``SWMR``) mode is set to ensure that + improper closing (not calling ``.close()``) method does not corrupt any data + which had been previously flushed to the file. + +- On read: SWMR is set to allow multiple readers (in different threads / + processes) to read from the same file. File handle serialization is handled + via custom python ``pickle`` serialization/reduction logic which is + implemented by the high level ``pickle`` reduction ``__set_state__()``, + ``__get_state__()`` class methods. + +- An optimization is performed in order to increase the read / write + performance of variable shaped datasets. 
Due to the way that we initialize + an entire HDF5 file with all datasets pre-created (to the size of the max + subarray shape), we need to ensure that storing smaller sized arrays (in a + variable sized Hangar Column) would be effective. Because we use chunked + storage, certain dimensions which are incomplete could have potentially + required writes to chunks which do are primarily empty (worst case "C" index + ordering), increasing read / write speeds significantly. + + To overcome this, we create HDF5 datasets which have ``COLLECTION_SIZE`` + first dimension size, and only ONE second dimension of size + ``schema_shape.size()`` (ie. product of all dimensions). For example an + array schema with shape (10, 10, 3) would be stored in a HDF5 dataset of + shape (COLLECTION_SIZE, 300). Chunk sizes are chosen to align on the first + dimension with a second dimension of size which fits the total data into L2 + CPU Cache (< 256 KB). On write, we use the ``np.ravel`` function to + construct a "view" (not copy) of the array as a 1D array, and then on read + we reshape the array to the recorded size (a copyless "view-only" + operation). This is part of the reason that we only accept C ordered arrays + as input to Hangar. + Record Format ============= @@ -93,10 +133,8 @@ * Dataset Index (``0:COLLECTION_SIZE`` dataset subarray selection) * Subarray Shape - Examples -------- - 1) Adding the first piece of data to a file: * Array shape (Subarray Shape): (10, 10) @@ -107,7 +145,7 @@ ``Record Data => "00:rlUK3C:8067007c0f05c359:16:105:10 10"`` -1) Adding to a piece of data to a the middle of a file: +2) Adding to a piece of data to a the middle of a file: * Array shape (Subarray Shape): (20, 2, 3) * File UID: "rlUK3C" @@ -117,47 +155,6 @@ ``Record Data => "00:rlUK3C:b89f873d3d153a9c:8:199:20 2 3"`` - -Technical Notes -=============== - -* Files are read only after initial creation/writes. Only a write-enabled - checkout can open a HDF5 file in ``"w"`` or ``"a"`` mode, and writer - checkouts create new files on every checkout, and make no attempt to fill in - unset locations in previous files. This is not an issue as no disk space is - used until data is written to the initially created "zero-initialized" - collection datasets - -* On write: Single Writer Multiple Reader (``SWMR``) mode is set to ensure that - improper closing (not calling ``.close()``) method does not corrupt any data - which had been previously flushed to the file. - -* On read: SWMR is set to allow multiple readers (in different threads / - processes) to read from the same file. File handle serialization is handled - via custom python ``pickle`` serialization/reduction logic which is - implemented by the high level ``pickle`` reduction ``__set_state__()``, - ``__get_state__()`` class methods. - -* An optimization is performed in order to increase the read / write - performance of variable shaped datasets. Due to the way that we initialize - an entire HDF5 file with all datasets pre-created (to the size of the max - subarray shape), we need to ensure that storing smaller sized arrays (in a - variable sized Hangar Column) would be effective. Because we use chunked - storage, certain dimensions which are incomplete could have potentially - required writes to chunks which do are primarily empty (worst case "C" index - ordering), increasing read / write speeds significantly. 
- - To overcome this, we create HDF5 datasets which have ``COLLECTION_SIZE`` - first dimension size, and only ONE second dimension of size - ``schema_shape.size()`` (ie. product of all dimensions). For example an - array schema with shape (10, 10, 3) would be stored in a HDF5 dataset of - shape (COLLECTION_SIZE, 300). Chunk sizes are chosen to align on the first - dimension with a second dimension of size which fits the total data into L2 - CPU Cache (< 256 KB). On write, we use the ``np.ravel`` function to - construct a "view" (not copy) of the array as a 1D array, and then on read - we reshape the array to the recorded size (a copyless "view-only" - operation). This is part of the reason that we only accept C ordered arrays - as input to Hangar. """ import logging import os diff --git a/src/hangar/bulk_importer.py b/src/hangar/bulk_importer.py index 4de8df72..16e14151 100644 --- a/src/hangar/bulk_importer.py +++ b/src/hangar/bulk_importer.py @@ -116,11 +116,11 @@ class UDF_Return(NamedTuple): Attributes ---------- - column: str + column column name to place data into - key: Union[KeyType, Tuple[KeyType, KeyType]] + key key to place flat sample into, or 2-tuple of keys for nested samples - data: Union[np.ndarray, str, bytes] + data piece of data to place in the column with the provided key. """ column: str @@ -169,146 +169,144 @@ def run_bulk_import( A list of input arguments to the UDF must be provided, this is formatted as a sequence (list / tuple) of keyword-arg dictionaries, each of which must be valid when unpacked and bound to the UDF signature. Additionally, all columns - must be specified up front. If any columns are named a :class:`UDF_Return` + must be specified up front. If any columns are named a `UDF_Return` which were not pre-specified, the entire operation will fail. - Notes - ----- + !!! note + + - This is an all-or-nothing operation, either all data is successfully + read, validated, and written to the storage backends, or none of it + is. A single maleformed key or data type/shape will cause the entire + import operation to abort. + + - The input kwargs should be fairly small (of no consequence to load + into memory), data out should be large. The results of the UDF + will only be stored in memory for a very short period (just the time + it takes to be validated against the column schema and compressed / + flushed to disk). - * This is an all-or-nothing operation, either all data is successfully - read, validated, and written to the storage backends, or none of it - is. A single maleformed key or data type/shape will cause the entire - import operation to abort. - - * The input kwargs should be fairly small (of no consequence to load - into memory), data out should be large. The results of the UDF - will only be stored in memory for a very short period (just the time - it takes to be validated against the column schema and compressed / - flushed to disk). - - * Every step of the process is executed as a generator, lazily loading - data the entire way. If possible, we recomend writing the UDF such that - data is not allocated in memory before it is ready to be yielded. - - * If it is possible, the task recipe will be pruned and optimized in such - a way that iteration over the UDF will be short circuted during the - second pass (writing data to the backend). As this can greatly reduce - processing time, we recomend trying to yield data pieces which are likely - to be unique first from the UDF. - - .. 
warning:: - - * Please be aware that these methods should not be executed within a - Jupyter Notebook / Jupyter Lab when running the bulk importer at scale. - The internal implemenation makes significant use of multiprocess Queues - for work distribution and recording. The heavy loads placed on the system - have been observed to place strain on Jupyters ZeroMQ implementation, - resulting in random failures which may or may not even display a traceback - to indicate failure mode. - - A small sample set of data can be used within jupyter to test an - implementation without problems, but for full scale operations it is best - run in a script with the operations protected by a ``__main__`` block. + - Every step of the process is executed as a generator, lazily loading + data the entire way. If possible, we recomend writing the UDF such that + data is not allocated in memory before it is ready to be yielded. + + - If it is possible, the task recipe will be pruned and optimized in such + a way that iteration over the UDF will be short circuted during the + second pass (writing data to the backend). As this can greatly reduce + processing time, we recomend trying to yield data pieces which are likely + to be unique first from the UDF. + + !!! warning + + Please be aware that these methods should not be executed within a + Jupyter Notebook / Jupyter Lab when running the bulk importer at scale. + The internal implemenation makes significant use of multiprocess Queues + for work distribution and recording. The heavy loads placed on the system + have been observed to place strain on Jupyters ZeroMQ implementation, + resulting in random failures which may or may not even display a traceback + to indicate failure mode. + + A small sample set of data can be used within jupyter to test an + implementation without problems, but for full scale operations it is best + run in a script with the operations protected by a ``__main__`` block. Examples -------- - - >>> import os - >>> import numpy as np - >>> from PIL import Image - >>> from hangar.bulk_importer import UDF_Return, run_bulk_import - >>> def image_loader(file_path): - ... im = Image.open(file_name) - ... arr = np.array(im.resize(512, 512)) - ... im_record = UDF_Return(column='image', key=(category, sample), data=arr) - ... yield im_record - ... - ... root, sample_file = os.path.split(file_path) - ... category = os.path.dirname(root) - ... sample_name, _ = os.path.splitext(sample_file) - ... path_record = UDF_Return(column='file_str', key=(category, sample_name), data=file_path) - ... yield path_record - ... - >>> udf_kwargs = [ - ... {'file_path': '/foo/cat/image_001.jpeg'}, - ... {'file_path': '/foo/cat/image_002.jpeg'}, - ... {'file_path': '/foo/dog/image_001.jpeg'}, - ... {'file_path': '/foo/bird/image_011.jpeg'}, - ... {'file_path': '/foo/bird/image_003.jpeg'} - ... ] - >>> repo = Repository('foo/path/to/repo') - >>> run_bulk_import( - ... repo, branch_name='master', column_names=['file_str', 'image'], - ... udf=image_loader, udf_kwargs=udf_kwargs) + >>> import os + >>> import numpy as np + >>> from PIL import Image + >>> from hangar.bulk_importer import UDF_Return, run_bulk_import + >>> def image_loader(file_path): + ... im = Image.open(file_name) + ... arr = np.array(im.resize(512, 512)) + ... im_record = UDF_Return(column='image', key=(category, sample), data=arr) + ... yield im_record + ... + ... root, sample_file = os.path.split(file_path) + ... category = os.path.dirname(root) + ... sample_name, _ = os.path.splitext(sample_file) + ... 
path_record = UDF_Return(column='file_str', key=(category, sample_name), data=file_path) + ... yield path_record + ... + >>> udf_kwargs = [ + ... {'file_path': '/foo/cat/image_001.jpeg'}, + ... {'file_path': '/foo/cat/image_002.jpeg'}, + ... {'file_path': '/foo/dog/image_001.jpeg'}, + ... {'file_path': '/foo/bird/image_011.jpeg'}, + ... {'file_path': '/foo/bird/image_003.jpeg'} + ... ] + >>> repo = Repository('foo/path/to/repo') + >>> run_bulk_import( + ... repo, branch_name='master', column_names=['file_str', 'image'], + ... udf=image_loader, udf_kwargs=udf_kwargs) However, the following will not work, since the output is non-deterministic. - >>> from hangar.bulk_importer import UDF_Return, run_bulk_import - >>> def nondeterminstic(x, y): - ... first = str(x * y) - ... yield UDF_Return(column='valstr', key=f'{x}_{y}', data=first) - ... - ... second = str(x * y * random()) - ... yield UDF_Return(column='valstr', key=f'{x}_{y}', data=second) - ... - >>> udf_kwargs = [ - ... {'x': 1, 'y': 2}, - ... {'x': 1, 'y': 3}, - ... {'x': 2, 'y': 4}, - ... ] - >>> run_bulk_import( - ... repo, branch_name='master', column_names=['valstr'], - ... udf=image_loader, udf_kwargs=udf_kwargs) - Traceback (most recent call last): - `File "", line 1, in ` - TypeError: contents returned in subbsequent calls to UDF with identical - kwargs yielded different results. UDFs MUST generate deterministic - results for the given inputs. Input kwargs generating this result: - {'x': 1, 'y': 2}. + >>> from hangar.bulk_importer import UDF_Return, run_bulk_import + >>> def nondeterminstic(x, y): + ... first = str(x * y) + ... yield UDF_Return(column='valstr', key=f'{x}_{y}', data=first) + ... + ... second = str(x * y * random()) + ... yield UDF_Return(column='valstr', key=f'{x}_{y}', data=second) + ... + >>> udf_kwargs = [ + ... {'x': 1, 'y': 2}, + ... {'x': 1, 'y': 3}, + ... {'x': 2, 'y': 4}, + ... ] + >>> run_bulk_import( + ... repo, branch_name='master', column_names=['valstr'], + ... udf=image_loader, udf_kwargs=udf_kwargs) + Traceback (most recent call last): + `File "", line 1, in ` + TypeError: contents returned in subbsequent calls to UDF with identical + kwargs yielded different results. UDFs MUST generate deterministic + results for the given inputs. Input kwargs generating this result: + {'x': 1, 'y': 2}. Not all columns must be returned from every input to the UDF, the number of data pieces yielded can also vary arbitrarily (so long as the results are deterministic for a particular set of inputs) - >>> import numpy as np - >>> from hangar.bulk_importer import UDF_Return, run_bulk_import - >>> def maybe_load(x_arr, y_arr, sample_name, columns=['default']): - ... for column in columns: - ... arr = np.multiply(x_arr, y_arr) - ... yield UDF_Return(column=column, key=sample_name, data=arr) - ... # - ... # do some strange processing which only outputs another column sometimes - ... if len(columns) == 1: - ... other = np.array(x_arr.shape) * np.array(y_arr.shape) - ... yield UDF_Return(column='strange_column', key=sample_name, data=other) - ... - >>> udf_kwargs = [ - ... {'x_arr': np.arange(10), 'y_arr': np.arange(10) + 1, 'sample_name': 'sample_1'}, - ... {'x_arr': np.arange(10), 'y_arr': np.arange(10) + 1, 'sample_name': 'sample_2', 'columns': ['foo', 'bar', 'default']}, - ... {'x_arr': np.arange(10) * 2, 'y_arr': np.arange(10), 'sample_name': 'sample_3'}, - ... ] - >>> run_bulk_import( - ... repo, branch_name='master', - ... column_names=['default', 'foo', 'bar', 'strange_column'], - ... 
udf=maybe_load, udf_kwargs=udf_kwargs) + >>> import numpy as np + >>> from hangar.bulk_importer import UDF_Return, run_bulk_import + >>> def maybe_load(x_arr, y_arr, sample_name, columns=['default']): + ... for column in columns: + ... arr = np.multiply(x_arr, y_arr) + ... yield UDF_Return(column=column, key=sample_name, data=arr) + ... # + ... # do some strange processing which only outputs another column sometimes + ... if len(columns) == 1: + ... other = np.array(x_arr.shape) * np.array(y_arr.shape) + ... yield UDF_Return(column='strange_column', key=sample_name, data=other) + ... + >>> udf_kwargs = [ + ... {'x_arr': np.arange(10), 'y_arr': np.arange(10) + 1, 'sample_name': 'sample_1'}, + ... {'x_arr': np.arange(10), 'y_arr': np.arange(10) + 1, 'sample_name': 'sample_2', 'columns': ['foo', 'bar', 'default']}, + ... {'x_arr': np.arange(10) * 2, 'y_arr': np.arange(10), 'sample_name': 'sample_3'}, + ... ] + >>> run_bulk_import( + ... repo, branch_name='master', + ... column_names=['default', 'foo', 'bar', 'strange_column'], + ... udf=maybe_load, udf_kwargs=udf_kwargs) Parameters ---------- - repo : 'Repository' + repo Initialized repository object to import data into. - branch_name : str + branch_name Name of the branch to checkout and import data into. - column_names : List[str] + column_names Names of all columns which data should be saved to. - udf : UDF_T + udf User-Defined Function (generator style; yielding an arbitrary number of values when iterated on) which is passed an unpacked kwarg dict as input and yields a single :class:`~.UDF_Return` instance at a time when iterated over. Cannot contain - udf_kwargs : List[dict] + udf_kwargs A sequence of keyword argument dictionaries which are individually unpacked as inputs into the user-defined function (UDF). the keyword argument dictionaries - ncpus : int, optional, default=0 + ncpus Number of Parallel processes to read data files & write to hangar backend stores in. If <= 0, then the default is set to ``num_cpus / 2``. The value of this parameter should never exceed the total CPU count of the system. Import time @@ -316,7 +314,7 @@ def run_bulk_import( memory usage of the ``UDF`` function and backend storage writer processes against the total system memory. generally increase linearly up to - autocommit : bool, optional, default=True + autocommit Control whether a commit should be made after successfully importing the specified data to the staging area of the branch. """ diff --git a/src/hangar/checkout.py b/src/hangar/checkout.py index 3bdaf3f8..b40e7deb 100644 --- a/src/hangar/checkout.py +++ b/src/hangar/checkout.py @@ -88,17 +88,17 @@ def __init__(self, Parameters ---------- - base_path : Path + base_path directory path to the Hangar repository on disk - dataenv : lmdb.Environment + dataenv db where the checkout record data is unpacked and stored. - hashenv : lmdb.Environment + hashenv db where the hash records are stored. - branchenv : lmdb.Environment + branchenv db where the branch records are stored. - refenv : lmdb.Environment + refenv db where the commit references are stored. - commit : str + commit specific commit hash to checkout """ self._commit_hash = commit @@ -265,18 +265,18 @@ def log(self, Parameters ---------- - branch : str, optional + branch The name of the branch to start the log process from. (Default value = None) - commit : str, optional + commit The commit hash to start the log process from. 
(Default value = None) - return_contents : bool, optional, kwarg only + return_contents If true, return the commit graph specifications in a dictionary suitable for programatic access/evaluation. - show_time : bool, optional, kwarg only + show_time If true and return_contents is False, show the time of each commit on the printed log graph - show_user : bool, optional, kwarg only + show_user If true and return_contents is False, show the committer of each commit on the printed log graph Returns @@ -369,22 +369,22 @@ def __init__(self, Parameters ---------- - repo_pth : Path + repo_pth local file path of the repository. - branch_name : str + branch_name name of the branch whose ``HEAD`` commit will for the starting state of the staging area. - hashenv lmdb.Environment + hashen db where the hash records are stored. - refenv : lmdb.Environment + refenv db where the commit record data is unpacked and stored. - stageenv : lmdb.Environment + stageenv db where the stage record data is unpacked and stored. - branchenv : lmdb.Environment + branchenv db where the head record data is unpacked and stored. - stagehashenv: lmdb.Environment + stagehashenv db where the staged hash record data is stored. - mode : str, optional + mode open in write or read only mode, default is 'a' which is write-enabled. """ self._enter_count = 0 @@ -641,18 +641,18 @@ def log(self, Parameters ---------- - branch : str, optional + branch The name of the branch to start the log process from. (Default value = None) - commit : str, optional + commit The commit hash to start the log process from. (Default value = None) - return_contents : bool, optional, kwarg only + return_contents If true, return the commit graph specifications in a dictionary suitable for programatic access/evaluation. - show_time : bool, optional, kwarg only + show_time If true and return_contents is False, show the time of each commit on the printed log graph - show_user : bool, optional, kwarg only + show_user If true and return_contents is False, show the committer of each commit on the printed log graph Returns @@ -699,19 +699,19 @@ def add_str_column(self, Parameters ---------- - name : str + name Name assigned to the column - contains_subsamples : bool, optional + contains_subsamples True if the column column should store data in a nested structure. In this scheme, a sample key is used to index an arbitrary number of subsamples which map some (sub)key to a piece of data. If False, sample keys map directly to a single piece of data; essentially acting as a single level key/value store. By default, False. - backend : Optional[str], optional + backend ADVANCED USERS ONLY, backend format code to use for column data. If None, automatically inferred and set based on data shape and type. by default None - backend_options : Optional[dict], optional + backend_options ADVANCED USERS ONLY, filter opts to apply to column data. If None, automatically inferred and set based on data shape and type. by default None @@ -780,19 +780,19 @@ def add_bytes_column(self, Parameters ---------- - name : str + name Name assigned to the column - contains_subsamples : bool, optional + contains_subsamples True if the column column should store data in a nested structure. In this scheme, a sample key is used to index an arbitrary number of subsamples which map some (sub)key to a piece of data. If False, sample keys map directly to a single piece of data; essentially acting as a single level key/value store. By default, False. 
- backend : Optional[str], optional + backend ADVANCED USERS ONLY, backend format code to use for column data. If None, automatically inferred and set based on data shape and type. by default None - backend_options : Optional[dict], optional + backend_options ADVANCED USERS ONLY, filter opts to apply to column data. If None, automatically inferred and set based on data shape and type. by default None @@ -867,36 +867,36 @@ def add_ndarray_column(self, Parameters ---------- - name : str + name The name assigned to this column. - shape : Optional[Union[int, Tuple[int]]] + shape The shape of the data samples which will be written in this column. This argument and the `dtype` argument are required if a `prototype` is not provided, defaults to None. - dtype : Optional[:class:`numpy.dtype`] + dtype The datatype of this column. This argument and the `shape` argument are required if a `prototype` is not provided., defaults to None. - prototype : Optional[:class:`numpy.ndarray`] + prototype A sample array of correct datatype and shape which will be used to initialize the column storage mechanisms. If this is provided, the `shape` and `dtype` arguments must not be set, defaults to None. - variable_shape : bool, optional + variable_shape If this is a variable sized column. If true, a the maximum shape is set from the provided ``shape`` or ``prototype`` argument. Any sample added to the column can then have dimension sizes <= to this initial specification (so long as they have the same rank as what was specified) defaults to False. - contains_subsamples : bool, optional + contains_subsamples True if the column column should store data in a nested structure. In this scheme, a sample key is used to index an arbitrary number of subsamples which map some (sub)key to some piece of data. If False, sample keys map directly to a single piece of data; essentially acting as a single level key/value store. By default, False. - backend : Optional[str], optional + backend ADVANCED USERS ONLY, backend format code to use for column data. If None, automatically inferred and set based on data shape and type. by default None - backend_options : Optional[dict], optional + backend_options ADVANCED USERS ONLY, filter opts to apply to column data. If None, automatically inferred and set based on data shape and type. by default None @@ -964,12 +964,12 @@ def _initialize_new_column(self, Parameters ---------- - column_name: str + column_name name of the column - column_layout: str + column_layout One of ['flat', 'nested'] indicating column layout class to use during generation. - schema: ColumnBase + schema schema class instance providing column data spec, schema/column digest, data validator / hashing methods, and backend ID / options; all of which are needed to successfully create & save the column instance @@ -1014,9 +1014,9 @@ def merge(self, message: str, dev_branch: str) -> str: Parameters ---------- - message : str + message commit message to attach to a three-way merge - dev_branch : str + dev_branch name of the branch which should be merge into this branch (ie `master`) @@ -1060,7 +1060,7 @@ def commit(self, commit_message: str) -> str: Parameters ---------- - commit_message : str, optional + commit_message user proved message for a log of what was changed in this commit. Should a fast forward commit be possible, this will NOT be added to fast-forward ``HEAD``. 
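As a quick, hedged illustration of the column-creation and commit workflow documented in the checkout hunks above, the sketch below assumes a repository has already been initialized at a hypothetical path; the column names, sample keys, and data values are invented for the example and are not taken from the source.

```python
import numpy as np
from hangar import Repository

# hypothetical path; assumes repo.init(...) has already been run here
repo = Repository('/path/to/repo', exists=True)
co = repo.checkout(write=True, branch='master')

# ndarray column whose fixed shape / dtype are inferred from a prototype array
co.add_ndarray_column('images', prototype=np.zeros((512, 512), dtype=np.uint8))
co.add_str_column('file_names')
co.add_bytes_column('raw_headers')

# stage one sample in each column, then commit the staging area
co.columns['images']['sample_0'] = np.ones((512, 512), dtype=np.uint8)
co.columns['file_names']['sample_0'] = 'img_0001.jpeg'
co.columns['raw_headers']['sample_0'] = b'\x89PNG\r\n'
co.commit('add first sample across three new columns')
co.close()
```

`backend` and `backend_options` are left at their defaults here so that Hangar infers an appropriate storage backend from the data shape and type, as described in the docstrings above.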
diff --git a/src/hangar/columns/column.py b/src/hangar/columns/column.py index f6525243..74331f91 100644 --- a/src/hangar/columns/column.py +++ b/src/hangar/columns/column.py @@ -59,21 +59,21 @@ def __init__(self, Parameters ---------- - mode : str + mode one of 'r' or 'a' to indicate read or write mode - repo_pth : Path + repo_pth path to the repository on disk - columns : Mapping[str, Union[ArraysetDataReader, ArraysetDataWriter]] + columns dictionary of ArraysetData objects - hashenv : Optional[lmdb.Environment] + hashenv environment handle for hash records - dataenv : Optional[lmdb.Environment] + dataenv environment handle for the unpacked records. `data` is means to refer to the fact that the stageenv is passed in for for write-enabled, and a cmtrefenv for read-only checkouts. - stagehashenv : Optional[lmdb.Environment] + stagehashenv environment handle for newly added staged data hash records. - txnctx: Optional[ColumnTxn] + txnctx class implementing context managers to handle lmdb transactions """ self._stack: Optional[ExitStack] = None @@ -156,7 +156,7 @@ def __getitem__(self, key: str) -> ModifierTypes: Parameters ---------- - key : string + key name of the column object to get. Returns @@ -176,7 +176,7 @@ def __contains__(self, key: str) -> bool: Parameters ---------- - key : str + key name of the column to check for Returns @@ -301,7 +301,7 @@ def get(self, name: str) -> ModifierTypes: Parameters ---------- - name : str + name name of the column to return Returns @@ -320,7 +320,7 @@ def __delitem__(self, key: str) -> str: Parameters ---------- - key : str + key Name of the column to remove from the repository. This will remove all records from the staging area (though the actual data and all records are still accessible) if they were previously committed. @@ -346,7 +346,7 @@ def delete(self, column: str) -> str: Parameters ---------- - column : str + column name of the column to remove Returns @@ -404,13 +404,13 @@ def _from_staging_area(cls, repo_pth, hashenv, stageenv, stagehashenv): Parameters ---------- - repo_pth : Path + repo_pth directory path to the hangar repository on disk - hashenv : lmdb.Environment + hashenv environment where tensor data hash records are open in write mode. - stageenv : lmdb.Environment + stageenv environment where staging records (dataenv) are opened in write mode. - stagehashenv : lmdb.Environment + stagehashenv environment where the staged hash records are stored in write mode Returns @@ -465,11 +465,11 @@ def _from_commit(cls, repo_pth, hashenv, cmtrefenv): Parameters ---------- - repo_pth : Path + repo_pth directory path to the hangar repository on disk - hashenv : lmdb.Environment + hashenv environment where tensor data hash records are open in read-only mode. - cmtrefenv : lmdb.Environment + cmtrefenv environment where staging checkout records are opened in read-only mode. Returns diff --git a/src/hangar/columns/layout_flat.py b/src/hangar/columns/layout_flat.py index 54d866a1..2d95d9b4 100644 --- a/src/hangar/columns/layout_flat.py +++ b/src/hangar/columns/layout_flat.py @@ -207,7 +207,7 @@ def __getitem__(self, key: KeyType): Parameters ---------- - key : KeyType + key Sample key to retrieve from the column. Returns @@ -228,10 +228,10 @@ def get(self, key: KeyType, default=None): Parameters ---------- - key : KeyType + key The name of the subsample(s) to retrieve. Passing a single subsample key will return the stored data value. - default : Any + default if a `key` parameter is not found, then return this value instead. By default, None. 
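On the read side, the `Columns` container and the flat sample accessors documented above behave like nested mappings. A minimal, hedged sketch (the path, column name, and sample key are placeholders):

```python
from hangar import Repository

repo = Repository('/path/to/repo', exists=True)
co = repo.checkout(branch='master')       # read-only checkout of the branch HEAD

if 'images' in co.columns:                # membership test on the column container
    images = co.columns['images']         # dict-style column access
    arr = images['sample_0']              # raises KeyError if the sample is missing
    maybe = images.get('sample_0')        # returns None instead of raising
co.close()
```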
@@ -343,7 +343,7 @@ def _mode_local_aware_key_looper(self, local: bool) -> Iterable[KeyType]: Parameters ---------- - local : bool + local True if keys should be returned which only exist on the local machine. Fale if remote sample keys should be excluded. @@ -368,7 +368,7 @@ def keys(self, local: bool = False) -> Iterable[KeyType]: Parameters ---------- - local : bool, optional + local If True, returned keys will only correspond to data which is available for reading on the local disk, by default False. @@ -384,7 +384,7 @@ def values(self, local: bool = False) -> Iterable[Any]: Parameters ---------- - local : bool, optional + local If True, returned values will only correspond to data which is available for reading on the local disk. No attempt will be made to read data existing on a remote server, by default False. @@ -402,7 +402,7 @@ def items(self, local: bool = False) -> Iterable[Tuple[KeyType, Any]]: Parameters ---------- - local : bool, optional + local If True, returned keys/values will only correspond to data which is available for reading on the local disk, No attempt will be made to read data existing on a remote server, by default False. @@ -627,7 +627,7 @@ def __delitem__(self, key: KeyType) -> None: Parameters ---------- - key : KeyType + key Name of the sample to remove from the column. """ with ExitStack() as stack: @@ -653,7 +653,7 @@ def pop(self, key: KeyType): Parameters ---------- - key : KeysType + key Sample key to remove Returns @@ -681,9 +681,9 @@ def change_backend(self, backend: str, backend_options: Optional[dict] = None): Parameters ---------- - backend : str + backend Backend format code to swtich to. - backend_options : Optional[dict] + backend_options Backend option specification to use (if specified). If left to default value of None, then default options for backend are automatically used. diff --git a/src/hangar/columns/layout_nested.py b/src/hangar/columns/layout_nested.py index 17728120..f45c899d 100644 --- a/src/hangar/columns/layout_nested.py +++ b/src/hangar/columns/layout_nested.py @@ -149,7 +149,7 @@ def __getitem__(self, key: GetKeysType) -> Union[Any, Dict[KeyType, Any]]: Parameters ---------- - key : GetKeysType + key Sample key to retrieve from the column. Alternatively, ``slice`` syntax can be used to retrieve a selection of subsample keys/values. An empty slice (``: == slice(None)``) or ``Ellipsis`` @@ -239,7 +239,7 @@ def _mode_local_aware_key_looper(self, local: bool) -> Iterable[KeyType]: Parameters ---------- - local : bool + local True if keys should be returned which only exist on the local machine. False if remote sample keys should be excluded. @@ -293,7 +293,7 @@ def keys(self, local: bool = False) -> Iterable[KeyType]: Parameters ---------- - local : bool, optional + local If True, returned keys will only correspond to data which is available for reading on the local disk, by default False. @@ -309,7 +309,7 @@ def values(self, local: bool = False) -> Iterable[Any]: Parameters ---------- - local : bool, optional + local If True, returned values will only correspond to data which is available for reading on the local disk. No attempt will be made to read data existing on a remote server, by default False. 
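Since the `keys()` / `values()` / `items()` methods above all share the `local` flag, a short hedged sketch of how they are typically iterated may help; the repository path and column name are again placeholders:

```python
from hangar import Repository

repo = Repository('/path/to/repo', exists=True)
co = repo.checkout(branch='master')
images = co.columns['images']

# keys(), values(), and items() return lazy iterables; local=True restricts
# them to samples whose data is actually present on this machine, skipping
# samples that are only referenced from a remote server
for key in images.keys(local=True):
    print(key)
for key, arr in images.items(local=True):
    print(key, arr.shape)
co.close()
```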
@@ -327,7 +327,7 @@ def items(self, local: bool = False) -> Iterable[Tuple[KeyType, Any]]: Parameters ---------- - local : bool, optional + local If True, returned keys/values will only correspond to data which is available for reading on the local disk, No attempt will be made to read data existing on a remote server, by default False. @@ -345,7 +345,7 @@ def get(self, key: KeyType, default=None): Parameters ---------- - key : GetKeysType + key The name of the subsample(s) to retrieve. Passing a single subsample key will return the stored :class:`numpy.ndarray` default @@ -571,7 +571,7 @@ def __delitem__(self, key: KeyType): Parameters ---------- - key : KeyType + key Name of the sample to remove from the column. """ with ExitStack() as stack: @@ -597,7 +597,7 @@ def pop(self, key: KeyType): Parameters ---------- - key : KeysType + key Sample key to remove Returns @@ -722,7 +722,7 @@ def __getitem__(self, key: KeyType) -> FlatSubsampleReader: Parameters ---------- - key : KeyType + key Name of sample to retrieve Returns @@ -851,7 +851,7 @@ def _mode_local_aware_key_looper(self, local: bool) -> Iterable[KeyType]: Parameters ---------- - local : bool + local True if keys should be returned which only exist on the local machine. False if remote sample keys should be excluded. @@ -920,7 +920,7 @@ def keys(self, local: bool = False) -> Iterable[KeyType]: Parameters ---------- - local : bool, optional + local If True, returned keys will only correspond to data which is available for reading on the local disk, by default False. @@ -936,7 +936,7 @@ def values(self, local: bool = False) -> Iterable[Any]: Parameters ---------- - local : bool, optional + local If True, returned values will only correspond to data which is available for reading on the local disk. No attempt will be made to read data existing on a remote server, by default False. @@ -954,7 +954,7 @@ def items(self, local: bool = False) -> Iterable[Tuple[KeyType, Any]]: Parameters ---------- - local : bool, optional + local If True, returned keys/values will only correspond to data which is available for reading on the local disk, No attempt will be made to read data existing on a remote server, by default False. @@ -972,9 +972,9 @@ def get(self, key: GetKeysType, default: Any = None) -> FlatSubsampleReader: Parameters ---------- - key : GetKeysType + key The name of the subsample(s) to retrieve - default : Any + default if a `key` parameter is not found, then return this value instead. By default, None. @@ -1129,7 +1129,7 @@ def pop(self, key: KeyType) -> Dict[KeyType, Any]: Parameters ---------- - key : KeysType + key sample key to remove Returns @@ -1154,7 +1154,7 @@ def change_backend(self, backend: str, backend_options: Optional[dict] = None): Parameters ---------- - backend : str + backend Backend format code to swtich to. backend_options Backend option specification to use (if specified). If left to diff --git a/src/hangar/dataloaders/tfloader.py b/src/hangar/dataloaders/tfloader.py index cb124da2..583b73f1 100644 --- a/src/hangar/dataloaders/tfloader.py +++ b/src/hangar/dataloaders/tfloader.py @@ -53,14 +53,14 @@ def make_tf_dataset(columns, columns : :class:`~hangar.columns.column.Columns` or Sequence A column object, a tuple of column object or a list of column objects` - keys : Sequence[str] + keys An iterable of sample names. If given only those samples will fetched from the column - index_range : slice + index_range A python slice object which will be used to find the subset of column. 
Argument `keys` takes priority over `index_range` i.e. if both are given, keys will be used and `index_range` will be ignored - shuffle : bool + shuffle generator uses this to decide a global shuffle accross all the samples is required or not. But user doesn't have any restriction on doing`column.shuffle()` on the returned column diff --git a/src/hangar/dataloaders/torchloader.py b/src/hangar/dataloaders/torchloader.py index 004dc0a2..dd61d301 100644 --- a/src/hangar/dataloaders/torchloader.py +++ b/src/hangar/dataloaders/torchloader.py @@ -36,14 +36,14 @@ def make_torch_dataset(columns, columns : :class:`~hangar.columns.column.Columns` or Sequence A column object, a tuple of column object or a list of column objects. - keys : Sequence[str] + keys An iterable collection of sample names. If given only those samples will fetched from the column - index_range : slice + index_range A python slice object which will be used to find the subset of column. Argument `keys` takes priority over `range` i.e. if both are given, keys will be used and `range` will be ignored - field_names : Sequence[str], optional + field_names An array of field names used as the `field_names` for the returned dict keys. If not given, column names will be used as the field_names. diff --git a/src/hangar/diff.py b/src/hangar/diff.py index b5556e71..8b9cbc4e 100644 --- a/src/hangar/diff.py +++ b/src/hangar/diff.py @@ -305,9 +305,9 @@ def _determine_ancestors(self, mHEAD: str, dHEAD: str) -> HistoryDiffStruct: Parameters ---------- - mHEAD : str + mHEAD full commit hash to use as the `master` branch head commit - dHEAD : str + dHEAD full commit hash to use as the `dev` branch head commit Returns @@ -340,11 +340,11 @@ def _diff3(a_env: lmdb.Environment, Parameters ---------- - a_env : lmdb.Environment + a_env unpacked lmdb environment for the ancestor commit - m_env : lmdb.Environment + m_env unpacked lmdb environment for the master commit, current HEAD - d_env : lmdb.Environment + d_env unpacked lmdb environment for the dev commit, compare to HEAD Returns @@ -368,9 +368,9 @@ def _diff(a_env: lmdb.Environment, m_env: lmdb.Environment) -> DiffAndConflictsD Parameters ---------- - a_env : lmdb.Environment + a_env unpacked lmdb environment for the ancestor commit - m_env : lmdb.Environment + m_env unpacked lmdb environment for the master commit Returns @@ -432,7 +432,7 @@ def _run_diff(self, dev_commit_hash: str) -> DiffAndConflictsDB: Parameters ---------- - dev_commit_hash : str + dev_commit_hash hash of the commit to be used as the comparison. Returns @@ -456,7 +456,7 @@ def commit(self, dev_commit_hash: str) -> DiffAndConflicts: Parameters ---------- - dev_commit_hash : str + dev_commit_hash hash of the commit to be used as the comparison. Returns @@ -483,7 +483,7 @@ def branch(self, dev_branch: str) -> DiffAndConflicts: Parameters ---------- - dev_branch : str + dev_branch name of the branch whose HEAD will be used to calculate the diff of. Returns @@ -561,7 +561,7 @@ def _run_diff(self, dev_commit_hash: str) -> DiffAndConflictsDB: Parameters ---------- - dev_commit_hash : str + dev_commit_hash hash of the commit to be used as the comparison. Returns @@ -585,7 +585,7 @@ def commit(self, dev_commit_hash: str) -> DiffAndConflicts: Parameters ---------- - dev_commit_hash : str + dev_commit_hash hash of the commit to be used as the comparison. 
Returns @@ -612,7 +612,7 @@ def branch(self, dev_branch: str) -> DiffAndConflicts: Parameters ---------- - dev_branch : str + dev_branch name of the branch whose HEAD will be used to calculate the diff of. Returns diff --git a/src/hangar/mixins/datasetget.py b/src/hangar/mixins/datasetget.py index 83ab5cdc..f0397b4b 100644 --- a/src/hangar/mixins/datasetget.py +++ b/src/hangar/mixins/datasetget.py @@ -128,12 +128,12 @@ def get(self, keys, default=None, except_missing=False): Please see detailed explanation in :meth:`__getitem__()` for full explanation of accepted argument format / result types. - default: Any, optional + default default value to insert in results for the case where some column name / sample key is not found, and the `except_missing` parameter is set to False. - except_missing: bool, optional + except_missing If False, will not throw exceptions on missing sample key value. Will raise KeyError if True and missing key found. @@ -163,12 +163,12 @@ def _get_in(self, keys, default=None, except_missing=False, Please see detailed explanation in :meth:`__getitem__()` for full explanation of accepted argument format / result types. - default: Any, optional + default default value to insert in results for the case where some column name / sample key is not found, and the `except_missing` parameter is set to False. - except_missing: bool, optional + except_missing If False, will not throw exceptions on missing sample key value. Will raise KeyError if True and missing key found. diff --git a/src/hangar/remotes.py b/src/hangar/remotes.py index 31a12c14..37517eaa 100644 --- a/src/hangar/remotes.py +++ b/src/hangar/remotes.py @@ -73,10 +73,10 @@ def add(self, name: str, address: str) -> RemoteInfo: Parameters ---------- - name : str + name the name which should be used to refer to the remote server (ie: 'origin') - address : str + address the IP:PORT where the hangar server is running Returns @@ -112,7 +112,7 @@ def remove(self, name: str) -> RemoteInfo: Parameters ---------- - name : str + name name of the remote to remove the reference to Raises @@ -154,7 +154,7 @@ def ping(self, name: str) -> float: Parameters ---------- - name : str + name name of the remote server to ping Returns @@ -190,9 +190,9 @@ def fetch(self, remote: str, branch: str) -> str: Parameters ---------- - remote : str + remote name of the remote repository to fetch from (ie. ``origin``) - branch : str + branch name of the branch to fetch the commit references for. Returns @@ -301,12 +301,11 @@ def fetch_data_sample(self, Parameters ---------- - remote : str + remote name of the remote server to pull data from - column : str + column name of the column which data is being fetched from. - samples : Union[KeyType, Sequence[KeyType], - Sequence[Union[Tuple[KeyType, KeyType], Tuple[KeyType], KeyType]]] + sample Key, or sequence of sample keys to select. * Flat column layouts should provide just a single key, or flat sequence of @@ -318,10 +317,10 @@ def fetch_data_sample(self, index `(sample, ...)` (which will fetch all subsamples for the given sample), or can provide lone sample keys in the sequences `sample` (which will also fetch all subsamples listed under the sample) OR ANY COMBINATION of the above. - branch : Optional[str] + branch branch head to operate on, either ``branch`` or ``commit`` argument must be passed, but NOT both. Default is ``None`` - commit : Optional[str] + commit commit to operate on, either `branch` or `commit` argument must be passed, but NOT both. 
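To tie the `Remotes` methods in the hunks above together, here is a hedged end-to-end sketch of registering a remote, fetching branch references, and then selectively pulling sample data. The remote name, server address, branch, column, and sample keys are all placeholders, and the `repo.remote` accessor is assumed to be the entry point to this class:

```python
from hangar import Repository

repo = Repository('/path/to/repo', exists=True)

# 'origin' and the address are placeholders for a running hangar server
repo.remote.add('origin', 'localhost:50051')
fetched = repo.remote.fetch('origin', branch='master')

# partial fetch: pull only the data backing two sample keys of one column at
# the fetched branch head; all other sample data stays on the server
repo.remote.fetch_data_sample(
    'origin', column='images', samples=['sample_0', 'sample_1'], branch=fetched)
```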
@@ -495,24 +494,24 @@ def fetch_data(self, Parameters ---------- - remote : str + remote name of the remote to pull the data from - branch : str, optional + branch The name of a branch whose HEAD will be used as the data fetch point. If None, ``commit`` argument expected, by default None - commit : str, optional + commit Commit hash to retrieve data for, If None, ``branch`` argument expected, by default None - column_names : Optional[Sequence[str]] + column_names Names of the columns which should be retrieved for the particular commits, any columns not named will not have their data fetched from the server. Default behavior is to retrieve all columns - max_num_bytes : Optional[int] + max_num_bytes If you wish to limit the amount of data sent to the local machine, set a `max_num_bytes` parameter. This will retrieve only this amount of data from the server to be placed on the local disk. Default is to retrieve all data regardless of how large. - retrieve_all_history : Optional[bool] + retrieve_all_history if data should be retrieved for all history accessible by the parents of this commit HEAD. by default False @@ -625,8 +624,8 @@ def _form_missing_schema_digest_map( Parameters ---------- - selectedDataRecords : Set[queries.DataRecordVal] - hashenv : lmdb.Environment + selectedDataRecords + hashenv Returns ------- @@ -657,9 +656,9 @@ def _select_digest_fetch_data( Parameters ---------- - column_names : Union[None, Sequence[str]] + column_names column names to fetch data for. If ``None``, download all column data. - recQuery : queries.RecordQuery + recQuery initialized record query object set up with appropriate ``dataenv``. Returns @@ -695,15 +694,15 @@ def push(self, remote: str, branch: str, Parameters ---------- - remote : str + remote name of the remote repository to make the push on. - branch : str + branch Name of the branch to push to the remote. If the branch name does not exist on the remote, the it will be created - username : str, optional, kwarg-only + username credentials to use for authentication if repository push restrictions are enabled, by default ''. - password : str, optional, kwarg-only + password credentials to use for authentication if repository push restrictions are enabled, by default ''. diff --git a/src/hangar/repository.py b/src/hangar/repository.py index 3bbb8cd8..9ac9c98c 100644 --- a/src/hangar/repository.py +++ b/src/hangar/repository.py @@ -36,9 +36,9 @@ class Repository(object): Parameters ---------- - path : Union[str, os.PathLike] + path local directory path where the Hangar repository exists (or initialized) - exists : bool, optional + exists True if a Hangar repository should exist at the given directory path. Should no Hangar repository exists at that location, a UserWarning will be raised indicating that the :meth:`init` method needs to be called. @@ -238,13 +238,13 @@ def checkout(self, Parameters ---------- - write : bool, optional + write Specify if the checkout is write capable, defaults to False - branch : str, optional + branch name of the branch to checkout. This utilizes the state of the repo as it existed at the branch ``HEAD`` commit when this checkout object was instantiated, defaults to '' - commit : str, optional + commit specific hash of a commit to use for the checkout (instead of a branch ``HEAD`` commit). This argument takes precedent over a branch name parameter if it is set. 
Note: this only will be used in @@ -310,17 +310,17 @@ def clone(self, user_name: str, user_email: str, remote_address: str, Parameters ---------- - user_name : str + user_name Name of the person who will make commits to the repository. This information is recorded permanently in the commit records. - user_email : str + user_email Email address of the repository user. This information is recorded permanently in any commits created. - remote_address : str + remote_address location where the :class:`hangar.remote.server.HangarServer` process is running and accessible by the clone user. - remove_old : bool, optional, kwarg only + remove_old DANGER! DEVELOPMENT USE ONLY! If enabled, a :class:`hangar.repository.Repository` existing on disk at the same path as the requested clone location will be completely removed and @@ -356,11 +356,11 @@ def init(self, Parameters ---------- - user_name : str + user_name Name of the repository user account. - user_email : str + user_email Email address of the repository user account. - remove_old : bool, kwarg-only + remove_old DEVELOPER USE ONLY -- remove and reinitialize a Hangar repository at the given path, Default = False @@ -392,18 +392,18 @@ def log(self, Parameters ---------- - branch : str, optional + branch The name of the branch to start the log process from. (Default value = None) - commit : str, optional + commit The commit hash to start the log process from. (Default value = None) - return_contents : bool, optional, kwarg only + return_contents If true, return the commit graph specifications in a dictionary suitable for programatic access/evaluation. - show_time : bool, optional, kwarg only + show_time If true and return_contents is False, show the time of each commit on the printed log graph - show_user : bool, optional, kwarg only + show_user If true and return_contents is False, show the committer of each commit on the printed log graph Returns @@ -426,10 +426,10 @@ def summary(self, *, branch: str = '', commit: str = '') -> None: Parameters ---------- - branch : str, optional + branch A specific branch name whose head commit will be used as the summary point (Default value = '') - commit : str, optional + commit A specific commit hash which should be used as the summary point. (Default value = '') """ @@ -477,10 +477,10 @@ def diff(self, master: str, dev: str) -> DiffAndConflicts: Parameters ---------- - master: str + master branch name or commit hash digest to use as the "master" which changes made in "dev" are compared to. - dev: str + dev branch name or commit hash digest to use as the "dev" (ie. "feature") branch which changes have been made to which are to be compared to the contents of "master". @@ -528,11 +528,11 @@ def merge(self, message: str, master_branch: str, dev_branch: str) -> str: Parameters ---------- - message: str + message Commit message to use for this merge. - master_branch : str + master_branch name of the master branch to merge into - dev_branch : str + dev_branch name of the dev/feature branch to merge Returns @@ -581,9 +581,9 @@ def create_branch(self, name: str, base_commit: str = None) -> heads.BranchHead: Parameters ---------- - name : str + name name to assign to the new branch - base_commit : str, optional + base_commit commit hash to start the branch root at. 
if not specified, the writer branch ``HEAD`` commit at the time of execution will be used, defaults to None @@ -697,12 +697,12 @@ def remove_branch(self, name: str, *, force_delete: bool = False) -> heads.Branc Parameters ---------- - name : str + name name of the branch which should be deleted. This branch must exist, and cannot refer to a remote tracked branch (ie. origin/devbranch), please see exception descriptions for other parameters determining validity of argument - force_delete : bool, optional + force_delete If True, remove the branch pointer even if the changes are un-merged in other branch histories. May result in orphaned commits which may be time-consuming to recover if needed, by default False From f555fedd84ec7b11dab941bc49cb45b4a03b401e Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Mon, 10 Aug 2020 09:36:37 -0400 Subject: [PATCH 3/7] updates --- docs/api.md | 75 ------ docs/api/dataloaders.md | 12 + docs/api/reader_checkout.md | 24 ++ docs/api/repository.md | 14 ++ docs/api/writer_checkout.md | 34 +++ docs/backends/lmdb_31.md | 4 + docs/concepts.md | 326 +++++++++++++------------- docs/design.md | 163 +++++++------ mkdocs.yml | 23 +- src/hangar/backends/__init__.py | 68 +++--- src/hangar/backends/hdf5_00.py | 182 +++++++------- src/hangar/backends/hdf5_01.py | 274 +++++++++++----------- src/hangar/backends/lmdb_30.py | 83 ++++--- src/hangar/backends/lmdb_31.py | 85 ++++--- src/hangar/backends/numpy_10.py | 80 +++---- src/hangar/backends/remote_50.py | 47 ++-- src/hangar/bulk_importer.py | 140 +++++------ src/hangar/checkout.py | 178 +++++++------- src/hangar/columns/column.py | 2 +- src/hangar/columns/layout_flat.py | 10 +- src/hangar/columns/layout_nested.py | 16 +- src/hangar/dataloaders/tfloader.py | 2 +- src/hangar/dataloaders/torchloader.py | 4 +- src/hangar/diff.py | 29 +-- src/hangar/merger.py | 2 +- src/hangar/records/commiting.py | 4 +- src/hangar/records/summarize.py | 2 +- src/hangar/remotes.py | 86 ++++--- src/hangar/repository.py | 201 +++++++++------- 29 files changed, 1126 insertions(+), 1044 deletions(-) delete mode 100644 docs/api.md create mode 100644 docs/api/dataloaders.md create mode 100644 docs/api/reader_checkout.md create mode 100644 docs/api/repository.md create mode 100644 docs/api/writer_checkout.md create mode 100644 docs/backends/lmdb_31.md diff --git a/docs/api.md b/docs/api.md deleted file mode 100644 index cd6d5dc6..00000000 --- a/docs/api.md +++ /dev/null @@ -1,75 +0,0 @@ -Python API -========== - -This is the python API for the Hangar project. 
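As a hedged sketch of the branching and merging workflow covered by the repository methods above (the branch name, column name, and messages are invented for the example):

```python
from hangar import Repository

repo = Repository('/path/to/repo', exists=True)

# branch from the current writer HEAD, stage some work, then merge it back
repo.create_branch('add_notes')
co = repo.checkout(write=True, branch='add_notes')
co.add_str_column('notes')
co.columns['notes']['v0'] = 'validation split still pending'
co.commit('add a notes column')
co.close()

repo.log(branch='add_notes')                    # print the commit graph
changes = repo.diff('master', 'add_notes')      # inspect changes / conflicts
merge_commit = repo.merge('merge notes column', 'master', 'add_notes')
```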
- -Repository ----------- - -![mkapi](hangar.repository.Repository) - -Remotes -------- - -![mkapi](hangar.remotes.Remotes) - -Write Enabled Checkout ----------------------- - -### Checkout - -![mkapi](hangar.checkout.WriterCheckout) - -### Columns - -![mkapi](hangar.columns.column.Columns) - -### Flat Column Layout Container - -![mkapi](hangar.columns.layout_flat.FlatSampleWriter) - -### Nested Column Layout Container - -![mkapi](hangar.columns.layout_nested.NestedSampleWriter) - -![mkapi](hangar.columns.layout_nested.FlatSubsampleWriter) - -### Differ - -![mkapi](hangar.diff.WriterUserDiff) - -### Bulk Importer - -![mkapi](hangar.bulk_importer.run_bulk_import) - -Read Only Checkout ------------------- - -### Checkout - -![mkapi](hangar.checkout.ReaderCheckout) - -### Flat Column Layout Container - -![mkapi](hangar.columns.layout_flat.FlatSampleReader) - -### Nested Column Layout Container - -![mkapi](hangar.columns.layout_nested.NestedSampleReader) - -![mkapi](hangar.columns.layout_nested.FlatSubsampleReader) - -### Differ - -![mkapi](hangar.diff.ReaderUserDiff) - -ML Framework Dataloaders ------------------------- - -### Tensorflow - -![mkapi](hangar.make_tf_dataset) - -### Pytorch - -![mkapi](hangar.make_torch_dataset) diff --git a/docs/api/dataloaders.md b/docs/api/dataloaders.md new file mode 100644 index 00000000..c30291b8 --- /dev/null +++ b/docs/api/dataloaders.md @@ -0,0 +1,12 @@ +ML Framework Dataloaders +======================== + +Tensorflow +---------- + +![mkapi](hangar.make_tf_dataset) + +Pytorch +------- + +![mkapi](hangar.make_torch_dataset) diff --git a/docs/api/reader_checkout.md b/docs/api/reader_checkout.md new file mode 100644 index 00000000..b89fd7be --- /dev/null +++ b/docs/api/reader_checkout.md @@ -0,0 +1,24 @@ +Read Only Checkout +================== + +Checkout +-------- + +![mkapi](hangar.checkout.ReaderCheckout) + +Flat Column Layout Container +---------------------------- + +![mkapi](hangar.columns.layout_flat.FlatSampleReader) + +Nested Column Layout Container +------------------------------ + +![mkapi](hangar.columns.layout_nested.NestedSampleReader) + +![mkapi](hangar.columns.layout_nested.FlatSubsampleReader) + +Differ +------ + +![mkapi](hangar.diff.ReaderUserDiff) diff --git a/docs/api/repository.md b/docs/api/repository.md new file mode 100644 index 00000000..ca54ab36 --- /dev/null +++ b/docs/api/repository.md @@ -0,0 +1,14 @@ +Python API +========== + +This is the python API for the Hangar project. 
+ +Repository +---------- + +![mkapi](hangar.repository.Repository) + +Remotes +------- + +![mkapi](hangar.remotes.Remotes) diff --git a/docs/api/writer_checkout.md b/docs/api/writer_checkout.md new file mode 100644 index 00000000..97111006 --- /dev/null +++ b/docs/api/writer_checkout.md @@ -0,0 +1,34 @@ +Write Enabled Checkout +====================== + +Checkout +-------- + +![mkapi](hangar.checkout.WriterCheckout) + +Columns +------- + +![mkapi](hangar.columns.column.Columns) + +Flat Column Layout Container +---------------------------- + +![mkapi](hangar.columns.layout_flat.FlatSampleWriter) + +Nested Column Layout Container +------------------------------ + +![mkapi](hangar.columns.layout_nested.NestedSampleWriter) + +![mkapi](hangar.columns.layout_nested.FlatSubsampleWriter) + +Differ +------ + +![mkapi](hangar.diff.WriterUserDiff) + +Bulk Importer +------------- + +![mkapi](hangar.bulk_importer.run_bulk_import) diff --git a/docs/backends/lmdb_31.md b/docs/backends/lmdb_31.md new file mode 100644 index 00000000..975d20c8 --- /dev/null +++ b/docs/backends/lmdb_31.md @@ -0,0 +1,4 @@ +Variable Shape LMDB Bytes Data Store +==================================== + +![mkapi](hangar.backends.lmdb_31) diff --git a/docs/concepts.md b/docs/concepts.md index 19ba4851..ace1e0e7 100644 --- a/docs/concepts.md +++ b/docs/concepts.md @@ -20,16 +20,16 @@ At its core Hangar is designed to solve many of the same problems faced by traditional code version control system (ie. `Git`), just adapted for numerical data: -- Time travel through the historical evolution of a dataset -- Zero-cost Branching to enable exploratory analysis and collaboration -- Cheap Merging to build datasets over time (with multiple - collaborators) -- Completely abstracted organization and management of data files on - disk -- Ability to only retrieve a small portion of the data (as needed) - while still maintaining complete historical record -- Ability to push and pull changes directly to collaborators or a - central server (ie. a truly distributed version control system) +- Time travel through the historical evolution of a dataset +- Zero-cost Branching to enable exploratory analysis and collaboration +- Cheap Merging to build datasets over time (with multiple + collaborators) +- Completely abstracted organization and management of data files on + disk +- Ability to only retrieve a small portion of the data (as needed) + while still maintaining complete historical record +- Ability to push and pull changes directly to collaborators or a + central server (ie. a truly distributed version control system) The ability of version control systems to perform these tasks for codebases is largely taken for granted by almost every developer today; @@ -48,14 +48,14 @@ The design of Hangar was heavily influenced by the Hangar user, many of the fundamental building blocks and commands can be thought of as interchangeable: -- checkout -- commit -- branch -- merge -- diff -- push -- pull/fetch -- log +- checkout +- commit +- branch +- merge +- diff +- push +- pull/fetch +- log Emulating the high level the git syntax has allowed us to create a user experience which should be familiar in many ways to Hangar users; a goal @@ -66,19 +66,19 @@ There are, however, many fundamental differences in how humans/programs interpret and use text in source files vs. numerical data which raise many questions Hangar needs to uniquely solve: -- How do we connect some piece of "Data" with a meaning in the real - world? 
-- How do we diff and merge large collections of data samples? -- How can we resolve conflicts? -- How do we make data access (reading and writing) convenient for both - user-driven exploratory analyses and high performance production - systems operating without supervision? -- How can we enable people to work on huge datasets in a local (laptop - grade) development environment? +- How do we connect some piece of "Data" with a meaning in the real + world? +- How do we diff and merge large collections of data samples? +- How can we resolve conflicts? +- How do we make data access (reading and writing) convenient for both + user-driven exploratory analyses and high performance production + systems operating without supervision? +- How can we enable people to work on huge datasets in a local (laptop + grade) development environment? We will show how Hangar solves these questions in a high-level guide below. For a deep dive into the Hangar internals, we invite you to check -out the `ref-hangar-under-the-hood`{.interpreted-text role="ref"} page. +out the [Hangar Under the Hood](design.md) page. How Hangar Thinks About Data ---------------------------- @@ -121,30 +121,31 @@ phenomenon, a dataset may require multiple pieces of information, each of a particular format, for each instance/sample recorded in the dataset.* -> **For Example** -> -> a Hospital will typically have a *Dataset* containing all of the CT -> scans performed over some period of time. A single CT scan is an -> instance, a single sample; however, once many are grouped together -> they form a *Dataset*. To expand on this simple view we realize that -> each CT scan consists of hundreds of pieces of information: -> -> > - Some large `numeric array` (the image data). -> > - Some smaller `numeric tuples` (describing image spacing, -> > dimension scale, capture time, machine parameters, etc). -> > - Many pieces of `string` data (the patient name, doctor name, -> > scan type, results found, etc). - -When thinking about the group of CT scans in aggregate, we realize that -though a single scan contains many disparate pieces of information stuck -together, when thinking about the aggregation of every scan in the -group, most of (if not all) of the same information fields are -duplicated within each samples. - -*A single scan is a bunch of disparate information stuck together, many -of those put together makes a Dataset, but looking down from the top, we -identify pattern of common fields across all items. We call these -groupings of similar typed information:* **Columns**. + +!!! example + + A Hospital will typically have a *Dataset* containing all of the CT + scans performed over some period of time. A single CT scan is an + instance, a single sample; however, once many are grouped together + they form a *Dataset*. To expand on this simple view we realize that + each CT scan consists of hundreds of pieces of information: + + - Some large `numeric array` (the image data). + - Some smaller `numeric tuples` (describing image spacing, + dimension scale, capture time, machine parameters, etc). + - Many pieces of `string` data (the patient name, doctor name, + scan type, results found, etc). + + When thinking about the group of CT scans in aggregate (as described in + the example above), we realize that though a single scan contains many + disparate pieces of information stuck together, when thinking about the + aggregation of every scan in the group, most of (if not all) of the same + information fields are duplicated within each samples. 
+ + *A single scan is a bunch of disparate information stuck together, many + of those put together makes a Dataset, but looking down from the top, we + identify pattern of common fields across all items. We call these + groupings of similar typed information:* **Columns**. ### Abstraction 2: What Makes up a Column? @@ -155,9 +156,9 @@ data needed to fully describe a single `sample` in a `Dataset` may consist of information spread across any number of `Columns`. To define a `Column` in Hangar, we only need to provide: -- a name -- a type -- a shape +- a name +- a type +- a shape The individual pieces of information (`Data`) which fully describe some phenomenon via an aggregate mapping access across any number of @@ -166,9 +167,9 @@ phenomenon via an aggregate mapping access across any number of above, all samples contained in a `Column` must be numeric arrays with each having: -1) Same data type (standard `numpy` data types are supported). -2) A shape with each dimension size <= the shape (`max shape`) set in - the `column` specification (more on this later). +1) Same data type (standard `numpy` data types are supported). +2) A shape with each dimension size <= the shape (`max shape`) set in + the `column` specification (more on this later). Additionally, samples in a `column` can either be named, or unnamed (depending on how you interpret what the information contained in the @@ -198,7 +199,7 @@ abstractions and utilities to operate on it. ### Summary -``` {.sourceCode .text} +``` A Dataset is thought of as containing Samples, but is actually defined by Columns, which store parts of fully defined Samples in structures common across the full aggregation of Dataset Samples. @@ -234,18 +235,17 @@ store or track the data set, just the underlying columns. The technical crowd among the readers should note: - - Hangar preserves all sample data bit-exactly. - - Dense arrays are fully supported, Sparse array support is - currently under development and will be released soon. - - Integrity checks are built in by default (explained in more detail - in `ref-hangar-under-the-hood`{.interpreted-text role="ref"}.) - using cryptographically secure algorithms. - - Hangar is very much a young project, until penetration tests and - security reviews are performed, we will refrain from stating that - Hangar is fully "cryptographically secure". Security experts are - welcome to contact us privately at [hangar.info@tensorwerk.com - ]{.title-ref}__ to disclose any - security issues. + - Hangar preserves all sample data bit-exactly. + - Dense arrays are fully supported, Sparse array support is + currently under development and will be released soon. + - Integrity checks are built in by default (explained in more detail + in `ref-hangar-under-the-hood`{.interpreted-text role="ref"}.) + using cryptographically secure algorithms. + - Hangar is very much a young project, until penetration tests and + security reviews are performed, we will refrain from stating that + Hangar is fully "cryptographically secure". Security experts are + welcome to contact us privately at + to disclose any security issues. Implications of the Hangar Data Philosophy @@ -295,13 +295,13 @@ High Performance Computing community over the past few decades. In a sense, the backend of Hangar serves two functions: -1) Bookkeeping: recording information about about columns, samples, - commits, etc. -2) Data Storage: highly optimized interfaces which store and retrieve - data from from disk through its backend utility. +1. 
Bookkeeping: recording information about about columns, samples, + commits, etc. +2. Data Storage: highly optimized interfaces which store and retrieve + data from from disk through its backend utility. The details are explained much more thoroughly in -`ref-hangar-under-the-hood`{.interpreted-text role="ref"}. +[Hangar Under the Hoold](design.md) Because Hangar only considers data to be numbers, the choice of backend to store data is (in a sense) completely arbitrary so long as @@ -323,10 +323,9 @@ configuration. At the time of writing, Hangar has the following backends implemented (with plans to potentially support more as needs arise): -1) [HDF5](https://www.hdfgroup.org/solutions/hdf5/) -2) [Memmapped - Arrays](https://docs.scipy.org/doc/numpy/reference/generated/numpy.memmap.html) -3) [TileDb](https://tiledb.io/) (in development) +1. [HDF5](https://www.hdfgroup.org/solutions/hdf5/) +2. [Memmapped Arrays](https://docs.scipy.org/doc/numpy/reference/generated/numpy.memmap.html) +3. [TileDb](https://tiledb.io/) (in development) ### Open Source Software Style Collaboration in Dataset Curation @@ -350,25 +349,28 @@ stay ahead of their competitors, and because this information is so difficult (and expensive) to generate, it's completely reasonable that they should be the ones to benefit from all that work. -> **A Thought Experiment** -> -> Imagine that `Git` and `GitHub` didn't take over the world. Imagine -> that the `Diff` and `Patch` Unix tools never existed. Instead, imagine -> we were to live in a world where every software project had very -> different version control systems (largely homeade by non VCS experts, -> & not validated by a community over many years of use). Even worse, -> most of these tools don't allow users to easily branch, make changes, -> and automatically merge them back. It shouldn't be difficult to -> imagine how dramatically such a world would contrast to ours today. -> Open source software as we know it would hardly exist, and any efforts -> would probably be massively fragmented across the web (if there would -> even be a 'web' that we would recognize in this strange world). -> -> Without a way to collaborate in the open, open source software would -> largely not exist, and we would all be worse off for it. -> -> Doesn't this hypothetical sound quite a bit like the state of open -> source data collaboration in todays world? + +!!! question + + **A Thought Experiment** + + Imagine that `Git` and `GitHub` didn't take over the world. Imagine + that the `Diff` and `Patch` Unix tools never existed. Instead, imagine + we were to live in a world where every software project had very + different version control systems (largely homeade by non VCS experts, + & not validated by a community over many years of use). Even worse, + most of these tools don't allow users to easily branch, make changes, + and automatically merge them back. It shouldn't be difficult to + imagine how dramatically such a world would contrast to ours today. + Open source software as we know it would hardly exist, and any efforts + would probably be massively fragmented across the web (if there would + even be a 'web' that we would recognize in this strange world). + + Without a way to collaborate in the open, open source software would + largely not exist, and we would all be worse off for it. + + Doesn't this hypothetical sound quite a bit like the state of open + source data collaboration in todays world? 
The impetus for developing a tool like Hangar is the belief that if it is simple for anyone with domain knowledge to collaboratively curate @@ -446,10 +448,12 @@ the future. !!! note - To try this out for yourself, please refer to the the API Docs - (:ref:`ref-api`) on working with Remotes, especially the ``fetch()`` and - ``fetch-data()`` methods. Otherwise look for through our tutorials & - examples for more practical info! + To try this out for yourself, please refer to the the + [API Docs](api/repository.md) on working with Remotes, especially + the [fetch()](../api/repository#hangar.remotes.Remotes.fetch) and + [fetch-data()](../api/repository#hangar.remotes.Remotes.fetch_data) + methods. Otherwise look for through our tutorials & examples for + more practical info! #### What Does it Mean to "Merge" Data? @@ -478,24 +482,26 @@ indicate that your samples are larger than they should be). To understand merge logic, we first need to understand diffing, and the actors operations which can occur. -Addition - - An operation which creates a column, sample, or some metadata which - did not previously exist in the relevant branch history. - -Removal - - An operation which removes some column, a sample, or some metadata - which existed in the parent of the commit under consideration. (Note: - removing a column also removes all samples contained in it). +!!! summary -Mutation - - An operation which sets: data to a sample, the value of some metadata - key, or a column schema, to a different value than what it had - previously been created with (Note: a column schema mutation is - observed when a column is removed, and a new column with the same name - is created with a different dtype/shape, all in the same commit). + - **Addition** + + An operation which creates a column, sample, or some metadata which + did not previously exist in the relevant branch history. + + - **Removal** + + An operation which removes some column, a sample, or some metadata + which existed in the parent of the commit under consideration. (Note: + removing a column also removes all samples contained in it). + + - **Mutation** + + An operation which sets: data to a sample, the value of some metadata + key, or a column schema, to a different value than what it had + previously been created with (Note: a column schema mutation is + observed when a column is removed, and a new column with the same name + is created with a different dtype/shape, all in the same commit). ##### Merging Changes @@ -511,18 +517,20 @@ as well. If these changes are identical, they are compatible, but what if they are not? In the following example, we diff and merge each element of the sample array like we would text: - Merge ?? - commit A commit B Does combining mean anything? - - [[0, 1, 2], [[0, 1, 2], [[1, 1, 1], - [0, 1, 2], -----> [2, 2, 2], ------------> [2, 2, 2], - [0, 1, 2]] [3, 3, 3]] / [3, 3, 3]] - \ / - \ commit C / - \ / - \ [[1, 1, 1], / - -------> [0, 1, 2], - [0, 1, 2]] +``` + Merge ?? + commit A commit B Does combining mean anything? + +[[0, 1, 2], [[0, 1, 2], [[1, 1, 1], + [0, 1, 2], -----> [2, 2, 2], ------------> [2, 2, 2], + [0, 1, 2]] [3, 3, 3]] / [3, 3, 3]] + \ / + \ commit C / + \ / + \ [[1, 1, 1], / + -------> [0, 1, 2], + [0, 1, 2]] +``` We see that a result can be generated, and can agree if this was a piece of text, the result would be correct. Don't be fooled, this is an @@ -537,17 +545,19 @@ authors. This is the actual behavior of Hangar. 
- commit A commit B - - [[0, 1, 2], [[0, 1, 2], - [0, 1, 2], -----> [2, 2, 2], ----- MERGE CONFLICT - [0, 1, 2]] [3, 3, 3]] / - \ / - \ commit C / - \ / - \ [[1, 1, 1], / - -------> [0, 1, 2], - [0, 1, 2]] +``` + commit A commit B + +[[0, 1, 2], [[0, 1, 2], + [0, 1, 2], -----> [2, 2, 2], ----- MERGE CONFLICT + [0, 1, 2]] [3, 3, 3]] / + \ / + \ commit C / + \ / + \ [[1, 1, 1], / + -------> [0, 1, 2], + [0, 1, 2]] +``` When a conflict is detected, the merge author must either pick a sample from one of the commits or make changes in one of the branches such that @@ -560,21 +570,21 @@ Any merge conflicts can be identified and addressed ahead of running a commits, Hangar will provide a list of conflicts which it identifies. In general these fall into 4 categories: -1) **Additions** in both branches which created new keys (samples / - columns / metadata) with non-compatible values. For samples & - metadata, the hash of the data is compared, for columns, the schema - specification is checked for compatibility in a method custom to the - internal workings of Hangar. -2) **Removal** in `Master Commit / Branch` **& Mutation** in - `Dev Commit / Branch`. Applies for samples, columns, and metadata - identically. -3) **Mutation** in `Dev Commit / Branch` **& Removal** in - `Master Commit / Branch`. Applies for samples, columns, and metadata - identically. -4) **Mutations** on keys both branches to non-compatible values. For - samples & metadata, the hash of the data is compared, for columns, - the schema specification is checked for compatibility in a method - custom to the internal workings of Hangar. +1. **Additions** in both branches which created new keys (samples / + columns / metadata) with non-compatible values. For samples & + metadata, the hash of the data is compared, for columns, the schema + specification is checked for compatibility in a method custom to the + internal workings of Hangar. +2. **Removal** in `Master Commit / Branch` **& Mutation** in + `Dev Commit / Branch`. Applies for samples, columns, and metadata + identically. +3. **Mutation** in `Dev Commit / Branch` **& Removal** in + `Master Commit / Branch`. Applies for samples, columns, and metadata + identically. +4. **Mutations** on keys both branches to non-compatible values. For + samples & metadata, the hash of the data is compared, for columns, + the schema specification is checked for compatibility in a method + custom to the internal workings of Hangar. What's Next? ------------- diff --git a/docs/design.md b/docs/design.md index 1464fa60..41678907 100644 --- a/docs/design.md +++ b/docs/design.md @@ -11,13 +11,13 @@ When designing a high performance data version control system, achieving performance goals while ensuring consistency is incredibly difficult. Memory is fast, disk is slow; not much we can do about it. But since Hangar should deal with any numeric data in an array of any size (with -an enforced limit of 31 dimensions in a sample\...) we have to find ways +an enforced limit of 31 dimensions in a sample...) we have to find ways to work *with* the disk, not against it. Upon coming to terms with this face, we are actually presented with a problem once we realize that we live in the real world, and real world -is ugly. Computers crash, processes get killed, and people do \* -*interesting* \* things. Because of this, It is a foundational design +is ugly. Computers crash, processes get killed, and people do * +*interesting* * things. 
Because of this, it is a foundational design
 principle for us to *guarantee that once Hangar says data has been
 successfully added to the repository, it is actually persisted.* This
 essentially means that any process which interacts with data records on
@@ -38,7 +38,7 @@ data and records are committed to disk.
     The atomicity of interactions is completely hidden from a normal user;
     they shouldn't have to care about this or even know this exists.
     However, this is also why using the context-manager style column
-    interaction scheme can result in \~2x times speedup on writes/reads. We
+    interaction scheme can result in a ~2x speedup on writes/reads. We
     can just pass on most of the work to the Python `contextlib` package
     instead of having to begin and commit/abort (depending on interaction
     mode) transactions with every call to an [add]{.title-ref} or
@@ -163,47 +163,50 @@ point in time.
 records", is semi inefficient, and will be changed in the future so
 that unchanged records are not duplicated across commits.
 
-An example is given below of the keys -\> values mapping which stores
+An example is given below of the keys -> values mapping which stores
 each of the staged records, and which are packed up / compressed on
 commit (and subsequently unpacked on checkout!).
 
-    Num asets                     'a.' -> '2'
-    ---------------------------------------------------------------------------
-    Name of aset -> num samples   ||   'a.train_images' -> '10'
-    Name of data -> hash          ||   'a.train_images.0' -> BAR_HASH_1'
-    Name of data -> hash          ||   'a.train_images.1' -> BAR_HASH_2'
-    Name of data -> hash          ||   'a.train_images.2' -> BAR_HASH_3'
-    Name of data -> hash          ||   'a.train_images.3' -> BAR_HASH_4'
-    Name of data -> hash          ||   'a.train_images.4' -> BAR_HASH_5'
-    Name of data -> hash          ||   'a.train_images.5' -> BAR_HASH_6'
-    Name of data -> hash          ||   'a.train_images.6' -> BAR_HASH_7'
-    Name of data -> hash          ||   'a.train_images.7' -> BAR_HASH_8'
-    Name of data -> hash          ||   'a.train_images.8' -> BAR_HASH_9'
-    Name of data -> hash          ||   'a.train_images.9' -> BAR_HASH_0'
-    ---------------------------------------------------------------------------
-    Name of aset -> num samples   ||   'a.train_labels' -> '10'
-    Name of data -> hash          ||   'a.train_labels.0' -> BAR_HASH_11'
-    Name of data -> hash          ||   'a.train_labels.1' -> BAR_HASH_12'
-    Name of data -> hash          ||   'a.train_labels.2' -> BAR_HASH_13'
-    Name of data -> hash          ||   'a.train_labels.3' -> BAR_HASH_14'
-    Name of data -> hash          ||   'a.train_labels.4' -> BAR_HASH_15'
-    Name of data -> hash          ||   'a.train_labels.5' -> BAR_HASH_16'
-    Name of data -> hash          ||   'a.train_labels.6' -> BAR_HASH_17'
-    Name of data -> hash          ||   'a.train_labels.7' -> BAR_HASH_18'
-    Name of data -> hash          ||   'a.train_labels.8' -> BAR_HASH_19'
-    Name of data -> hash          ||   'a.train_labels.9' -> BAR_HASH_10'
-    ---------------------------------------------------------------------------
-    's.train_images' -> '{"schema_hash": "RM4DefFsjRs=",
-                          "schema_dtype": 2,
-                          "schema_is_var": false,
-                          "schema_max_shape": [784],
-                          "schema_is_named": true}'
-    's.train_labels' -> '{"schema_hash":
-                          "ncbHqE6Xldg=",
-                          "schema_dtype": 7,
-                          "schema_is_var": false,
-                          "schema_max_shape": [1],
-                          "schema_is_named": true}'
+```
+
+Num asets 'a.'
-> '2' +--------------------------------------------------------------------------- +Name of aset -> num samples || 'a.train_images' -> '10' +Name of data -> hash || 'a.train_images.0' -> BAR_HASH_1' +Name of data -> hash || 'a.train_images.1' -> BAR_HASH_2' +Name of data -> hash || 'a.train_images.2' -> BAR_HASH_3' +Name of data -> hash || 'a.train_images.3' -> BAR_HASH_4' +Name of data -> hash || 'a.train_images.4' -> BAR_HASH_5' +Name of data -> hash || 'a.train_images.5' -> BAR_HASH_6' +Name of data -> hash || 'a.train_images.6' -> BAR_HASH_7' +Name of data -> hash || 'a.train_images.7' -> BAR_HASH_8' +Name of data -> hash || 'a.train_images.8' -> BAR_HASH_9' +Name of data -> hash || 'a.train_images.9' -> BAR_HASH_0' +--------------------------------------------------------------------------- +Name of aset -> num samples || 'a.train_labels' -> '10' +Name of data -> hash || 'a.train_labels.0' -> BAR_HASH_11' +Name of data -> hash || 'a.train_labels.1' -> BAR_HASH_12' +Name of data -> hash || 'a.train_labels.2' -> BAR_HASH_13' +Name of data -> hash || 'a.train_labels.3' -> BAR_HASH_14' +Name of data -> hash || 'a.train_labels.4' -> BAR_HASH_15' +Name of data -> hash || 'a.train_labels.5' -> BAR_HASH_16' +Name of data -> hash || 'a.train_labels.6' -> BAR_HASH_17' +Name of data -> hash || 'a.train_labels.7' -> BAR_HASH_18' +Name of data -> hash || 'a.train_labels.8' -> BAR_HASH_19' +Name of data -> hash || 'a.train_labels.9' -> BAR_HASH_10' +--------------------------------------------------------------------------- +'s.train_images' -> '{"schema_hash": "RM4DefFsjRs=", + "schema_dtype": 2, + "schema_is_var": false, + "schema_max_shape": [784], + "schema_is_named": true}' +'s.train_labels' -> '{"schema_hash": + "ncbHqE6Xldg=", + "schema_dtype": 7, + "schema_is_var": false, + "schema_max_shape": [1], + "schema_is_named": true}' +``` ### History is Relative @@ -240,16 +243,18 @@ created with the name [master]{.title-ref}, and which is the only commit in the entire repository which will have no parent. The record key/value pairs resemble the following: - 'branch.master' -> '' # No parent commit. - 'head' -> 'branch.master' # Staging area head branch +``` +'branch.master' -> '' # No parent commit. +'head' -> 'branch.master' # Staging area head branch - # Commit Hash | Parent Commit - ------------------------------------- +# Commit Hash | Parent Commit +------------------------------------- +``` !!! warning - Much like git, odd things can happen before the ['initial - commit']{.title-ref} is made. We recommend creating the initial commit + Much like git, odd things can happen before the ``initial + commit`` is made. We recommend creating the initial commit as quickly as possible to prevent undefined behavior during repository setup. In the future, we may decide to create the "initial commit" automatically upon repository initialization. @@ -259,12 +264,14 @@ specifies the records (not shown below) and the parent commit. The branch head pointer is then updated to point to that commit as it's base. - 'branch.master' -> '479b4cfff6219e3d' - 'head' -> 'branch.master' +``` +'branch.master' -> '479b4cfff6219e3d' +'head' -> 'branch.master' - # Commit Hash | Parent Commit - ------------------------------------- - '479b4cfff6219e3d' -> '' +# Commit Hash | Parent Commit +------------------------------------- +'479b4cfff6219e3d' -> '' +``` Branches can be created as cheaply as a single line of text can be written, and they simply require a "root" commit hash (or a branch @@ -276,41 +283,41 @@ now). 
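
Since a branch is nothing more than a new head pointer written next to an existing commit hash, a git-style feature-branch workflow is essentially free. The snippet below is a minimal sketch of that flow through the Python API; the exact `create_branch()` and `merge()` call signatures shown here are assumptions about the public `Repository` interface rather than something specified in this document.

```python
from hangar import Repository

repo = Repository(path='/foo/bar/path/')

# creating a branch only records a new head pointer -> essentially free
repo.create_branch('add-validation-set')

co = repo.checkout(write=True, branch='add-validation-set')
# ... add or modify column data here ...
co.commit('add first batch of validation samples')
co.close()

# merge the work back into master; any conflicts are reported before
# anything is written to the target branch
repo.merge('merge validation samples', 'master', 'add-validation-set')
repo.log()   # pretty-print the resulting commit graph
```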
A more complex example which creates 4 different branches and merges them in a complicated order can be seen below. Please note that the `<<` -symbol is used to indicate a merge commit where [X \<\< Y]{.title-ref} +symbol is used to indicate a merge commit where ``X << Y`` reads: `'merging dev branch Y into master branch X'`. - 'branch.large_branch' -> '8eabd22a51c5818c' - 'branch.master' -> '2cd30b98d34f28f0' - 'branch.test_branch' -> '1241a36e89201f88' - 'branch.trydelete' -> '51bec9f355627596' - 'head' -> 'branch.master' - - # Commit Hash | Parent Commit - ------------------------------------- - '1241a36e89201f88' -> '8a6004f205fd7169' - '2cd30b98d34f28f0' -> '9ec29571d67fa95f << 51bec9f355627596' - '51bec9f355627596' -> 'd683cbeded0c8a89' - '69a09d87ea946f43' -> 'd683cbeded0c8a89' - '8a6004f205fd7169' -> 'a320ae935fc3b91b' - '8eabd22a51c5818c' -> 'c1d596ed78f95f8f' - '9ec29571d67fa95f' -> '69a09d87ea946f43 << 8eabd22a51c5818c' - 'a320ae935fc3b91b' -> 'e3e79dd897c3b120' - 'c1d596ed78f95f8f' -> '' - 'd683cbeded0c8a89' -> 'fe0bcc6a427d5950 << 1241a36e89201f88' - 'e3e79dd897c3b120' -> 'c1d596ed78f95f8f' - 'fe0bcc6a427d5950' -> 'e3e79dd897c3b120' +``` +'branch.large_branch' -> '8eabd22a51c5818c' +'branch.master' -> '2cd30b98d34f28f0' +'branch.test_branch' -> '1241a36e89201f88' +'branch.trydelete' -> '51bec9f355627596' +'head' -> 'branch.master' + + # Commit Hash | Parent Commit + ------------------------------------- +'1241a36e89201f88' -> '8a6004f205fd7169' +'2cd30b98d34f28f0' -> '9ec29571d67fa95f << 51bec9f355627596' +'51bec9f355627596' -> 'd683cbeded0c8a89' +'69a09d87ea946f43' -> 'd683cbeded0c8a89' +'8a6004f205fd7169' -> 'a320ae935fc3b91b' +'8eabd22a51c5818c' -> 'c1d596ed78f95f8f' +'9ec29571d67fa95f' -> '69a09d87ea946f43 << 8eabd22a51c5818c' +'a320ae935fc3b91b' -> 'e3e79dd897c3b120' +'c1d596ed78f95f8f' -> '' +'d683cbeded0c8a89' -> 'fe0bcc6a427d5950 << 1241a36e89201f88' +'e3e79dd897c3b120' -> 'c1d596ed78f95f8f' +'fe0bcc6a427d5950' -> 'e3e79dd897c3b120' +``` Because the raw commit hash logs can be quite dense to parse, a graphical logging utility is included as part of the repository. Running the `Repository.log()` method will pretty print a graph representation of the commit history: -``` {.sourceCode .python} +```python >>> from hangar import Repository >>> repo = Repository(path='/foo/bar/path/') - -... 
# make some commits - +# make some commits >>> repo.log() ``` diff --git a/mkdocs.yml b/mkdocs.yml index 60b3d162..b031c11f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -31,6 +31,8 @@ markdown_extensions: - admonition - pymdownx.details - pymdownx.superfences + - pymdownx.highlight: + use_pygments: true extra: social: @@ -48,14 +50,18 @@ nav: - Quickstart: quickstart.md - Installation: installation.md - Concepts: concepts.md - - API: api.md + - API: + - Repository & Remotes: api/repository.md + - Write-Enabled Checkout: api/writer_checkout.md + - Read-Only Checkout: api/reader_checkout.md + - Machine Learning Dataloaders: api/dataloaders.md - Tutorials: - - Tutorial-001.ipynb - - Tutorial-002.ipynb - - Tutorial-003.ipynb - - Tutorial-Dataloader.ipynb - - Tutorial-QuickStart.ipynb - - Tutorial-RealQuickStart.ipynb + - Quick Start Tutorial: Tutorial-QuickStart.ipynb + - Part 1 - Creating A Repository And Working With Data: Tutorial-001.ipynb + - Part 2 - Checkouts, Branching, & Merging: Tutorial-002.ipynb + - Part 3 - Working With Remote Servers: Tutorial-003.ipynb + - Dataloaders for Machine Learning (Tensorflow & PyTorch): Tutorial-Dataloader.ipynb + - Real World Quick Start Tutorial: Tutorial-RealQuickStart.ipynb - Design: design.md #- CLI: cli.md - Externals: externals.md @@ -66,10 +72,11 @@ nav: - HDF5_01: backends/hdf5_01.md - NUMPY_10: backends/numpy_10.md - LMDB_30: backends/lmdb_30.md + - LMDB_31: backends/lmdb_31.md - REMOTE_50: backends/remote_50.md - Contributing: Contributing: contributing.md - Codeofconduct: codeofconduct.md + Code Of Conduct: codeofconduct.md Benchmarking: benchmarking.md - Authors: authors.md - Changelog: changelog.md diff --git a/src/hangar/backends/__init__.py b/src/hangar/backends/__init__.py index 5b71e539..8e12eb35 100644 --- a/src/hangar/backends/__init__.py +++ b/src/hangar/backends/__init__.py @@ -11,29 +11,29 @@ to. Valid characters are the union of ``ascii_lowercase``, ``ascii_uppercase``, and ``ascii_digits``: -.. centered:: ``abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789`` + ``abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789`` Though stored as bytes in the backend, we use human readable characters (and not unprintable bytes) to aid in human tasks like developer database dumps and debugging. The characters making up the two digit code have the following symantic meanings: - * First Character (element 0) indicates the ``backend type`` used. +* First Character (element 0) indicates the ``backend type`` used. - * Second character (element 1) indicates the ``version`` of the backend type - which should be used to parse the specification & accesss data (more on - this later) +* Second character (element 1) indicates the ``version`` of the backend type + which should be used to parse the specification & accesss data (more on + this later) The number of codes possible (a 2-choice permutation with repetition) is: 3844 which we anticipate to be more then sufficient long into the future. As a convention, the range of values in which the first digit of the code falls into can be used to identify the storage medium location: - * Lowercase ``ascii_letters`` & digits ``[0, 1, 2, 3, 4]`` -> reserved for - backends handling data on the local disk. +* Lowercase ``ascii_letters`` & digits ``[0, 1, 2, 3, 4]`` -> reserved for + backends handling data on the local disk. - * Uppercase ``ascii_letters`` & digits ``[5, 6, 7, 8, 9]`` -> reserved for - backends referring to data residing on a remote server. 
+* Uppercase ``ascii_letters`` & digits ``[5, 6, 7, 8, 9]`` -> reserved for + backends referring to data residing on a remote server. This is not a hard and fast rule though, and can be changed in the future if the need arises. @@ -44,39 +44,39 @@ In order to maintain backwards compatibility across versions of Hangar into the future the following ruleset is specified and MUST BE HONORED: -* When a new backend is proposed, the contributor(s) provide the class with a - meaningful name (``HDF5``, ``NUMPY``, ``TILEDB``, etc) identifying the - backend to Hangar developers. The review team will provide: +* When a new backend is proposed, the contributor(s) provide the class with a + meaningful name (``HDF5``, ``NUMPY``, ``TILEDB``, etc) identifying the + backend to Hangar developers. The review team will provide: - - ``backend type`` code - - ``version`` code + - ``backend type`` code + - ``version`` code - which all records related to that implementation identify themselves with. In - addition, Externally facing classes / methods go by a canonical name which is - the concatenation of the ``meaningful name`` and assigned ``"format code"`` - ie. for ``backend name: 'NUMPY'`` assigned ``type code: '1'`` and ``version - code: '0'`` must start external method/class names with: ``NUMPY_10_foo`` + which all records related to that implementation identify themselves with. In + addition, Externally facing classes / methods go by a canonical name which is + the concatenation of the ``meaningful name`` and assigned ``"format code"`` + ie. for ``backend name: 'NUMPY'`` assigned ``type code: '1'`` and ``version + code: '0'`` must start external method/class names with: ``NUMPY_10_foo`` -* Once a new backend is accepted, the code assigned to it is PERMANENT & - UNCHANGING. The same code cannot be used in the future for other backends. +* Once a new backend is accepted, the code assigned to it is PERMANENT & + UNCHANGING. The same code cannot be used in the future for other backends. -* Each backend independently determines the information it needs to log/store - to uniquely identify and retrieve a sample stored by it. There is no standard - format, each is free to define whatever fields they find most convenient. - Unique encode/decode methods are defined in order to serialize this - information to bytes and then reconstruct the information later. These bytes - are what are passed in when a retrieval request is made, and returned when a - storage request for some piece of data is performed. +* Each backend independently determines the information it needs to log/store + to uniquely identify and retrieve a sample stored by it. There is no standard + format, each is free to define whatever fields they find most convenient. + Unique encode/decode methods are defined in order to serialize this + information to bytes and then reconstruct the information later. These bytes + are what are passed in when a retrieval request is made, and returned when a + storage request for some piece of data is performed. -* Once accepted, The record format specified (ie. the byte representation - described above) cannot be modified in any way. This must remain permanent! +* Once accepted, The record format specified (ie. the byte representation + described above) cannot be modified in any way. This must remain permanent! 
-* Backend (internal) methods can be updated, optimized, and/or changed at any - time so long as: +* Backend (internal) methods can be updated, optimized, and/or changed at any + time so long as: - * No changes to the record format specification are introduced + * No changes to the record format specification are introduced - * Data stored via any previous iteration of the backend's accessor methods + * Data stored via any previous iteration of the backend's accessor methods can be retrieved bitwise exactly by the "updated" version. Before proposing a new backend or making changes to this file, please consider diff --git a/src/hangar/backends/hdf5_00.py b/src/hangar/backends/hdf5_00.py index 55d79bd1..8f694bfe 100644 --- a/src/hangar/backends/hdf5_00.py +++ b/src/hangar/backends/hdf5_00.py @@ -3,103 +3,58 @@ Backend Identifiers =================== -* Backend: ``0`` -* Version: ``0`` -* Format Code: ``00`` -* Canonical Name: ``HDF5_00`` +* Backend: ``0`` +* Version: ``0`` +* Format Code: ``00`` +* Canonical Name: ``HDF5_00`` Storage Method ============== -* Data is written to specific subarray indexes inside an HDF5 "dataset" in a - single HDF5 File. +* Data is written to specific subarray indexes inside an HDF5 "dataset" in a + single HDF5 File. -* In each HDF5 File there are ``COLLECTION_COUNT`` "datasets" (named ``["0" : +* In each HDF5 File there are ``COLLECTION_COUNT`` "datasets" (named ``["0" : "{COLLECTION_COUNT}"]``). These are referred to as ``"dataset number"`` -* Each dataset is a zero-initialized array of: +* Each dataset is a zero-initialized array of: - * ``dtype: {schema_dtype}``; ie ``np.float32`` or ``np.uint8`` + * ``dtype: {schema_dtype}``; ie ``np.float32`` or ``np.uint8`` - * ``shape: (COLLECTION_SIZE, *{schema_shape.size})``; ie ``(500, 10)`` or + * ``shape: (COLLECTION_SIZE, *{schema_shape.size})``; ie ``(500, 10)`` or ``(500, 300)``. The first index in the dataset is referred to as a ``collection index``. See technical note below for detailed explanation on why the flatten operaiton is performed. -* Compression Filters, Chunking Configuration/Options are applied globally for - all ``datasets`` in a file at dataset creation time. +* Compression Filters, Chunking Configuration/Options are applied globally for + all ``datasets`` in a file at dataset creation time. -* On read and write of all samples the xxhash64_hexdigest is calculated for - the raw array bytes. This is to ensure that all data in == data out of the - hdf5 files. That way even if a file is manually edited (bypassing fletcher32 - filter check) we have a quick way to tell that things are not as they should - be. +* On read and write of all samples the xxhash64_hexdigest is calculated for + the raw array bytes. This is to ensure that all data in == data out of the + hdf5 files. That way even if a file is manually edited (bypassing fletcher32 + filter check) we have a quick way to tell that things are not as they should + be. 
-Compression Options -=================== - -Accepts dictionary containing keys - -* ``backend`` == ``"00"`` -* ``complib`` -* ``complevel`` -* ``shuffle`` - -Blosc-HDF5 - -* ``complib`` valid values: - - * ``'blosc:blosclz'``, - * ``'blosc:lz4'``, - * ``'blosc:lz4hc'``, - * ``'blosc:zlib'``, - * ``'blosc:zstd'`` - -* ``complevel`` valid values: [0, 9] where 0 is "no compression" and 9 is - "most compression" - -* ``shuffle`` valid values: - - * ``None`` - * ``'none'`` - * ``'byte'`` - * ``'bit'`` - - -LZF Filter - -* ``'complib' == 'lzf'`` -* ``'shuffle'`` one of ``[False, None, 'none', True, 'byte']`` -* ``'complevel'`` one of ``[False, None, 'none']`` - -GZip Filter - -* ``'complib' == 'gzip'`` -* ``'shuffle'`` one of ``[False, None, 'none', True, 'byte']`` -* ``complevel`` valid values: [0, 9] where 0 is "no compression" and 9 is - "most compression" - - -Technical Details ------------------ -- Files are read only after initial creation/writes. Only a write-enabled +Technical Notes +--------------- +* Files are read only after initial creation/writes. Only a write-enabled checkout can open a HDF5 file in ``"w"`` or ``"a"`` mode, and writer checkouts create new files on every checkout, and make no attempt to fill in unset locations in previous files. This is not an issue as no disk space is used until data is written to the initially created "zero-initialized" collection datasets -- On write: Single Writer Multiple Reader (``SWMR``) mode is set to ensure that +* On write: Single Writer Multiple Reader (``SWMR``) mode is set to ensure that improper closing (not calling ``.close()``) method does not corrupt any data which had been previously flushed to the file. -- On read: SWMR is set to allow multiple readers (in different threads / +* On read: SWMR is set to allow multiple readers (in different threads / processes) to read from the same file. File handle serialization is handled via custom python ``pickle`` serialization/reduction logic which is implemented by the high level ``pickle`` reduction ``__set_state__()``, ``__get_state__()`` class methods. -- An optimization is performed in order to increase the read / write +* An optimization is performed in order to increase the read / write performance of variable shaped datasets. Due to the way that we initialize an entire HDF5 file with all datasets pre-created (to the size of the max subarray shape), we need to ensure that storing smaller sized arrays (in a @@ -120,38 +75,84 @@ operation). This is part of the reason that we only accept C ordered arrays as input to Hangar. 
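
As a rough illustration of how a user might pin a column to this backend and hand it the filter settings described in the Compression Options section that follows, consider the sketch below. The `add_ndarray_column()` call and its `backend` / `backend_options` keyword arguments are assumptions about the public column-creation API, not something defined in this module.

```python
import numpy as np
from hangar import Repository

repo = Repository(path='/foo/bar/path/')
co = repo.checkout(write=True)

# hypothetical: request format code "00" explicitly and pass blosc filter options
co.add_ndarray_column(
    'train_images',
    prototype=np.zeros((28, 28), dtype=np.uint8),
    backend='00',
    backend_options={'complib': 'blosc:zstd', 'complevel': 5, 'shuffle': 'byte'},
)
co.commit('image column stored via the HDF5_00 backend')
co.close()
```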
+ +Compression Options +=================== + +Accepts dictionary containing keys + +* ``backend`` == ``"00"`` +* ``complib`` +* ``complevel`` +* ``shuffle`` + +Blosc-HDF5 + +* ``complib`` valid values: + + * ``'blosc:blosclz'``, + * ``'blosc:lz4'``, + * ``'blosc:lz4hc'``, + * ``'blosc:zlib'``, + * ``'blosc:zstd'`` + +* ``complevel`` valid values: [0, 9] where 0 is "no compression" and 9 is + "most compression" + +* ``shuffle`` valid values: + + * ``None`` + * ``'none'`` + * ``'byte'`` + * ``'bit'`` + + +LZF Filter + +* ``'complib' == 'lzf'`` +* ``'shuffle'`` one of ``[False, None, 'none', True, 'byte']`` +* ``'complevel'`` one of ``[False, None, 'none']`` + +GZip Filter + +* ``'complib' == 'gzip'`` +* ``'shuffle'`` one of ``[False, None, 'none', True, 'byte']`` +* ``complevel`` valid values: [0, 9] where 0 is "no compression" and 9 is + "most compression" + + Record Format ============= Fields Recorded for Each Array ------------------------------ -* Format Code -* File UID -* xxhash64_hexdigest (ie. checksum) -* Dataset Number (``0:COLLECTION_COUNT`` dataset selection) -* Dataset Index (``0:COLLECTION_SIZE`` dataset subarray selection) -* Subarray Shape +* Format Code +* File UID +* xxhash64_hexdigest (ie. checksum) +* Dataset Number (``0:COLLECTION_COUNT`` dataset selection) +* Dataset Index (``0:COLLECTION_SIZE`` dataset subarray selection) +* Subarray Shape Examples -------- 1) Adding the first piece of data to a file: - * Array shape (Subarray Shape): (10, 10) - * File UID: "rlUK3C" - * xxhash64_hexdigest: 8067007c0f05c359 - * Dataset Number: 16 - * Collection Index: 105 + * Array shape (Subarray Shape): (10, 10) + * File UID: "rlUK3C" + * xxhash64_hexdigest: 8067007c0f05c359 + * Dataset Number: 16 + * Collection Index: 105 ``Record Data => "00:rlUK3C:8067007c0f05c359:16:105:10 10"`` 2) Adding to a piece of data to a the middle of a file: - * Array shape (Subarray Shape): (20, 2, 3) - * File UID: "rlUK3C" - * xxhash64_hexdigest: b89f873d3d153a9c - * Dataset Number: "3" - * Collection Index: 199 + * Array shape (Subarray Shape): (20, 2, 3) + * File UID: "rlUK3C" + * xxhash64_hexdigest: b89f873d3d153a9c + * Dataset Number: "3" + * Collection Index: 199 ``Record Data => "00:rlUK3C:b89f873d3d153a9c:8:199:20 2 3"`` @@ -522,14 +523,13 @@ def delete_in_process_data(repo_path: Path, *, remote_operation=False) -> None: def _dataset_opts(complib: str, complevel: int, shuffle: Union[bool, str]) -> dict: """specify compression options for the hdf5 dataset. - .. seealso:: :function:`_blosc_opts` - - to enable blosc compression, use the conda-forge `blosc-hdf5-plugin` package. + To enable blosc compression, use the conda-forge `blosc-hdf5-plugin` package. - .. seealso:: + !!! seealso - * https://github.com/conda-forge/staged-recipes/pull/7650 - * https://github.com/h5py/h5py/issues/611 + * :function:`_blosc_opts` + * https://github.com/conda-forge/staged-recipes/pull/7650 + * https://github.com/h5py/h5py/issues/611 Parameters ---------- @@ -648,7 +648,7 @@ def _create_schema(self, *, remote_operation: bool = False): maximum performance, this value should be set approximately 100 times that number of chunks. - .. seealso:: + !!! 
seealso http://docs.h5py.org/en/stable/high/file.html#chunk-cache diff --git a/src/hangar/backends/hdf5_01.py b/src/hangar/backends/hdf5_01.py index a133042c..182ca585 100644 --- a/src/hangar/backends/hdf5_01.py +++ b/src/hangar/backends/hdf5_01.py @@ -3,158 +3,76 @@ Backend Identifiers =================== -* Backend: ``0`` -* Version: ``1`` -* Format Code: ``01`` -* Canonical Name: ``HDF5_01`` +* Backend: ``0`` +* Version: ``1`` +* Format Code: ``01`` +* Canonical Name: ``HDF5_01`` Storage Method ============== -* This module is meant to handle larger datasets which are of fixed size. IO +* This module is meant to handle larger datasets which are of fixed size. IO and significant compression optimization is achieved by storing arrays at their appropriate top level index in the same shape they naturally assume and chunking over the entire subarray domain making up a sample (rather than having to subdivide chunks when the sample could be variably shaped.) -* Data is written to specific subarray indexes inside an HDF5 "dataset" in a +* Data is written to specific subarray indexes inside an HDF5 "dataset" in a single HDF5 File. -* In each HDF5 File there are ``COLLECTION_COUNT`` "datasets" (named ``["0" : +* In each HDF5 File there are ``COLLECTION_COUNT`` "datasets" (named ``["0" : "{COLLECTION_COUNT}"]``). These are referred to as ``"dataset number"`` -* Each dataset is a zero-initialized array of: +* Each dataset is a zero-initialized array of: - * ``dtype: {schema_dtype}``; ie ``np.float32`` or ``np.uint8`` + * ``dtype: {schema_dtype}``; ie ``np.float32`` or ``np.uint8`` - * ``shape: (COLLECTION_SIZE, *{schema_shape})``; ie ``(500, 10, 10)`` or + * ``shape: (COLLECTION_SIZE, *{schema_shape})``; ie ``(500, 10, 10)`` or ``(500, 512, 512, 320)``. The first index in the dataset is referred to as a ``collection index``. -* Compression Filters, Chunking Configuration/Options are applied globally for +* Compression Filters, Chunking Configuration/Options are applied globally for all ``datasets`` in a file at dataset creation time. -* On read and write of all samples the xxhash64_hexdigest is calculated for +* On read and write of all samples the xxhash64_hexdigest is calculated for the raw array bytes. This is to ensure that all data in == data out of the hdf5 files. That way even if a file is manually edited (bypassing fletcher32 filter check) we have a quick way to tell that things are not as they should be. -Compression Options -=================== - -Accepts dictionary containing keys - -* ``backend`` == ``"01"`` -* ``complib`` -* ``complevel`` -* ``shuffle`` - -Blosc-HDF5 - -* ``complib`` valid values: - - * ``'blosc:blosclz'``, - * ``'blosc:lz4'``, - * ``'blosc:lz4hc'``, - * ``'blosc:zlib'``, - * ``'blosc:zstd'`` - -* ``complevel`` valid values: [0, 9] where 0 is "no compression" and 9 is - "most compression" - -* ``shuffle`` valid values: - - * ``None`` - * ``'none'`` - * ``'byte'`` - * ``'bit'`` - - -LZF Filter - -* ``'complib' == 'lzf'`` -* ``'shuffle'`` one of ``[False, None, 'none', True, 'byte']`` -* ``'complevel'`` one of ``[False, None, 'none']`` - -GZip Filter - -* ``'complib' == 'gzip'`` -* ``'shuffle'`` one of ``[False, None, 'none', True, 'byte']`` -* ``complevel`` valid values: [0, 9] where 0 is "no compression" and 9 is - "most compression" - - -Record Format -============= - -Fields Recorded for Each Array ------------------------------- - -* Format Code -* File UID -* xxhash64_hexdigest (ie. 
checksum) -* Dataset Number (``0:COLLECTION_COUNT`` dataset selection) -* Dataset Index (``0:COLLECTION_SIZE`` dataset subarray selection) -* Subarray Shape - -Examples --------- - -1) Adding the first piece of data to a file: - - * Array shape (Subarray Shape): (10, 10) - * File UID: "rlUK3C" - * xxhash64_hexdigest: 8067007c0f05c359 - * Dataset Number: 16 - * Collection Index: 105 - - ``Record Data => "01:rlUK3C:8067007c0f05c359:16:105:10 10"`` - -1) Adding to a piece of data to a the middle of a file: - - * Array shape (Subarray Shape): (20, 2, 3) - * File UID: "rlUK3C" - * xxhash64_hexdigest: b89f873d3d153a9c - * Dataset Number: "3" - * Collection Index: 199 - - ``Record Data => "01:rlUK3C:b89f873d3d153a9c:8:199:20 2 3"`` - - Technical Notes -=============== - -* The majority of methods not directly related to "chunking" and the "raw data - chunk cache" are either identical to HDF5_00, or only slightly modified. - -* Files are read only after initial creation/writes. Only a write-enabled - checkout can open a HDF5 file in ``"w"`` or ``"a"`` mode, and writer - checkouts create new files on every checkout, and make no attempt to fill in - unset locations in previous files. This is not an issue as no disk space is - used until data is written to the initially created "zero-initialized" - collection datasets - -* On write: Single Writer Multiple Reader (``SWMR``) mode is set to ensure that - improper closing (not calling ``.close()``) method does not corrupt any data - which had been previously flushed to the file. - -* On read: SWMR is set to allow multiple readers (in different threads / - processes) to read from the same file. File handle serialization is handled - via custom python ``pickle`` serialization/reduction logic which is - implemented by the high level ``pickle`` reduction ``__set_state__()``, - ``__get_state__()`` class methods. - -* An optimization is performed in order to increase the read / write - performance of fixed size datasets. Due to the way that we initialize an - entire HDF5 file with all datasets pre-created (to the size of the fixed - subarray shape), and the fact we absolutely know the size / shape / - access-pattern of the arrays, inefficient IO due to wasted chunk processing - is not a concern. It is far more efficient for us to completely blow off the - metadata chunk cache, and chunk each subarray as a single large item item. - - This method of processing tends to have a number of significant effects as - compared to chunked storage methods: +--------------- + +* The majority of methods not directly related to "chunking" and the "raw data + chunk cache" are either identical to HDF5_00, or only slightly modified. + +* Files are read only after initial creation/writes. Only a write-enabled + checkout can open a HDF5 file in ``"w"`` or ``"a"`` mode, and writer + checkouts create new files on every checkout, and make no attempt to fill in + unset locations in previous files. This is not an issue as no disk space is + used until data is written to the initially created "zero-initialized" + collection datasets + +* On write: Single Writer Multiple Reader (``SWMR``) mode is set to ensure that + improper closing (not calling ``.close()``) method does not corrupt any data + which had been previously flushed to the file. + +* On read: SWMR is set to allow multiple readers (in different threads / + processes) to read from the same file. 
File handle serialization is handled + via custom python ``pickle`` serialization/reduction logic which is + implemented by the high level ``pickle`` reduction ``__set_state__()``, + ``__get_state__()`` class methods. + +* An optimization is performed in order to increase the read / write + performance of fixed size datasets. Due to the way that we initialize an + entire HDF5 file with all datasets pre-created (to the size of the fixed + subarray shape), and the fact we absolutely know the size / shape / + access-pattern of the arrays, inefficient IO due to wasted chunk processing + is not a concern. It is far more efficient for us to completely blow off the + metadata chunk cache, and chunk each subarray as a single large item item. + + This method of processing tends to have a number of significant effects as + compared to chunked storage methods: 1. **Compression rations improve** (by a non-trivial factor). This is simply due to the fact that a larger amount of raw data is being passed @@ -173,7 +91,7 @@ the numeric array, completly decoupling performance from HDF5's ability to parallelize internal filter pipeline operations. - Additionally, since the entire requested chunk is retrieved in a + Additionally, since the entire requested chunk is retrieved in a single decompression pipeline run, there is no need for the HDF5 core to initialize an intermediate buffer which holds data chunks as each decompression operation completes. Futher, by preinitializing an empty @@ -190,12 +108,91 @@ reducing the time spent waiting on hard disk IO while incuring a negligible cost to decompression speed. - Taking all of these effects into account, there can be up to an order of - magnitude increase in read performance as compared to the subarray chunking - strategy employed by the ``HDF5_00`` backend. + Taking all of these effects into account, there can be up to an order of + magnitude increase in read performance as compared to the subarray chunking + strategy employed by the ``HDF5_00`` backend. + +* Like all other backends at the time of writing, only 'C' ordered arrays + are accepted by this method. + +Compression Options +=================== + +Accepts dictionary containing keys + +* ``backend`` == ``"01"`` +* ``complib`` +* ``complevel`` +* ``shuffle`` + +Blosc-HDF5 + +* ``complib`` valid values: + + * ``'blosc:blosclz'``, + * ``'blosc:lz4'``, + * ``'blosc:lz4hc'``, + * ``'blosc:zlib'``, + * ``'blosc:zstd'`` + +* ``complevel`` valid values: [0, 9] where 0 is "no compression" and 9 is + "most compression" + +* ``shuffle`` valid values: -* Like all other backends at the time of writing, only 'C' ordered arrays - are accepted by this method. + * ``None`` + * ``'none'`` + * ``'byte'`` + * ``'bit'`` + +LZF Filter + +* ``'complib' == 'lzf'`` +* ``'shuffle'`` one of ``[False, None, 'none', True, 'byte']`` +* ``'complevel'`` one of ``[False, None, 'none']`` + +GZip Filter + +* ``'complib' == 'gzip'`` +* ``'shuffle'`` one of ``[False, None, 'none', True, 'byte']`` +* ``complevel`` valid values: [0, 9] where 0 is "no compression" and 9 is + "most compression" + + +Record Format +============= + +Fields Recorded for Each Array +------------------------------ + +* Format Code +* File UID +* xxhash64_hexdigest (ie. 
checksum) +* Dataset Number (``0:COLLECTION_COUNT`` dataset selection) +* Dataset Index (``0:COLLECTION_SIZE`` dataset subarray selection) +* Subarray Shape + +Examples +-------- +1) Adding the first piece of data to a file: + + * Array shape (Subarray Shape): (10, 10) + * File UID: "rlUK3C" + * xxhash64_hexdigest: 8067007c0f05c359 + * Dataset Number: 16 + * Collection Index: 105 + + ``Record Data => "01:rlUK3C:8067007c0f05c359:16:105:10 10"`` + +2) Adding to a piece of data to a the middle of a file: + + * Array shape (Subarray Shape): (20, 2, 3) + * File UID: "rlUK3C" + * xxhash64_hexdigest: b89f873d3d153a9c + * Dataset Number: "3" + * Collection Index: 199 + + ``Record Data => "01:rlUK3C:b89f873d3d153a9c:8:199:20 2 3"`` """ import logging import math @@ -562,14 +559,13 @@ def delete_in_process_data(repo_path: Path, *, remote_operation=False) -> None: def _dataset_opts(complib: str, complevel: int, shuffle: Union[bool, str]) -> dict: """specify compression options for the hdf5 dataset. - .. seealso:: :function:`_blosc_opts` - - to enable blosc compression, use the conda-forge `blosc-hdf5-plugin` package. + To enable blosc compression, use the conda-forge `blosc-hdf5-plugin` package. - .. seealso:: + !!! seealso - * https://github.com/conda-forge/staged-recipes/pull/7650 - * https://github.com/h5py/h5py/issues/611 + * :function:`_blosc_opts` + * https://github.com/conda-forge/staged-recipes/pull/7650 + * https://github.com/h5py/h5py/issues/611 Parameters ---------- @@ -657,7 +653,7 @@ def _create_schema(self, *, remote_operation: bool = False): maximum performance, this value should be set approximately 100 times that number of chunks. - .. seealso:: + !!! seealso http://docs.h5py.org/en/stable/high/file.html#chunk-cache diff --git a/src/hangar/backends/lmdb_30.py b/src/hangar/backends/lmdb_30.py index 851b30b5..9f2a0101 100644 --- a/src/hangar/backends/lmdb_30.py +++ b/src/hangar/backends/lmdb_30.py @@ -3,41 +3,41 @@ Backend Identifiers =================== -* Backend: ``3`` -* Version: ``0`` -* Format Code: ``30`` -* Canonical Name: ``LMDB_30`` +* Backend: ``3`` +* Version: ``0`` +* Format Code: ``30`` +* Canonical Name: ``LMDB_30`` Storage Method ============== -* This module is meant to handle string typed data which is of any size. IO - is performed via the LMDB storage system. +* This module is meant to handle string typed data which is of any size. IO + is performed via the LMDB storage system. -* This module does not compress values upon writing, the full (uncompressed) - value of the text is written to the DB for each key. +* This module does not compress values upon writing, the full (uncompressed) + value of the text is written to the DB for each key. -* For each LMDB file generated, data is indexed by keys which are generated - in lexicographically sorted order of key length 4. Keys consist of 4 characters - chosen from an alphabet consisting of ASCII digits, lowercase letters, and - upercase letters. Within a single write instance (when an LMDB file is created - and written to), lexicographically sorted permutations of the chosen characters - are used as key indexes. +* For each LMDB file generated, data is indexed by keys which are generated + in lexicographically sorted order of key length 4. Keys consist of 4 characters + chosen from an alphabet consisting of ASCII digits, lowercase letters, and + upercase letters. 
Within a single write instance (when an LMDB file is created + and written to), lexicographically sorted permutations of the chosen characters + are used as key indexes. - This means that for each LMDB file written in a repo, the sequence of generated - index keys will be identical, even though two databases with the same key will - store different values. As such, the File UID is crucial in order to identify - a unique db/index key combo to access a particular value by. + This means that for each LMDB file written in a repo, the sequence of generated + index keys will be identical, even though two databases with the same key will + store different values. As such, the File UID is crucial in order to identify + a unique db/index key combo to access a particular value by. -* There is no limit to the size which each record can occupy. Data is stored - "as-is" and is uncompressed. Reading the data back will return the exact - data stored (regardless of how large the data record is). +* There is no limit to the size which each record can occupy. Data is stored + "as-is" and is uncompressed. Reading the data back will return the exact + data stored (regardless of how large the data record is). -* On read and write of all samples the xxhash64_hexdigest is calculated for - the raw data bytes. This is to ensure that all data in == data out of the - lmdb files. That way even if a file is manually edited we have a quick way - to tell that things are not as they should be. (full data hash digests may - not be calculated every time a read is performed). +* On read and write of all samples the xxhash64_hexdigest is calculated for + the raw data bytes. This is to ensure that all data in == data out of the + lmdb files. That way even if a file is manually edited we have a quick way + to tell that things are not as they should be. (full data hash digests may + not be calculated every time a read is performed). 
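
To make the description above concrete, the following sketch writes a couple of string samples that would be persisted through this backend. The `add_str_column()` method and the ability to request format code `'30'` via a `backend` keyword are assumptions about the public API rather than part of this module.

```python
from hangar import Repository

repo = Repository(path='/foo/bar/path/')
co = repo.checkout(write=True)

# hypothetical column creation; values are stored uncompressed in LMDB, keyed
# internally by the generated 4-character row indexes described above
captions = co.add_str_column('captions', backend='30')
captions['sample_001'] = 'a tabby cat sitting on a windowsill'
captions['sample_002'] = 'two dogs playing in fresh snow'

co.commit('add caption strings stored via LMDB_30')
co.close()
```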
Compression Options =================== @@ -50,34 +50,33 @@ Fields Recorded for Each Array ------------------------------ -* Format Code -* File UID -* Row Index +* Format Code +* File UID +* Row Index Examples -------- +1) Adding the first piece of data to a file: -1) Adding the first piece of data to a file: - - * File UID: "rlUK3C" - * Row Index: "0123" - * xxhash64_hexdigest: 8067007c0f05c359 + * File UID: "rlUK3C" + * Row Index: "0123" + * xxhash64_hexdigest: 8067007c0f05c359 ``Record Data => "30:rlUK3C:0123:8067007c0f05c359"`` -2) Adding a second piece of data: +2) Adding a second piece of data: - * File UID: "rlUK3C" - * Row Index: "0124" - * xxhash64_hexdigest: b89f873d3d153a9c + * File UID: "rlUK3C" + * Row Index: "0124" + * xxhash64_hexdigest: b89f873d3d153a9c ``Record Data => "30:rlUK3C:0124:b89f873d3d153a9c"`` -3) Adding a the 500th piece of data: +3) Adding a the 500th piece of data: - * File UID: "rlUK3C" - * Row Index: "01AU" - * xxhash64_hexdigest: cf3fc53cad153a5a + * File UID: "rlUK3C" + * Row Index: "01AU" + * xxhash64_hexdigest: cf3fc53cad153a5a ``Record Data => "30:rlUK3C:01AU:cf3fc53cad153a5a"`` """ diff --git a/src/hangar/backends/lmdb_31.py b/src/hangar/backends/lmdb_31.py index 7a24e7f1..a9cb3bb7 100644 --- a/src/hangar/backends/lmdb_31.py +++ b/src/hangar/backends/lmdb_31.py @@ -1,43 +1,43 @@ -"""Local LMDB Backend Implementation, Identifier: ``LMDB_30`` +"""Local LMDB Backend Implementation, Identifier: ``LMDB_31`` Backend Identifiers =================== -* Backend: ``3`` -* Version: ``1`` -* Format Code: ``31`` -* Canonical Name: ``LMDB_31`` +* Backend: ``3`` +* Version: ``1`` +* Format Code: ``31`` +* Canonical Name: ``LMDB_31`` Storage Method ============== -* This module is meant to handle bbytes typed data which is of any size. - less than 2MB per value. IO is performed via the LMDB storage system. +* This module is meant to handle bytes typed data which is of any size. + less than 2MB per value. IO is performed via the LMDB storage system. -* This module does not compress values upon writing, the full (uncompressed) - value of the text is written to the DB for each key. +* This module does not compress values upon writing, the full (uncompressed) + value of the text is written to the DB for each key. -* For each LMDB file generated, data is indexed by keys which are generated - in lexicographically sorted order of key length 4. Keys consist of 4 characters - chosen from an alphabet consisting of ASCII digits, lowercase letters, and - upercase letters. Within a single write instance (when an LMDB file is created - and written to), lexicographically sorted permutations of the chosen characters - are used as key indexes. +* For each LMDB file generated, data is indexed by keys which are generated + in lexicographically sorted order of key length 4. Keys consist of 4 characters + chosen from an alphabet consisting of ASCII digits, lowercase letters, and + upercase letters. Within a single write instance (when an LMDB file is created + and written to), lexicographically sorted permutations of the chosen characters + are used as key indexes. - This means that for each LMDB file written in a repo, the sequence of generated - index keys will be identical, even though two databases with the same key will - store different values. As such, the File UID is crucial in order to identify - a unique db/index key combo to access a particular value by. 
+ This means that for each LMDB file written in a repo, the sequence of generated + index keys will be identical, even though two databases with the same key will + store different values. As such, the File UID is crucial in order to identify + a unique db/index key combo to access a particular value by. -* There is no limit to the size which each record can occupy. Data is stored - "as-is" and is uncompressed. Reading the data back will return the exact - data stored (regardless of how large the data record is). +* There is no limit to the size which each record can occupy. Data is stored + "as-is" and is uncompressed. Reading the data back will return the exact + data stored (regardless of how large the data record is). -* On read and write of all samples the xxhash64_hexdigest is calculated for - the raw data bytes. This is to ensure that all data in == data out of the - lmdb files. That way even if a file is manually edited we have a quick way - to tell that things are not as they should be. (full data hash digests may - not be calculated every time a read is performed). +* On read and write of all samples the xxhash64_hexdigest is calculated for + the raw data bytes. This is to ensure that all data in == data out of the + lmdb files. That way even if a file is manually edited we have a quick way + to tell that things are not as they should be. (full data hash digests may + not be calculated every time a read is performed). Compression Options =================== @@ -50,34 +50,33 @@ Fields Recorded for Each Array ------------------------------ -* Format Code -* File UID -* Row Index +* Format Code +* File UID +* Row Index Examples -------- +1) Adding the first piece of data to a file: -1) Adding the first piece of data to a file: - - * File UID: "rlUK3C" - * Row Index: "0123" - * xxhash64_hexdigest: 8067007c0f05c359 + * File UID: "rlUK3C" + * Row Index: "0123" + * xxhash64_hexdigest: 8067007c0f05c359 ``Record Data => "31:rlUK3C:0123:8067007c0f05c359"`` -2) Adding a second piece of data: +2) Adding a second piece of data: - * File UID: "rlUK3C" - * Row Index: "0124" - * xxhash64_hexdigest: b89f873d3d153a9c + * File UID: "rlUK3C" + * Row Index: "0124" + * xxhash64_hexdigest: b89f873d3d153a9c ``Record Data => "31:rlUK3C:0124:b89f873d3d153a9c"`` -3) Adding a the 500th piece of data: +3) Adding a the 500th piece of data: - * File UID: "rlUK3C" - * Row Index: "01AU" - * xxhash64_hexdigest: cf3fc53cad153a5a + * File UID: "rlUK3C" + * Row Index: "01AU" + * xxhash64_hexdigest: cf3fc53cad153a5a ``Record Data => "31:rlUK3C:01AU:cf3fc53cad153a5a"`` """ diff --git a/src/hangar/backends/numpy_10.py b/src/hangar/backends/numpy_10.py index fe28fd23..e1c9b02f 100644 --- a/src/hangar/backends/numpy_10.py +++ b/src/hangar/backends/numpy_10.py @@ -3,10 +3,10 @@ Backend Identifiers =================== -* Backend: ``1`` -* Version: ``0`` -* Format Code: ``10`` -* Canonical Name: ``NUMPY_10`` +* Backend: ``1`` +* Version: ``0`` +* Format Code: ``10`` +* Canonical Name: ``NUMPY_10`` Storage Method ============== @@ -15,11 +15,27 @@ * Each file is a zero-initialized array of - * ``dtype: {schema_dtype}``; ie ``np.float32`` or ``np.uint8`` + * ``dtype: {schema_dtype}``; ie ``np.float32`` or ``np.uint8`` - * ``shape: (COLLECTION_SIZE, *{schema_shape})``; ie ``(500, 10)`` or ``(500, - 4, 3)``. The first index in the array is referred to as a "collection - index". + * ``shape: (COLLECTION_SIZE, *{schema_shape})``; ie ``(500, 10)`` or ``(500, + 4, 3)``. 
The first index in the array is referred to as a "collection + index". + +Technical Notes +--------------- + +* A typical numpy memmap file persisted to disk does not retain information + about its datatype or shape, and as such must be provided when re-opened + after close. In order to persist a memmap in ``.npy`` format, we use the a + special function ``open_memmap`` imported from ``np.lib.format`` which can + open a memmap file and persist necessary header info to disk in ``.npy`` + format. + +* On each write, an ``xxhash64_hexdigest`` checksum is calculated. This is not + for use as the primary hash algorithm, but rather stored in the local record + format itself to serve as a quick way to verify no disk corruption occurred. + This is required since numpy has no built in data integrity validation + methods when reading from disk. Compression Options =================== @@ -32,49 +48,31 @@ Fields Recorded for Each Array ------------------------------ -* Format Code -* File UID -* xxhash64_hexdigest -* Collection Index (0:COLLECTION_SIZE subarray selection) -* Subarray Shape +* Format Code +* File UID +* xxhash64_hexdigest +* Collection Index (0:COLLECTION_SIZE subarray selection) +* Subarray Shape Examples -------- +1) Adding the first piece of data to a file: -1) Adding the first piece of data to a file: - - * Array shape (Subarray Shape): (10, 10) - * File UID: "K3ktxv" - * xxhash64_hexdigest: 94701dd9f32626e2 - * Collection Index: 488 + * Array shape (Subarray Shape): (10, 10) + * File UID: "K3ktxv" + * xxhash64_hexdigest: 94701dd9f32626e2 + * Collection Index: 488 ``Record Data => "10:K3ktxv:94701dd9f32626e2:488:10 10"`` -2) Adding to a piece of data to a the middle of a file: +2) Adding to a piece of data to a the middle of a file: - * Array shape (Subarray Shape): (20, 2, 3) - * File UID: "Mk23nl" - * xxhash64_hexdigest: 1363344b6c051b29 - * Collection Index: 199 + * Array shape (Subarray Shape): (20, 2, 3) + * File UID: "Mk23nl" + * xxhash64_hexdigest: 1363344b6c051b29 + * Collection Index: 199 ``Record Data => "10:Mk23nl:1363344b6c051b29:199:20 2 3"`` - - -Technical Notes -=============== - -* A typical numpy memmap file persisted to disk does not retain information - about its datatype or shape, and as such must be provided when re-opened - after close. In order to persist a memmap in ``.npy`` format, we use the a - special function ``open_memmap`` imported from ``np.lib.format`` which can - open a memmap file and persist necessary header info to disk in ``.npy`` - format. - -* On each write, an ``xxhash64_hexdigest`` checksum is calculated. This is not - for use as the primary hash algorithm, but rather stored in the local record - format itself to serve as a quick way to verify no disk corruption occurred. - This is required since numpy has no built in data integrity validation - methods when reading from disk. """ import os from collections import ChainMap diff --git a/src/hangar/backends/remote_50.py b/src/hangar/backends/remote_50.py index c049421e..54fe0dbd 100644 --- a/src/hangar/backends/remote_50.py +++ b/src/hangar/backends/remote_50.py @@ -3,20 +3,27 @@ Backend Identifiers =================== -* Backend: ``5`` -* Version: ``0`` -* Format Code: ``50`` -* Canonical Name: ``REMOTE_50`` +* Backend: ``5`` +* Version: ``0`` +* Format Code: ``50`` +* Canonical Name: ``REMOTE_50`` Storage Method ============== -* This backend merely acts to record that there is some data sample with some - ``hash`` and ``schema_shape`` present in the repository. 
It does not store the - actual data on the local disk, but indicates that if it should be retrieved, - you need to ask the remote hangar server for it. Once present on the local - disk, the backend locating info will be updated with one of the `local` data - backend specifications. +* This backend merely acts to record that there is some data sample with some + ``hash`` and ``schema_shape`` present in the repository. It does not store the + actual data on the local disk, but indicates that if it should be retrieved, + you need to ask the remote hangar server for it. Once present on the local + disk, the backend locating info will be updated with one of the `local` data + backend specifications. + +Technical Notes +--------------- + +* The schema_hash field is required in order to allow effective placement of + actual retrieved data into suitable sized collections on a ``fetch-data()`` + operation Record Format ============= @@ -24,8 +31,8 @@ Fields Recorded for Each Array ------------------------------ -* Format Code -* Schema Hash +* Format Code +* Schema Hash Separators used --------------- @@ -34,25 +41,17 @@ Examples -------- +1) Adding the first piece of data to a file: -1) Adding the first piece of data to a file: - - * Schema Hash: "ae43A21a" + * Schema Hash: "ae43A21a" ``Record Data => '50:ae43A21a'`` -1) Adding to a piece of data to a the middle of a file: +2) Adding to a piece of data to a the middle of a file: - * Schema Hash: "ae43A21a" + * Schema Hash: "ae43A21a" ``Record Data => '50:ae43A21a'`` - -Technical Notes -=============== - -* The schema_hash field is required in order to allow effective placement of - actual retrieved data into suitable sized collections on a ``fetch-data()`` - operation """ from pathlib import Path from typing import Optional diff --git a/src/hangar/bulk_importer.py b/src/hangar/bulk_importer.py index 16e14151..48ca77ee 100644 --- a/src/hangar/bulk_importer.py +++ b/src/hangar/bulk_importer.py @@ -211,84 +211,84 @@ def run_bulk_import( Examples -------- - >>> import os - >>> import numpy as np - >>> from PIL import Image - >>> from hangar.bulk_importer import UDF_Return, run_bulk_import - >>> def image_loader(file_path): - ... im = Image.open(file_name) - ... arr = np.array(im.resize(512, 512)) - ... im_record = UDF_Return(column='image', key=(category, sample), data=arr) - ... yield im_record - ... - ... root, sample_file = os.path.split(file_path) - ... category = os.path.dirname(root) - ... sample_name, _ = os.path.splitext(sample_file) - ... path_record = UDF_Return(column='file_str', key=(category, sample_name), data=file_path) - ... yield path_record - ... - >>> udf_kwargs = [ - ... {'file_path': '/foo/cat/image_001.jpeg'}, - ... {'file_path': '/foo/cat/image_002.jpeg'}, - ... {'file_path': '/foo/dog/image_001.jpeg'}, - ... {'file_path': '/foo/bird/image_011.jpeg'}, - ... {'file_path': '/foo/bird/image_003.jpeg'} - ... ] - >>> repo = Repository('foo/path/to/repo') - >>> run_bulk_import( - ... repo, branch_name='master', column_names=['file_str', 'image'], - ... udf=image_loader, udf_kwargs=udf_kwargs) + >>> import os + >>> import numpy as np + >>> from PIL import Image + >>> from hangar.bulk_importer import UDF_Return, run_bulk_import + >>> def image_loader(file_path): + ... im = Image.open(file_name) + ... arr = np.array(im.resize(512, 512)) + ... im_record = UDF_Return(column='image', key=(category, sample), data=arr) + ... yield im_record + ... + ... root, sample_file = os.path.split(file_path) + ... category = os.path.dirname(root) + ... 
sample_name, _ = os.path.splitext(sample_file) + ... path_record = UDF_Return(column='file_str', key=(category, sample_name), data=file_path) + ... yield path_record + ... + >>> udf_kwargs = [ + ... {'file_path': '/foo/cat/image_001.jpeg'}, + ... {'file_path': '/foo/cat/image_002.jpeg'}, + ... {'file_path': '/foo/dog/image_001.jpeg'}, + ... {'file_path': '/foo/bird/image_011.jpeg'}, + ... {'file_path': '/foo/bird/image_003.jpeg'} + ... ] + >>> repo = Repository('foo/path/to/repo') + >>> run_bulk_import( + ... repo, branch_name='master', column_names=['file_str', 'image'], + ... udf=image_loader, udf_kwargs=udf_kwargs) However, the following will not work, since the output is non-deterministic. - >>> from hangar.bulk_importer import UDF_Return, run_bulk_import - >>> def nondeterminstic(x, y): - ... first = str(x * y) - ... yield UDF_Return(column='valstr', key=f'{x}_{y}', data=first) - ... - ... second = str(x * y * random()) - ... yield UDF_Return(column='valstr', key=f'{x}_{y}', data=second) - ... - >>> udf_kwargs = [ - ... {'x': 1, 'y': 2}, - ... {'x': 1, 'y': 3}, - ... {'x': 2, 'y': 4}, - ... ] - >>> run_bulk_import( - ... repo, branch_name='master', column_names=['valstr'], - ... udf=image_loader, udf_kwargs=udf_kwargs) - Traceback (most recent call last): - `File "", line 1, in ` - TypeError: contents returned in subbsequent calls to UDF with identical - kwargs yielded different results. UDFs MUST generate deterministic - results for the given inputs. Input kwargs generating this result: - {'x': 1, 'y': 2}. + >>> from hangar.bulk_importer import UDF_Return, run_bulk_import + >>> def nondeterminstic(x, y): + ... first = str(x * y) + ... yield UDF_Return(column='valstr', key=f'{x}_{y}', data=first) + ... + ... second = str(x * y * random()) + ... yield UDF_Return(column='valstr', key=f'{x}_{y}', data=second) + ... + >>> udf_kwargs = [ + ... {'x': 1, 'y': 2}, + ... {'x': 1, 'y': 3}, + ... {'x': 2, 'y': 4}, + ... ] + >>> run_bulk_import( + ... repo, branch_name='master', column_names=['valstr'], + ... udf=image_loader, udf_kwargs=udf_kwargs) + Traceback (most recent call last): + `File "", line 1, in ` + TypeError: contents returned in subbsequent calls to UDF with identical + kwargs yielded different results. UDFs MUST generate deterministic + results for the given inputs. Input kwargs generating this result: + {'x': 1, 'y': 2}. Not all columns must be returned from every input to the UDF, the number of data pieces yielded can also vary arbitrarily (so long as the results are deterministic for a particular set of inputs) - >>> import numpy as np - >>> from hangar.bulk_importer import UDF_Return, run_bulk_import - >>> def maybe_load(x_arr, y_arr, sample_name, columns=['default']): - ... for column in columns: - ... arr = np.multiply(x_arr, y_arr) - ... yield UDF_Return(column=column, key=sample_name, data=arr) - ... # - ... # do some strange processing which only outputs another column sometimes - ... if len(columns) == 1: - ... other = np.array(x_arr.shape) * np.array(y_arr.shape) - ... yield UDF_Return(column='strange_column', key=sample_name, data=other) - ... - >>> udf_kwargs = [ - ... {'x_arr': np.arange(10), 'y_arr': np.arange(10) + 1, 'sample_name': 'sample_1'}, - ... {'x_arr': np.arange(10), 'y_arr': np.arange(10) + 1, 'sample_name': 'sample_2', 'columns': ['foo', 'bar', 'default']}, - ... {'x_arr': np.arange(10) * 2, 'y_arr': np.arange(10), 'sample_name': 'sample_3'}, - ... ] - >>> run_bulk_import( - ... repo, branch_name='master', - ... 
column_names=['default', 'foo', 'bar', 'strange_column'], - ... udf=maybe_load, udf_kwargs=udf_kwargs) + >>> import numpy as np + >>> from hangar.bulk_importer import UDF_Return, run_bulk_import + >>> def maybe_load(x_arr, y_arr, sample_name, columns=['default']): + ... for column in columns: + ... arr = np.multiply(x_arr, y_arr) + ... yield UDF_Return(column=column, key=sample_name, data=arr) + ... # + ... # do some strange processing which only outputs another column sometimes + ... if len(columns) == 1: + ... other = np.array(x_arr.shape) * np.array(y_arr.shape) + ... yield UDF_Return(column='strange_column', key=sample_name, data=other) + ... + >>> udf_kwargs = [ + ... {'x_arr': np.arange(10), 'y_arr': np.arange(10) + 1, 'sample_name': 'sample_1'}, + ... {'x_arr': np.arange(10), 'y_arr': np.arange(10) + 1, 'sample_name': 'sample_2', 'columns': ['foo', 'bar', 'default']}, + ... {'x_arr': np.arange(10) * 2, 'y_arr': np.arange(10), 'sample_name': 'sample_3'}, + ... ] + >>> run_bulk_import( + ... repo, branch_name='master', + ... column_names=['default', 'foo', 'bar', 'strange_column'], + ... udf=maybe_load, udf_kwargs=udf_kwargs) Parameters ---------- diff --git a/src/hangar/checkout.py b/src/hangar/checkout.py index b40e7deb..e96d4829 100644 --- a/src/hangar/checkout.py +++ b/src/hangar/checkout.py @@ -3,7 +3,7 @@ import weakref from contextlib import suppress, ExitStack from uuid import uuid4 -from typing import Optional, Union +from typing import Optional, Union, TYPE_CHECKING import numpy as np import lmdb @@ -32,6 +32,9 @@ schema_record_db_val_from_digest, ) +if TYPE_CHECKING: + from .columns import ModifierTypes + class ReaderCheckout(GetMixin, CheckoutDictIteration): """Checkout the repository as it exists at a particular branch. @@ -57,24 +60,27 @@ class ReaderCheckout(GetMixin, CheckoutDictIteration): 'someothercommithashhere' >>> co.close() - Unlike :class:`WriterCheckout`, any number of :class:`ReaderCheckout` - objects can exist on the repository independently. Like the - ``write-enabled`` variant, the :meth:`close` method should be called after + Unlike [Writer Checkout](#hangar.checkout.WriterCheckout), any number + of [Reader Checkout](#hangar.checkout.ReaderCheckout) objects can exist on + the repository independently. Like the ``write-enabled`` variant, the + [close()](#hangar.checkout.ReaderCheckout.close) method should be called after performing the necessary operations on the repo. However, as there is no concept of a ``lock`` for ``read-only`` checkouts, this is just to free up memory resources, rather than changing recorded access state. In order to reduce the chance that the python interpreter is shut down - without calling :meth:`close`, - a common mistake during ipython / jupyter - sessions - an `atexit `_ - hook is registered to :meth:`close`. If properly closed by the user, the - hook is unregistered after completion with no ill effects. So long as a the - process is NOT terminated via non-python ``SIGKILL``, fatal internal python - error, or or special ``os exit`` methods, cleanup will occur on interpreter - shutdown and resources will be freed. If a non-handled termination method - does occur, the implications of holding resources varies on a per-OS basis. - While no risk to data integrity is observed, repeated misuse may require a - system reboot in order to achieve expected performance characteristics. 
+ without calling [close()](#hangar.checkout.ReaderCheckout.close), - a + common mistake during ipython / jupyter sessions - an + [atexit](https://docs.python.org/3/library/atexit.html) + hook is registered to [close()](#hangar.checkout.ReaderCheckout.close). + If properly closed by the user, the hook is unregistered after completion + with no ill effects. So long as a the process is NOT terminated via non-python + ``SIGKILL``, fatal internal python error, or or special ``os exit`` methods, + cleanup will occur on interpreter shutdown and resources will be freed. If + a non-handled termination method does occur, the implications of holding resources + varies on a per-OS basis. While no risk to data integrity is observed, + repeated misuse may require a system reboot in order to achieve expected + performance characteristics.. """ def __init__(self, @@ -178,33 +184,32 @@ def columns(self) -> Columns: Can be used to either return the columns accessor for all elements or a single column instance by using dictionary style indexing. - >>> co = repo.checkout(write=False) - >>> len(co.columns) - 1 - >>> print(co.columns.keys()) - ['foo'] - >>> fooCol = co.columns['foo'] - >>> fooCol.dtype - np.fooDtype - >>> cols = co.columns - >>> fooCol = cols['foo'] - >>> fooCol.dtype - np.fooDtype - >>> fooCol = cols.get('foo') - >>> fooCol.dtype - np.fooDtype - - .. seealso:: - - The class :class:`~.columns.column.Columns` contains all methods - accessible by this property accessor + >>> co = repo.checkout(write=False) + >>> len(co.columns) + 1 + >>> print(co.columns.keys()) + ['foo'] + >>> fooCol = co.columns['foo'] + >>> fooCol.dtype + np.fooDtype + >>> cols = co.columns + >>> fooCol = cols['foo'] + >>> fooCol.dtype + np.fooDtype + >>> fooCol = cols.get('foo') + >>> fooCol.dtype + np.fooDtype + + !!! seealso + + The class [Columns](#hangar.columns.column.Columns) contains + all methods accessible by this property accessor Returns ------- - :class:`~.columns.column.Columns` - the columns object which behaves exactly like a - columns accessor class but which can be invalidated when the writer - lock is released. + Columns + the columns object which behaves exactly like a columns accessor class + but which can be invalidated when the writer lock is released. """ self._verify_alive() return self._columns @@ -213,7 +218,7 @@ def columns(self) -> Columns: def diff(self) -> ReaderUserDiff: """Access the differ methods for a read-only checkout. - .. seealso:: + !!! seealso The class :class:`ReaderUserDiff` contains all methods accessible by this property accessor @@ -233,9 +238,9 @@ def diff(self) -> ReaderUserDiff: def commit_hash(self) -> str: """Commit hash this read-only checkout's data is read from. - >>> co = repo.checkout() - >>> co.commit_hash - foohashdigesthere + >>> co = repo.checkout() + >>> co.commit_hash + foohashdigesthere Returns ------- @@ -254,15 +259,15 @@ def log(self, show_user: bool = False) -> Optional[dict]: """Displays a pretty printed commit log graph to the terminal. - .. note:: + If Neither `branch` nor `commit` arguments are supplied, the commit + digest of the currently reader checkout will be used as default. + + !!! note For programatic access, the return_contents value can be set to true which will retrieve relevant commit specifications as dictionary elements. - if Neither `branch` nor `commit` arguments are supplied, the commit - digest of the currently reader checkout will be used as default. 
- Parameters ---------- branch @@ -279,6 +284,7 @@ def log(self, show_user If true and return_contents is False, show the committer of each commit on the printed log graph + Returns ------- Optional[dict] @@ -335,25 +341,28 @@ class WriterCheckout(GetMixin, CheckoutDictIteration): At the moment, only one instance of this class can write data to the staging area at a time. After the desired operations have been completed, - it is crucial to call :meth:`close` to release the writer lock. In - addition, after any changes have been made to the staging area, the branch - ``HEAD`` cannot be changed. In order to checkout another branch ``HEAD`` - for writing, you must either :meth:`commit` the changes, or perform a + it is crucial to call [close()](#hangar.checkout.WriterCheckout.close) + to release the writer lock. In addition, after any changes have been + made to the staging area, the branch ``HEAD`` cannot be changed. In + order to checkout another branch ``HEAD`` for writing, you must either + [commit()](#hangar.checkout.WriterCheckout.commit) the changes, or perform a hard-reset of the staging area to the last commit via - :meth:`reset_staging_area`. + [reset_staging_area()](#hangar.checkout.WriterCheckout.reset_staging_area). In order to reduce the chance that the python interpreter is shut down - without calling :meth:`close`, which releases the writer lock - a common - mistake during ipython / jupyter sessions - an `atexit - `_ hook is registered to - :meth:`close`. If properly closed by the user, the hook is unregistered + without calling [close()](#hangar.checkout.WriterCheckout.close), which + releases the writer lock - a common mistake during ipython / jupyter + sessions - an [atexit](https://docs.python.org/3/library/atexit.html) + hook is registered to [close()](#hangar.checkout.WriterCheckout.close). + If properly closed by the user, the hook is unregistered after completion with no ill effects. So long as a the process is NOT terminated via non-python SIGKILL, fatal internal python error, or or special os exit methods, cleanup will occur on interpreter shutdown and the writer lock will be released. If a non-handled termination method does - occur, the :meth:`~.Repository.force_release_writer_lock` method must be - called manually when a new python process wishes to open the writer - checkout. + occur, the + [force_release_writer_lock()](#hangar.repository.Repository.force_release_writer_lock) + method must be called manually when a new python process wishes to open + the writer checkout. """ def __init__(self, @@ -560,14 +569,14 @@ def columns(self) -> Columns: >>> 'bar' in co.columns False - .. seealso:: + !!! seealso - The class :class:`~.columns.column.Columns` contains all methods + The class [Columns](#hangar.columns.column.Columns) contains all methods accessible by this property accessor Returns ------- - :class:`~.columns.column.Columns` + Columns the columns object which behaves exactly like a columns accessor class but which can be invalidated when the writer lock is released. @@ -579,9 +588,9 @@ class but which can be invalidated when the writer lock is def diff(self) -> WriterUserDiff: """Access the differ methods which are aware of any staged changes. - .. seealso:: + !!! 
seealso - The class :class:`hangar.diff.WriterUserDiff` contains all methods + The [Diff Class](#hangar.diff.WriterUserDiff) contains all methods accessible by this property accessor Returns @@ -630,7 +639,7 @@ def log(self, show_user: bool = False) -> Optional[dict]: """Displays a pretty printed commit log graph to the terminal. - .. note:: + !!! note For programatic access, the return_contents value can be set to true which will retrieve relevant commit specifications as dictionary @@ -677,11 +686,11 @@ def add_str_column(self, contains_subsamples: bool = False, *, backend: Optional[str] = None, - backend_options: Optional[dict] = None): - """Initializes a :class:`str` container column + backend_options: Optional[dict] = None) -> 'ModifierTypes': + """Initializes a ``str`` class container column Columns are created in order to store some arbitrary collection of data - pieces. In this case, we store :class:`str` data. Items need not be + pieces. In this case, we store ``str`` data. Items need not be related to each-other in any direct capacity; the only criteria hangar requires is that all pieces of data stored in the column have a compatible schema with each-other (more on this below). Each piece of @@ -692,7 +701,7 @@ def add_str_column(self, pointing to some piece of store data on disk) are supported. All data pieces within a column have the same data type. For - :class:`str` columns, there is no distinction between + ``str`` columns, there is no distinction between ``'variable_shape'`` and ``'fixed_shape'`` schema types. Values are allowed to take on a value of any size so long as the datatype and contents are valid for the schema definition. @@ -718,7 +727,7 @@ def add_str_column(self, Returns ------- - :class:`~.columns.column.Columns` + 'ModifierTypes' instance object of the initialized column. """ self._verify_alive() @@ -758,11 +767,11 @@ def add_bytes_column(self, contains_subsamples: bool = False, *, backend: Optional[str] = None, - backend_options: Optional[dict] = None): - """Initializes a :class:`bytes` container column + backend_options: Optional[dict] = None) -> 'ModifierTypes': + """Initializes a``bytes`` container column Columns are created in order to store some arbitrary collection of data - pieces. In this case, we store :class:`bbytes` data. Items need not be + pieces. In this case, we store ``bytes`` data. Items need not be related to each-other in any direct capacity; the only criteria hangar requires is that all pieces of data stored in the column have a compatible schema with each-other (more on this below). Each piece of @@ -773,7 +782,7 @@ def add_bytes_column(self, pointing to some piece of store data on disk) are supported. All data pieces within a column have the same data type. For - :class:`bytes` columns, there is no distinction between + ``bytes`` columns, there is no distinction between ``'variable_shape'`` and ``'fixed_shape'`` schema types. Values are allowed to take on a value of any size so long as the datatype and contents are valid for the schema definition. @@ -799,7 +808,7 @@ def add_bytes_column(self, Returns ------- - :class:`~.columns.column.Columns` + 'ModifierTypes' instance object of the initialized column. """ self._verify_alive() @@ -843,11 +852,11 @@ def add_ndarray_column(self, contains_subsamples: bool = False, *, backend: Optional[str] = None, - backend_options: Optional[dict] = None): - """Initializes a :class:`numpy.ndarray` container column. 
+ backend_options: Optional[dict] = None) -> 'ModifierTypes': + """Initializes a ``numpy.ndarray`` container column. Columns are created in order to store some arbitrary collection of data - pieces. In this case, we store :class:`numpy.ndarray` data. Items need + pieces. In this case, we store ``numpy.ndarray`` data. Items need not be related to each-other in any direct capacity; the only criteria hangar requires is that all pieces of data stored in the column have a compatible schema with each-other (more on this below). Each piece of @@ -903,7 +912,7 @@ def add_ndarray_column(self, Returns ------- - :class:`~.columns.column.Columns` + 'ModifierTypes' instance object of the initialized column. """ self._verify_alive() @@ -959,7 +968,7 @@ def add_ndarray_column(self, def _initialize_new_column(self, column_name: str, column_layout: str, - schema) -> Columns: + schema) -> 'ModifierTypes': """Initialize a column and write spec to record db. Parameters @@ -976,7 +985,7 @@ def _initialize_new_column(self, Returns ------- - Columns + 'ModifierTypes' initialized column class instance. """ # -------- set vals in lmdb only after schema is sure to exist -------- @@ -1111,12 +1120,11 @@ def reset_staging_area(self, *, force=False) -> str: """Perform a hard reset of the staging area to the last commit head. After this operation completes, the writer checkout will automatically - close in the typical fashion (any held references to :attr:``column`` - or :attr:``metadata`` objects will finalize and destruct as normal), In - order to perform any further operation, a new checkout needs to be - opened. + close in the typical fashion (any held references to``column`` + objects will finalize and destruct as normal), In order to perform any + further operation, a new checkout needs to be opened. - .. warning:: + !!! danger This operation is IRREVERSIBLE. all records and data which are note stored in a previous commit will be permanently deleted. diff --git a/src/hangar/columns/column.py b/src/hangar/columns/column.py index 74331f91..0d8daa6f 100644 --- a/src/hangar/columns/column.py +++ b/src/hangar/columns/column.py @@ -50,7 +50,7 @@ def __init__(self, txnctx: Optional[ColumnTxn] = None): """Developer documentation for init method. - .. warning:: + !!! warning This class should not be instantiated directly. Instead use the factory functions :py:meth:`_from_commit` or :py:meth:`_from_staging` to return diff --git a/src/hangar/columns/layout_flat.py b/src/hangar/columns/layout_flat.py index 2d95d9b4..3499c7d3 100644 --- a/src/hangar/columns/layout_flat.py +++ b/src/hangar/columns/layout_flat.py @@ -203,7 +203,9 @@ def _close(self): def __getitem__(self, key: KeyType): """Retrieve data for some sample key via dict style access conventions. - .. seealso:: :meth:`get` + !!! seealso + + :meth:`get` Parameters ---------- @@ -522,7 +524,7 @@ def _perform_set(self, key, value): def __setitem__(self, key, value): """Store a piece of data in a column. - .. seealso:: + !!! seealso :meth:`update` for an implementation analogous to python's built in :meth:`dict.update` method which accepts a dict or iterable of @@ -621,7 +623,7 @@ def update(self, other=None, **kwargs): def __delitem__(self, key: KeyType) -> None: """Remove a sample from the column. Convenience method to :meth:`delete`. - .. seealso:: + !!! 
seealso :meth:`pop` to return a value and then delete it in the same operation @@ -673,7 +675,7 @@ def pop(self, key: KeyType): def change_backend(self, backend: str, backend_options: Optional[dict] = None): """Change the default backend and filters applied to future data writes. - .. warning:: + !!! warning This method is meant for advanced users only. Please refer to the hangar backend codebase for information on accepted parameters and diff --git a/src/hangar/columns/layout_nested.py b/src/hangar/columns/layout_nested.py index f45c899d..6edbb71c 100644 --- a/src/hangar/columns/layout_nested.py +++ b/src/hangar/columns/layout_nested.py @@ -145,7 +145,9 @@ def __iter__(self) -> Iterable[KeyType]: def __getitem__(self, key: GetKeysType) -> Union[Any, Dict[KeyType, Any]]: """Retrieve data for some subsample key via dict style access conventions. - .. seealso:: :meth:`get` + !!! seealso + + :meth:`get` Parameters ---------- @@ -468,7 +470,7 @@ def _perform_set(self, key, value): def __setitem__(self, key, value): """Store data as a subsample. Convenience method to :meth:`add`. - .. seealso:: + !!! seealso :meth:`update` for an implementation analogous to python's built in :meth:`dict.update` method which accepts a dict or iterable of @@ -497,7 +499,7 @@ def append(self, value) -> KeyType: Think carefully before going this route, as this posit does not apply to many common use cases. - .. seealso:: + !!! seealso In order to store the data with a user defined key, use :meth:`update` or :meth:`__setitem__` @@ -565,7 +567,7 @@ def update(self, other=None, **kwargs): def __delitem__(self, key: KeyType): """Remove a subsample from the column.`. - .. seealso:: + !!! seealso :meth:`pop` to simultaneously get a keys value and delete it. @@ -1053,7 +1055,7 @@ def _perform_set(self, key, value) -> None: def __setitem__(self, key, value) -> None: """Store some subsample key / subsample data map, overwriting existing keys. - .. seealso:: + !!! seealso :meth:`update` for alternative syntax for setting values. """ @@ -1107,7 +1109,7 @@ def update(self, other=None, **kwargs) -> None: def __delitem__(self, key: KeyType): """Remove a sample (including all contained subsamples) from the column. - .. seealso:: + !!! seealso :meth:`pop` for alternative implementing a simultaneous get value and delete operation. @@ -1146,7 +1148,7 @@ def pop(self, key: KeyType) -> Dict[KeyType, Any]: def change_backend(self, backend: str, backend_options: Optional[dict] = None): """Change the default backend and filters applied to future data writes. - .. warning:: + !!! warning This method is meant for advanced users only. Please refer to the hangar backend codebase for information on accepted parameters and diff --git a/src/hangar/dataloaders/tfloader.py b/src/hangar/dataloaders/tfloader.py index 583b73f1..82ae0356 100644 --- a/src/hangar/dataloaders/tfloader.py +++ b/src/hangar/dataloaders/tfloader.py @@ -35,7 +35,7 @@ def make_tf_dataset(columns, `make_tf_dataset` accepts a `shuffle` argument which will be used by the generator to shuffle each time it is being called. - .. warning:: + !!! warning `tf.data.Dataset.from_generator` currently uses `tf.compat.v1.py_func()` internally. 
Hence the serialization function (`yield_data`) will not be diff --git a/src/hangar/dataloaders/torchloader.py b/src/hangar/dataloaders/torchloader.py index dd61d301..eaf0cf60 100644 --- a/src/hangar/dataloaders/torchloader.py +++ b/src/hangar/dataloaders/torchloader.py @@ -22,7 +22,7 @@ def make_torch_dataset(columns, Returns a :class:`torch.utils.data.Dataset` object which can be loaded into a :class:`torch.utils.data.DataLoader`. - .. warning:: + !!! warning On Windows systems, setting the parameter ``num_workers`` in the resulting :class:`torch.utils.data.DataLoader` method will result in a @@ -96,7 +96,7 @@ class TorchDataset(torchdata.Dataset): convenient arguments to wrap hangar columns to be used in :class:`torch.utils.data.DataLoaders`. - .. note:: + !!! note From PyTorch 1.1 onwards, if Dataset returns dict, DataLoader also returns dict diff --git a/src/hangar/diff.py b/src/hangar/diff.py index 8b9cbc4e..9c76cb50 100644 --- a/src/hangar/diff.py +++ b/src/hangar/diff.py @@ -403,20 +403,17 @@ class ReaderUserDiff(BaseUserDiff): it can only contain non-empty values in the cases where a three way merge would need to be performed. - :: - - Fast Forward is Possible - ======================== + Fast Forward is Possible + ======================== (master) (foo) - a ----- b ----- c ----- d - + a ----- b ----- c ----- d - 3-Way Merge Required - ==================== + 3-Way Merge Required + ==================== (master) - a ----- b ----- c ----- d + a ----- b ----- c ----- d \\ \\ (foo) \\----- ee ----- ff @@ -531,20 +528,18 @@ class WriterUserDiff(BaseUserDiff): it can only contain non-empty values in the cases where a three way merge would need to be performed. - :: - - Fast Forward is Possible - ======================== + Fast Forward is Possible + ======================== (master) (foo) - a ----- b ----- c ----- d + a ----- b ----- c ----- d - 3-Way Merge Required - ==================== + 3-Way Merge Required + ==================== (master) - a ----- b ----- c ----- d + a ----- b ----- c ----- d \\ \\ (foo) \\----- ee ----- ff diff --git a/src/hangar/merger.py b/src/hangar/merger.py index ea798886..99d2fd64 100644 --- a/src/hangar/merger.py +++ b/src/hangar/merger.py @@ -4,7 +4,7 @@ three-way merge algorithm are implemented. All user facing API calls should be funneled through the :function:`select_merge_algorithm` function -.. note:: +!!! note In the current implementation, it is not possible to stop a merge in progress or to revert a bad merge commit. All revert like operations should be made by diff --git a/src/hangar/records/commiting.py b/src/hangar/records/commiting.py index 866051ab..27c17f75 100644 --- a/src/hangar/records/commiting.py +++ b/src/hangar/records/commiting.py @@ -563,7 +563,7 @@ def commit_records(message, branchenv, stageenv, refenv, repo_path: Path, def replace_staging_area_with_commit(refenv, stageenv, commit_hash): """DANGER ZONE: Delete the stage db and replace it with a copy of a commit environment. - .. warning:: + !!! warning In the current implementation, this method will not validate that it is safe to do this operation. All validation logic must be handled upstream. @@ -592,7 +592,7 @@ def replace_staging_area_with_commit(refenv, stageenv, commit_hash): def replace_staging_area_with_refs(stageenv, sorted_content): """DANGER ZONE: Delete all stage db records and replace it with specified data. - .. warning:: + !!! warning In the current implementation, this method will not validate that it is safe to do this operation. 
All validation logic must be handled upstream. diff --git a/src/hangar/records/summarize.py b/src/hangar/records/summarize.py index a60a93ae..e6c69ee4 100644 --- a/src/hangar/records/summarize.py +++ b/src/hangar/records/summarize.py @@ -32,7 +32,7 @@ def log(branchenv: lmdb.Environment, show_user: bool = False): """Displays a pretty printed commit log graph to the terminal. - .. note:: + !!! note For programatic access, the return_contents value can be set to true which will retrieve relevant commit specifications as dictionary diff --git a/src/hangar/remotes.py b/src/hangar/remotes.py index 37517eaa..5d30ab71 100644 --- a/src/hangar/remotes.py +++ b/src/hangar/remotes.py @@ -39,14 +39,13 @@ class Remotes(object): """Class which governs access to remote interactor objects. - .. note:: - - The remote-server implementation is under heavy development, and is - likely to undergo changes in the Future. While we intend to ensure - compatability between software versions of Hangar repositories written - to disk, the API is likely to change. Please follow our process at: - https://www.github.com/tensorwerk/hangar-py + !!! note + The remote-server implementation is under heavy development, and is + likely to undergo changes in the Future. While we intend to ensure + compatability between software versions of Hangar repositories written + to disk, the API is likely to change. Please follow our process at: + https://www.github.com/tensorwerk/hangar-py """ def __init__(self, env: Environments): @@ -71,6 +70,11 @@ def __verify_repo_initialized(self): def add(self, name: str, address: str) -> RemoteInfo: """Add a remote to the repository accessible by `name` at `address`. + >>> from hangar import Repository + >>> repo = Repository('foo/path') + >>> repo.remote.add('origin', 'localhost:50051') + RemoteInfo(name='origin', address='localhost:50051') + Parameters ---------- name @@ -110,6 +114,13 @@ def add(self, name: str, address: str) -> RemoteInfo: def remove(self, name: str) -> RemoteInfo: """Remove a remote repository from the branch records + >>> from hangar import Repository + >>> repo = Repository('foo/path') + >>> repo.remote.add('origin', 'localhost:50051') + RemoteInfo(name='origin', address='localhost:50051') + >>> repo.remote.remove('origin') + RemoteInfo(name='origin', address='localhost:50051') + Parameters ---------- name @@ -122,7 +133,7 @@ def remove(self, name: str) -> RemoteInfo: Returns ------- - str + RemoteInfo The channel address which was removed at the given remote name """ self.__verify_repo_initialized() @@ -135,6 +146,16 @@ def remove(self, name: str) -> RemoteInfo: def list_all(self) -> List[RemoteInfo]: """List all remote names and addresses recorded in the client's repository. + >>> from hangar import Repository + >>> repo = Repository('foo/path') + >>> repo.remote.add('origin', 'localhost:50051') + RemoteInfo(name='origin', address='localhost:50051') + >>> repo.remote.add('upstream'. 'localhost:50052') + RemoteInfo(name='upstream', address='localhost:50052') + >>> repo.remote.list_all() + [RemoteInfo(name='origin', address='localhost:50051'), + RemoteInfo(name='upstream', address='localhost:50052')] + Returns ------- List[RemoteInfo] @@ -152,6 +173,13 @@ def list_all(self) -> List[RemoteInfo]: def ping(self, name: str) -> float: """Ping remote server and check the round trip time. 
+ >>> from hangar import Repository + >>> repo = Repository('foo/path') + >>> repo.remote.add('origin', 'localhost:50051') + RemoteInfo(name='origin', address='localhost:50051') + >>> repo.remote.ping('origin') + 0.0523 + Parameters ---------- name @@ -287,7 +315,7 @@ def fetch_data_sample(self, commit: Optional[str] = None) -> str: """Granular fetch data operation allowing selection of individual samples. - .. warning:: + !!! warning This is a specialized version of the :meth:`fetch_data` method for use in specilized situations where some prior knowledge is known about the data. @@ -305,18 +333,18 @@ def fetch_data_sample(self, name of the remote server to pull data from column name of the column which data is being fetched from. - sample + samples Key, or sequence of sample keys to select. - * Flat column layouts should provide just a single key, or flat sequence of - keys which will be fetched from the server. ie. `sample1` OR - [`sample1`, `sample2`, `sample3`, etc.] + - Flat column layouts should provide just a single key, or flat sequence of + keys which will be fetched from the server. ie. `sample1` OR + [`sample1`, `sample2`, `sample3`, etc.] - * Nested column layouts can provide tuples specifying `(sample, subsample)` - records to retrieve, tuples with an `Ellipsis` character in the `subsample` - index `(sample, ...)` (which will fetch all subsamples for the given sample), - or can provide lone sample keys in the sequences `sample` (which will also fetch - all subsamples listed under the sample) OR ANY COMBINATION of the above. + - Nested column layouts can provide tuples specifying `(sample, subsample)` + records to retrieve, tuples with an `Ellipsis` character in the `subsample` + index `(sample, ...)` (which will fetch all subsamples for the given sample), + or can provide lone sample keys in the sequences `sample` (which will also fetch + all subsamples listed under the sample) OR ANY COMBINATION of the above. branch branch head to operate on, either ``branch`` or ``commit`` argument must be passed, but NOT both. Default is ``None`` @@ -401,15 +429,14 @@ def _select_digests_fetch_data_sample( """Map sample keys to data record digest Depending on column layout, the mapping of samples -> digests - is handled differently. + is handled differently: + + * "flat" columns: There is a direct map of sample key -> digest. If a + sample does not exist in the column, it is a key error. - "flat" columns: - There is a direct map of sample key -> digest. If a sample - does not exist in the column, it is a key error. - "nested" column: - There is a layered mapping of sample key -> subsamples -> digests - We take the approach that only specifying a sample key results - in fetching all subsamples contained under it. + * "nested" column: There is a layered mapping of sample key -> + subsamples -> digests We take the approach that only specifying + a sample key results in fetching all subsamples contained under it. Parameters ---------- @@ -622,11 +649,6 @@ def _form_missing_schema_digest_map( ) -> Dict[str, List[str]]: """Calculate mapping of schemas to data digests. - Parameters - ---------- - selectedDataRecords - hashenv - Returns ------- Dict[str, List[str]] @@ -685,7 +707,7 @@ def push(self, remote: str, branch: str, This method is semantically identical to a ``git push`` operation. Any local updates will be sent to the remote repository. - .. note:: + !!! note The current implementation is not capable of performing a ``force push`` operation. 
As such, remote branches with diverged diff --git a/src/hangar/repository.py b/src/hangar/repository.py index 9ac9c98c..fa3fa418 100644 --- a/src/hangar/repository.py +++ b/src/hangar/repository.py @@ -31,8 +31,8 @@ class Repository(object): repository, or to a directory one should be initialized, and all required data for starting your work on the repo will automatically be populated. - >>> from hangar import Repository - >>> repo = Repository('foo/path/to/dir') + >>> from hangar import Repository + >>> repo = Repository('foo/path/to/dir') Parameters ---------- @@ -41,7 +41,8 @@ class Repository(object): exists True if a Hangar repository should exist at the given directory path. Should no Hangar repository exists at that location, a UserWarning will - be raised indicating that the :meth:`init` method needs to be called. + be raised indicating that the [init()](#hangar.repository.Repository.init) + method needs to be called. False if the provided path does not need to (but optionally can) contain a Hangar repository. if a Hangar repository does not exist at that path, the @@ -128,9 +129,10 @@ def __verify_repo_initialized(self): def remote(self) -> Remotes: """Accessor to the methods controlling remote interactions. - .. seealso:: + !!! seealso - :class:`Remotes` for available methods of this property + Class [Remotes](#hangar.remotes.Remotes) for available + methods of this property Returns ------- @@ -166,7 +168,7 @@ def writer_lock_held(self) -> bool: @property def version(self) -> str: - """Find the version of Hangar software the repository is written with + """Find the version of Hangar software the repository is written with. Returns ------- @@ -193,10 +195,10 @@ def initialized(self) -> bool: def size_nbytes(self) -> int: """Disk space used by the repository returned in number of bytes. - >>> repo.size_nbytes - 1234567890 - >>> print(type(repo.size_nbytes)) - + >>> repo.size_nbytes + 1234567890 + >>> print(type(repo.size_nbytes)) + Returns ------- @@ -210,10 +212,10 @@ def size_nbytes(self) -> int: def size_human(self) -> str: """Disk space used by the repository returned in human readable string. - >>> repo.size_human - '1.23 GB' - >>> print(type(repo.size_human)) - + >>> repo.size_human + '1.23 GB' + >>> print(type(repo.size_human)) + Returns ------- @@ -298,14 +300,18 @@ def checkout(self, except (RuntimeError, ValueError) as e: raise e from None - def clone(self, user_name: str, user_email: str, remote_address: str, - *, remove_old: bool = False) -> str: + def clone(self, + user_name: str, + user_email: str, + remote_address: str, + *, + remove_old: bool = False) -> str: """Download a remote repository to the local disk. The clone method implemented here is very similar to a `git clone` operation. This method will pull all commit records, history, and data which are parents of the remote's `master` branch head commit. If a - :class:`Repository` exists at the specified directory, + [Repository](#hangar.repository.Repository) exists at the specified directory, the operation will fail. Parameters @@ -317,16 +323,19 @@ def clone(self, user_name: str, user_email: str, remote_address: str, Email address of the repository user. This information is recorded permanently in any commits created. remote_address - location where the - :class:`hangar.remote.server.HangarServer` process is - running and accessible by the clone user. + location where the [Hangar Server](#hangar.remote.server.HangarServer) + process is running and accessible by the clone user. remove_old - DANGER! 
DEVELOPMENT USE ONLY! If enabled, a - :class:`hangar.repository.Repository` existing on disk at the same - path as the requested clone location will be completely removed and - replaced with the newly cloned repo. (the default is False, which - will not modify any contents on disk and which will refuse to create - a repository at a given location if one already exists there.) + Development Use Only! + + !!! danger + + If enabled, a [Repository](#hangar.repository.Repository) existing + on disk at the same path as the requested clone location will be + completely removed and replaced with the newly cloned repo. (the + default is False, which will not modify any contents on disk and + which will refuse to create a repository at a given location if + one already exists there.) Returns ------- @@ -361,8 +370,16 @@ def init(self, user_email Email address of the repository user account. remove_old - DEVELOPER USE ONLY -- remove and reinitialize a Hangar - repository at the given path, Default = False + Development Use Only! + + !!! danger + + If enabled, a [Repository](#hangar.repository.Repository) existing + on disk at the same path as the requested init location will be + completely removed and replaced with an empty repo. (the + default is False, which will not modify any contents on disk and + which will refuse to create a repository at a given location if + one already exists there.) Returns ------- @@ -384,10 +401,10 @@ def log(self, show_user: bool = False) -> Optional[dict]: """Displays a pretty printed commit log graph to the terminal. - .. note:: + !!! note - For programatic access, the return_contents value can be set to true - which will retrieve relevant commit specifications as dictionary + For programatic access, the ``return_contents`` value can be set to + ``True`` which will retrieve relevant commit specifications as dictionary elements. Parameters @@ -406,6 +423,7 @@ def log(self, show_user If true and return_contents is False, show the committer of each commit on the printed log graph + Returns ------- Optional[dict] @@ -522,7 +540,6 @@ def diff(self, master: str, dev: str) -> DiffAndConflicts: res = diff.commit(dev_commit_hash=devHEAD) return res - def merge(self, message: str, master_branch: str, dev_branch: str) -> str: """Perform a merge of the changes made on two branches. @@ -563,21 +580,21 @@ def create_branch(self, name: str, base_commit: str = None) -> heads.BranchHead: checkout method to properly initialize a read (or write) enabled checkout object. - >>> from hangar import Repository - >>> repo = Repository('foo/path/to/dir') - - >>> repo.create_branch('testbranch') - BranchHead(name='testbranch', digest='b66b...a8cc') - >>> repo.list_branches() - ['master', 'testbranch'] - >>> co = repo.checkout(write=True, branch='testbranch') - >>> # add data ... - >>> newDigest = co.commit('added some stuff') - - >>> repo.create_branch('new-changes', base_commit=newDigest) - BranchHead(name='new-changes', digest='35kd...3254') - >>> repo.list_branches() - ['master', 'new-changes', 'testbranch'] + Examples + -------- + >>> from hangar import Repository + >>> repo = Repository('foo/path/to/dir') + >>> repo.create_branch('testbranch') + BranchHead(name='testbranch', digest='b66b...a8cc') + >>> repo.list_branches() + ['master', 'testbranch'] + >>> co = repo.checkout(write=True, branch='testbranch') + >>> # add data ... 
+ >>> newDigest = co.commit('added some stuff') + >>> repo.create_branch('new-changes', base_commit=newDigest) + BranchHead(name='new-changes', digest='35kd...3254') + >>> repo.list_branches() + ['master', 'new-changes', 'testbranch'] Parameters ---------- @@ -590,7 +607,7 @@ def create_branch(self, name: str, base_commit: str = None) -> heads.BranchHead: Returns ------- - :class:`~.heads.BranchHead` + heads.BranchHead NamedTuple[str, str] with fields for ``name`` and ``digest`` of the branch created (if the operation was successful) @@ -631,7 +648,6 @@ def remove_branch(self, name: str, *, force_delete: bool = False) -> heads.Branc >>> from hangar import Repository >>> repo = Repository('foo/path/to/dir') - >>> repo.create_branch('first-testbranch') BranchHead(name='first-testbranch', digest='9785...56da') >>> repo.create_branch('second-testbranch') @@ -644,7 +660,6 @@ def remove_branch(self, name: str, *, force_delete: bool = False) -> heads.Branc >>> co.commit('added some stuff') '3l253la5hna3k3a553256nak35hq5q534kq35532' >>> co.close() - >>> repo.remove_branch('second-testbranch') BranchHead(name='second-testbranch', digest='9785...56da') @@ -655,7 +670,6 @@ def remove_branch(self, name: str, *, force_delete: bool = False) -> heads.Branc >>> # check out master and try to remove 'first-testbranch' >>> co = repo.checkout(write=True, branch='master') >>> co.close() - >>> repo.remove_branch('first-testbranch') Traceback (most recent call last): ... @@ -666,34 +680,36 @@ def remove_branch(self, name: str, *, force_delete: bool = False) -> heads.Branc >>> repo.remove_branch('first-testbranch', force_delete=True) BranchHead(name='first-testbranch', digest='9785...56da') - It is important to note that *while this method will handle all safety - checks, argument validation, and performs the operation to permanently - delete a branch name/digest pointer, **no commit refs along the history - will be deleted from the Hangar database**.* Most of the history contains - commit refs which must be safe in other branch histories, and recent - commits may have been used as the base for some new history. As such, even - if some of the latest commits leading up to a deleted branch ``HEAD`` are - orphaned (unreachable), the records (and all data added in those commits) - will remain on the disk. - - In the future, we intend to implement a garbage collector which will remove - orphan commits which have not been modified for some set amount of time - (probably on the order of a few months), but this is not implemented at the - moment. - - Should an accidental forced branch deletion occur, *it is possible to - recover* and create a new branch head pointing to the same commit. If - the commit digest of the removed branch ``HEAD`` is known, its as simple as - specifying a name and the ``base_digest`` in the normal - :meth:`create_branch` method. If the digest is unknown, it will be a - bit more work, but some of the developer facing introspection tools / - routines could be used to either manually or (with minimal effort) - programmatically find the orphan commit candidates. If you find - yourself having accidentally deleted a branch, and must get it back, - please reach out on the `Github Issues - `__ page. We'll gladly - explain more in depth and walk you through the process in any way we - can help! + !!! 
note + + It is important to note that *while this method will handle all safety + checks, argument validation, and performs the operation to permanently + delete a branch name/digest pointer, no commit refs along the history + will be deleted from the Hangar database.* Most of the history contains + commit refs which must be safe in other branch histories, and recent + commits may have been used as the base for some new history. As such, even + if some of the latest commits leading up to a deleted branch ``HEAD`` are + orphaned (unreachable), the records (and all data added in those commits) + will remain on the disk. + + In the future, we intend to implement a garbage collector which will remove + orphan commits which have not been modified for some set amount of time + (probably on the order of a few months), but this is not implemented at the + moment. + + Should an accidental forced branch deletion occur, *it is possible to + recover* and create a new branch head pointing to the same commit. If + the commit digest of the removed branch ``HEAD`` is known, its as simple as + specifying a name and the ``base_digest`` in the normal + [create_branch](#hangar.repository.Repository.create_branch) method. + If the digest is unknown, it will be a bit more work, but + some of the developer facing introspection tools / routines could be + used to either manually or (with minimal effort) programmatically find + the orphan commit candidates. If you find yourself having accidentally + deleted a branch, and must get it back, please reach out on the + [Github Issues](https://github.com/tensorwerk/hangar-py/issues) page. + We'll gladly explain more in depth and walk you through the process + in any way we can help! Parameters ---------- @@ -709,7 +725,7 @@ def remove_branch(self, name: str, *, force_delete: bool = False) -> heads.Branc Returns ------- - :class:`~.heads.BranchHead` + heads.BranchHead NamedTuple[str, str] with fields for `name` and `digest` of the branch pointer deleted. @@ -740,6 +756,15 @@ def remove_branch(self, name: str, *, force_delete: bool = False) -> heads.Branc def list_branches(self) -> List[str]: """list all branch names created in the repository. + >>> from hangar import Repository + >>> repo = Repository('foo/path/to/dir') + >>> repo.create_branch('first-testbranch') + BranchHead(name='first-testbranch', digest='9785...56da') + >>> repo.create_branch('second-testbranch') + BranchHead(name='second-testbranch', digest='9785...56da') + >>> repo.list_branches() + ['master', 'first-testbranch', 'second-testbranch'] + Returns ------- List[str] @@ -755,7 +780,7 @@ def verify_repo_integrity(self) -> bool: Runs a full cryptographic verification of repository contents in order to ensure the integrity of all data and history recorded on disk. - .. note:: + !!! info This proof may take a significant amount of time to run for repositories which: @@ -775,7 +800,7 @@ def verify_repo_integrity(self) -> bool: validated if it - and all earlier data pieces - are proven to be intact and unchanged. - Note: This does not mean that the verification is repeatedly + Note: This does not mean that the verification is repeatedly performed for every commit some piece of data is stored in. Each data piece is read from disk and verified only once, regardless of how many commits some piece of data is referenced in. @@ -791,9 +816,9 @@ def verify_repo_integrity(self) -> bool: commit in the repository history, and may take a non-trivial amount of time for repositories with thousands of commits. 
- While the two points above are the most time consuming operations, - there are many more checks which are performed alongside them as part - of the full verification run. + While the two points above are the most time consuming operations, + there are many more checks which are performed alongside them as part + of the full verification run. Returns ------- @@ -817,7 +842,7 @@ def verify_repo_integrity(self) -> bool: def force_release_writer_lock(self) -> bool: """Force release the lock left behind by an unclosed writer-checkout - .. warning:: + !!! danger *NEVER USE THIS METHOD IF WRITER PROCESS IS CURRENTLY ACTIVE.* At the time of writing, the implications of improper/malicious use of this are not @@ -829,9 +854,9 @@ def force_release_writer_lock(self) -> bool: before the program terminates, a new checkout with write=True will fail. The lock can only be released via a call to this method. - .. note:: + !!! note - This entire mechanism is subject to review/replacement in the future. + This entire mechanism is subject to review/replacement in the future. Returns ------- From 08b9eccedde74482eddad01179d99bc626185f91 Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Thu, 13 Aug 2020 08:10:22 -0400 Subject: [PATCH 4/7] temporary --- docs/cli.md | 6 +++- docs/installation.md | 65 ++++++++++++++++++++++++++++++++++++++------ docs/quickstart.md | 4 +-- mkdocs.yml | 3 +- 4 files changed, 66 insertions(+), 12 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 57f128d9..e6dd604f 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -9,4 +9,8 @@ to be at the same level the repository was initially created in. Simply start by typing `$ hangar --help` in your terminal to get started! -![mkapi](hangar.cli) +::: mkdocs-click + :module: hangar.cli.cli + :command: hangar + +# diff --git a/docs/installation.md b/docs/installation.md index a39610b9..d84e5cb7 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -11,16 +11,55 @@ Pre-Built Installation ### Python Distributions If you do not already use a Python Distribution, we recommend the -[Anaconda \]() (or [Miniconda -\]()) distribution, +[Anaconda](https://www.anaconda.com/distribution/) (or +[Miniconda](https://docs.conda.io/en/latest/miniconda.html)) distribution, which supports all major operating systems (Windows, MacOSX, & the typical Linux variations). Detailed usage instructions are available [on -the anaconda website \](). +the anaconda website](https://docs.anaconda.com/anaconda/). To install Hangar via the Anaconda Distribution (from the [conda-forge -conda channel \]()): +conda channel](https://anaconda.org/conda-forge/hangar)): - conda install -c conda-forge hangar +
+ +```console +$ conda install -c conda-forge hangar + +Collecting package metadata (current_repodata.json): done +Solving environment: done + +## Package Plan ## + + environment location: /Users/rick/miniconda3/envs/new-install + + added / updated specs: + - hangar + +The following NEW packages will be INSTALLED: + + blosc pkgs/main/osx-64::blosc-1.19.0-hab81aa3_0 + c-ares conda-forge/osx-64::c-ares-1.16.1-haf1e3a3_0 + click conda-forge/noarch::click-7.1.2-pyh9f0ad1d_0 + ... + zstd conda-forge/osx-64::zstd-1.4.5-h0384e3a_1 + +Proceed ([y]/n)? y + +Preparing transaction: +---> 100% +Preparing transaction: done +Verifying transaction: +---> 100% +Verifying transaction: done +Executing transaction: +---> 100% +Executing transaction: done + +$ hangar --version +hangar, version 0.5.2 +``` + +
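+If you would rather keep Hangar out of your base environment, one simple
+option is to create a dedicated conda environment first and install Hangar
+into it. The environment name and python version below are only examples;
+adjust them to whatever suits your setup:
+
+```console
+$ conda create -n hangar-env python=3.8
+---> 100%
+$ conda activate hangar-env
+$ conda install -c conda-forge hangar
+---> 100%
+$ hangar --version
+hangar, version 0.5.2
+```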
### Wheels (PyPi) @@ -28,13 +67,23 @@ If you have an existing python installation on your computer, pre-built Hangar Wheels can be installed via pip from the Python Package Index (PyPi): - pip install hangar +
+ +```console +$ pip install hangar +---> 100% +$ hangar --version +hangar, version 0.5.2 +``` + +
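+If you want the pip install isolated from your system Python, a common
+pattern is to install into a virtual environment. This is only a sketch:
+the environment name is arbitrary, and on Windows the activation script
+lives under `Scripts\` rather than `bin/`:
+
+```console
+$ python -m venv hangar-venv
+$ source hangar-venv/bin/activate
+$ pip install hangar
+---> 100%
+$ hangar --version
+hangar, version 0.5.2
+```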
+ Source Installation ------------------- -To install Hangar from source, clone the repository from [Github -\](): +To install Hangar from source, clone the repository from +[Github](https://github.com/tensorwerk/hangar-py): git clone https://github.com/tensorwerk/hangar-py.git cd hangar-py diff --git a/docs/quickstart.md b/docs/quickstart.md index 0d3912dc..cdb25704 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -5,6 +5,6 @@ To use Hangar in a project: from hangar import Repository -Please refer to the `ref-tutorial`{.interpreted-text role="ref"} for -examples, or `ref-concepts`{.interpreted-text role="ref"} to review the +Please refer to the [Quickstart Tutorials](./Tutorial-QuickStart.ipynb) for +examples, or [Hangar Core Concepts](./concepts.md) to review the core concepts of the Hangar system. diff --git a/mkdocs.yml b/mkdocs.yml index b031c11f..7193a90d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -31,6 +31,7 @@ markdown_extensions: - admonition - pymdownx.details - pymdownx.superfences + - mkdocs-click - pymdownx.highlight: use_pygments: true @@ -63,7 +64,7 @@ nav: - Dataloaders for Machine Learning (Tensorflow & PyTorch): Tutorial-Dataloader.ipynb - Real World Quick Start Tutorial: Tutorial-RealQuickStart.ipynb - Design: design.md - #- CLI: cli.md + - CLI: cli.md - Externals: externals.md - FAQ: faq.md - Backends: From 51e996f3abcf912b48cb102d2826c20f012f6be6 Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Thu, 13 Aug 2020 11:30:11 -0400 Subject: [PATCH 5/7] tmp commit to share --- docs/cli.md | 4 ++- setup.py | 2 +- src/hangar/cli/cli.py | 57 ++++++++++++++++++--------------- src/hangar/mixins/datasetget.py | 7 ++-- src/hangar/mixins/recorditer.py | 16 +++++---- 5 files changed, 50 insertions(+), 36 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index e6dd604f..cbfb8766 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -11,6 +11,8 @@ started! 
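+For a quick feel of the command line workflow, a session on an existing
+repository might look something like the following. Every command shown is
+documented via `$ hangar --help` and the reference below; output is omitted
+here since it depends entirely on the contents of your repository:
+
+```console
+$ hangar summary
+$ hangar log master
+$ hangar branch list
+$ hangar branch create testbranch master
+$ hangar diff testbranch master
+```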
::: mkdocs-click :module: hangar.cli.cli - :command: hangar + :command: init + :depth: 1 + # diff --git a/setup.py b/setup.py index 84904188..2ce297ad 100644 --- a/setup.py +++ b/setup.py @@ -146,7 +146,7 @@ def run(self): language="c") __extensions.append(ext) -with open('README.rst') as f: +with open('README.md') as f: README_RST = f.read() SHORT_DESCRIPTION = ( diff --git a/src/hangar/cli/cli.py b/src/hangar/cli/cli.py index f5ff4c9b..8216fdca 100644 --- a/src/hangar/cli/cli.py +++ b/src/hangar/cli/cli.py @@ -292,9 +292,9 @@ def fetch_records(repo: Repository, remote, branch): @main.command(name='fetch-data') @click.argument('remote', nargs=1, required=True) @click.argument('startpoint', nargs=1, required=True) -@click.option('--column', '-d', multiple=True, required=False, default=None, +@click.option('--column', '-d', multiple=True, required=False, default='', help='specify any number of column keys to fetch data for.') -@click.option('--nbytes', '-n', default=None, required=False, +@click.option('--nbytes', '-n', default='', required=False, help='total amount of data to retrieve in MB/GB.') @click.option('--all-history', '-a', 'all_', is_flag=True, default=False, required=False, help='Retrieve data referenced in every parent commit accessible to the STARTPOINT') @@ -310,7 +310,7 @@ def fetch_data(repo: Repository, remote, startpoint, column, nbytes, all_): from hangar.records.heads import get_staging_branch_head from hangar.utils import parse_bytes - if startpoint is None: + if startpoint == '': branch = get_staging_branch_head(repo._env.branchenv) commit = get_branch_head_commit(repo._env.branchenv, branch) elif startpoint in repo.list_branches(): @@ -319,9 +319,12 @@ def fetch_data(repo: Repository, remote, startpoint, column, nbytes, all_): commit = expand_short_commit_digest(repo._env.refenv, startpoint) click.echo(f'Fetching data for commit: {commit}') - try: - max_nbytes = parse_bytes(nbytes) - except AttributeError: + if nbytes != '': + try: + max_nbytes = parse_bytes(nbytes) + except AttributeError: + max_nbytes = None + else: max_nbytes = None if len(column) == 0: column = None @@ -393,7 +396,7 @@ def remove_remote(repo: Repository, name): @main.command() @click.argument('dev', nargs=1, required=True) -@click.argument('master', nargs=1, required=False, default=None) +@click.argument('master', nargs=1, required=False, default='') @pass_repo def diff(repo: Repository, dev, master): """Display diff of DEV commit/branch to MASTER commit/branch. @@ -412,7 +415,7 @@ def diff(repo: Repository, dev, master): if dev not in repo.list_branches(): dev = expand_short_commit_digest(repo._env.refenv, dev) - if master is None: + if master == '': master = get_staging_branch_head(repo._env.branchenv) elif master not in repo.list_branches(): master = expand_short_commit_digest(repo._env.refenv, master) @@ -422,7 +425,7 @@ def diff(repo: Repository, dev, master): click.echo(buf.getvalue()) @main.command() -@click.argument('startpoint', nargs=1, required=False) +@click.argument('startpoint', nargs=1, required=False, default='') @pass_repo def summary(repo: Repository, startpoint): """Display content summary at STARTPOINT (short-digest or branch). 
@@ -434,7 +437,7 @@ def summary(repo: Repository, startpoint): """ from hangar.records.commiting import expand_short_commit_digest - if startpoint is None: + if startpoint == '': click.echo(repo.summary()) elif startpoint in repo.list_branches(): click.echo(repo.summary(branch=startpoint)) @@ -444,7 +447,7 @@ def summary(repo: Repository, startpoint): @main.command() -@click.argument('startpoint', required=False, default=None) +@click.argument('startpoint', required=False, default='') @pass_repo def log(repo: Repository, startpoint): """Display commit graph starting at STARTPOINT (short-digest or name) @@ -454,7 +457,7 @@ def log(repo: Repository, startpoint): """ from hangar.records.commiting import expand_short_commit_digest - if startpoint is None: + if startpoint == '': click.echo(repo.log()) elif startpoint in repo.list_branches(): click.echo(repo.log(branch=startpoint)) @@ -500,7 +503,7 @@ def branch_list(repo: Repository): @branch.command(name='create') @click.argument('name', nargs=1, required=True) -@click.argument('startpoint', nargs=1, default=None, required=False) +@click.argument('startpoint', nargs=1, default='', required=False) @pass_repo def branch_create(repo: Repository, name, startpoint): """Create a branch with NAME at STARTPOINT (short-digest or branch) @@ -518,7 +521,7 @@ def branch_create(repo: Repository, name, startpoint): raise click.ClickException(e) try: - if startpoint is None: + if startpoint == '': branch = get_staging_branch_head(repo._env.branchenv) base_commit = get_branch_head_commit(repo._env.branchenv, branch) elif startpoint in branch_names: @@ -612,8 +615,8 @@ def server(overwrite, ip, port, timeout): required=True, type=click.Path(exists=True, dir_okay=True, file_okay=True, readable=True, resolve_path=True)) -@click.option('--branch', default=None, help='branch to import data') -@click.option('--plugin', default=None, help='override auto-infered plugin') +@click.option('--branch', default='', help='branch to import data') +@click.option('--plugin', default='', help='override auto-infered plugin') @click.option('--overwrite', is_flag=True, help='overwrite data samples with the same name as the imported data file ') @pass_repo @@ -630,7 +633,7 @@ def import_data(ctx, repo: Repository, column, path, branch, plugin, overwrite): from hangar.records.heads import get_staging_branch_head kwargs = parse_custom_arguments(ctx.args) - if branch is None: + if branch == '': branch = get_staging_branch_head(repo._env.branchenv) elif branch not in repo.list_branches(): raise click.ClickException(f'Branch name: {branch} does not exist, Exiting.') @@ -644,7 +647,8 @@ def import_data(ctx, repo: Repository, column, path, branch, plugin, overwrite): with active_aset as aset, click.progressbar(files) as filesBar: for f in filesBar: ext = ''.join(f.suffixes).strip('.') # multi-suffix files (tar.bz2) - loaded = external.load(f, plugin=plugin, extension=ext, **kwargs) + _plugin = None if plugin == '' else plugin + loaded = external.load(f, plugin=_plugin, extension=ext, **kwargs) if not isinstance(loaded, GeneratorType): loaded = [loaded] for arr, fname in loaded: @@ -663,7 +667,7 @@ def import_data(ctx, repo: Repository, column, path, branch, plugin, overwrite): @main.command(name='export', context_settings=dict(allow_extra_args=True, ignore_unknown_options=True, )) @click.argument('column', nargs=1, required=True) -@click.argument('startpoint', nargs=1, default=None, required=False) +@click.argument('startpoint', nargs=1, default='', required=False) @click.option('-o', 
               nargs=1,
               required=False,
@@ -673,7 +677,7 @@ def import_data(ctx, repo: Repository, column, path, branch, plugin, overwrite):
               help="Directory to export data")
 @click.option('-s', '--sample',
               nargs=1,
-              default=None,
+              default='',
               type=StrOrIntType(),
               help=('Sample name to export. Default implementation is to interpret all input '
                     'names as string type. As a column can contain samples with both ``str`` '
@@ -713,7 +717,7 @@ def export_data(ctx, repo: Repository, column, outdir, startpoint, sample, forma
 
     if startpoint in repo.list_branches():
         base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
-    elif startpoint:
+    elif startpoint != '':
         base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)
     else:
         branch_name = get_staging_branch_head(repo._env.branchenv)
@@ -722,7 +726,7 @@ def export_data(ctx, repo: Repository, column, outdir, startpoint, sample, forma
     co = repo.checkout(commit=base_commit)
     try:
         aset = co.columns.get(column)
-        sampleNames = [sample] if sample is not None else list(aset.keys())
+        sampleNames = [sample] if sample != '' else list(aset.keys())
         extension = format_.lstrip('.') if format_ else None
         with aset, click.progressbar(sampleNames) as sNamesBar:
             for sampleN in sNamesBar:
@@ -742,9 +746,9 @@ def export_data(ctx, repo: Repository, column, outdir, startpoint, sample, forma
               context_settings=dict(allow_extra_args=True, ignore_unknown_options=True, ))
 @click.argument('column', nargs=1, type=str, required=True)
 @click.argument('sample', nargs=1, type=StrOrIntType(), required=True)
-@click.argument('startpoint', nargs=1, default=None, required=False)
+@click.argument('startpoint', nargs=1, default='', required=False)
 @click.option('-f', '--format', 'format_', required=False, help='File format of output file')
-@click.option('--plugin', default=None, help='Plugin name to use instead of auto-inferred plugin')
+@click.option('--plugin', default='', help='Plugin name to use instead of auto-inferred plugin')
 @pass_repo
 @click.pass_context
 def view_data(ctx, repo: Repository, column, sample, startpoint, format_, plugin):
@@ -757,7 +761,7 @@ def view_data(ctx, repo: Repository, column, sample, startpoint, format_, plugin
     kwargs = parse_custom_arguments(ctx.args)
     if startpoint in repo.list_branches():
         base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
-    elif startpoint:
+    elif startpoint != '':
         base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)
     else:
         branch_name = get_staging_branch_head(repo._env.branchenv)
@@ -769,7 +773,8 @@ def view_data(ctx, repo: Repository, column, sample, startpoint, format_, plugin
         extension = format_.lstrip('.') if format_ else None
         data = aset[sample]
         try:
-            external.show(data, plugin=plugin, extension=extension, **kwargs)
+            _plugin = None if plugin == '' else plugin
+            external.show(data, plugin=_plugin, extension=extension, **kwargs)
         except Exception as e:
             raise click.ClickException(e)
     except KeyError as e:
diff --git a/src/hangar/mixins/datasetget.py b/src/hangar/mixins/datasetget.py
index f0397b4b..f2e1a88a 100644
--- a/src/hangar/mixins/datasetget.py
+++ b/src/hangar/mixins/datasetget.py
@@ -149,8 +149,11 @@ def get(self, keys, default=None, except_missing=False):
         """
         return self._get_in(keys, default, except_missing)
 
-    def _get_in(self, keys, default=None, except_missing=False,
-                *, _EXCEPTION_CLASSES = (KeyError, IndexError, TypeError)):
+    def _get_in(
+        self, keys, default=None, except_missing=False,
+        *,
+        _EXCEPTION_CLASSES = (KeyError, IndexError, TypeError)
+    ):
         """Internal method to get data from columns within a nested set of dicts.
         Parameters
         ----------
diff --git a/src/hangar/mixins/recorditer.py b/src/hangar/mixins/recorditer.py
index ef86ddfb..a2e9d5e5 100644
--- a/src/hangar/mixins/recorditer.py
+++ b/src/hangar/mixins/recorditer.py
@@ -5,21 +5,25 @@ class CursorRangeIterator:
 
     @staticmethod
-    def cursor_range_iterator(datatxn: lmdb.Transaction, startRangeKey: bytes, keys: bool, values: bool
-                              ) -> Iterable[Union[Tuple[bytes], Tuple[bytes, bytes]]]:
+    def cursor_range_iterator(
+        datatxn: lmdb.Transaction,
+        startRangeKey: bytes,
+        keys: bool,
+        values: bool
+    ) -> Iterable[Union[Tuple[bytes], Tuple[bytes, bytes]]]:
         """Common method used to implement cursor range iterators
 
         Parameters
         ----------
-        datatxn : lmdb.Transaction
+        datatxn
             open database transaction to read values from
-        startRangeKey : bytes
+        startRangeKey
             range in which to iterate cursor over until end of db or out of
             lexicographic range.
-        keys : bool, optional
+        keys
            If True, yield metadata keys encountered, if False only values are
            returned. By default, True.
-        values : bool, optional
+        values
            If True, yield metadata hash values encountered, if False only keys
            are returned. By default, True.
 

From d48f2f26d01e803a83825c252cfc9d1bc8bc01e0 Mon Sep 17 00:00:00 2001
From: alessiamarcolini <98marcolini@gmail.com>
Date: Mon, 17 Aug 2020 16:03:06 +0200
Subject: [PATCH 6/7] Modify how Termy renders conda installation

---
 docs/installation.md |  9 ++++++---
 docs/js/custom.js    |  6 +++---
 docs/js/termynal.js  | 12 ++++++------
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/docs/installation.md b/docs/installation.md
index d84e5cb7..bdc4f4a7 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -26,15 +26,17 @@ conda channel](https://anaconda.org/conda-forge/hangar)):
 $ conda install -c conda-forge hangar
 
 Collecting package metadata (current_repodata.json): done
-Solving environment: done
+// Solving environment: done
+
 
 ## Package Plan ##
 
   environment location: /Users/rick/miniconda3/envs/new-install
 
   added / updated specs:
     - hangar
-
+
+//
 The following NEW packages will be INSTALLED:
 
   blosc              pkgs/main/osx-64::blosc-1.19.0-hab81aa3_0
@@ -43,7 +45,8 @@ The following NEW packages will be INSTALLED:
   ...
   zstd               conda-forge/osx-64::zstd-1.4.5-h0384e3a_1
 
-Proceed ([y]/n)? y
+//
+y
 
 Preparing transaction:
 ---> 100%
diff --git a/docs/js/custom.js b/docs/js/custom.js
index fe2e09dc..f273987f 100644
--- a/docs/js/custom.js
+++ b/docs/js/custom.js
@@ -61,7 +61,7 @@ function setupTermynal() {
             });
         } else if (line.startsWith("// ")) {
             saveBuffer();
-            const value = "💬 " + line.replace("// ", "").trimEnd();
+            const value = " " + line.replace("// ", "").trimEnd();
             useLines.push({
                 value: value,
                 class: "termynal-comment",
@@ -90,7 +90,7 @@ function setupTermynal() {
         const termynal = new Termynal(div, {
             lineData: useLines,
             noInit: true,
-            lineDelay: 500
+            lineDelay: 2000
         });
         termynals.push(termynal);
     });
@@ -111,4 +111,4 @@ function setupTermynal() {
 }
 
 setupTermynal()
-document.getElementsByClassName('gitter-open-chat-button')[0].style.backgroundColor="#7f85c0"
+document.getElementsByClassName('gitter-open-chat-button')[0].style.backgroundColor = "#7f85c0"
diff --git a/docs/js/termynal.js b/docs/js/termynal.js
index 4ac32708..54dde147 100644
--- a/docs/js/termynal.js
+++ b/docs/js/termynal.js
@@ -40,7 +40,7 @@ class Termynal {
             || parseFloat(this.container.getAttribute(`${this.pfx}-progressLength`)) || 40;
         this.progressChar = options.progressChar
             || this.container.getAttribute(`${this.pfx}-progressChar`) || '█';
-        this.progressPercent = options.progressPercent 
+        this.progressPercent = options.progressPercent
             || parseFloat(this.container.getAttribute(`${this.pfx}-progressPercent`)) || 100;
         this.cursor = options.cursor
             || this.container.getAttribute(`${this.pfx}-cursor`) || '▋';
@@ -190,8 +190,8 @@ class Termynal {
         const progressChar = line.getAttribute(`${this.pfx}-progressChar`)
             || this.progressChar;
         const chars = progressChar.repeat(progressLength);
-        const progressPercent = line.getAttribute(`${this.pfx}-progressPercent`) 
-            || this.progressPercent; 
+        const progressPercent = line.getAttribute(`${this.pfx}-progressPercent`)
+            || this.progressPercent;
         line.textContent = '';
         this.container.appendChild(line);
 
@@ -199,9 +199,9 @@ class Termynal {
             await this._wait(this.typeDelay);
             const percent = Math.round(i / chars.length * 100);
             line.textContent = `${chars.slice(0, i)} ${percent}%`;
-            if (percent>progressPercent) {
-                break;
-            }
+            if (percent > progressPercent) {
+                break;
+            }
         }
     }

From 118808bfc080f1fec8dd872cba932111c276ea47 Mon Sep 17 00:00:00 2001
From: alessiamarcolini <98marcolini@gmail.com>
Date: Mon, 14 Sep 2020 22:34:33 +0200
Subject: [PATCH 7/7] Remove sphinx reference and requirements file

---
 .readthedocs.yml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index 96a1acef..3c681fb7 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -5,10 +5,6 @@
 # Required
 version: 2
 
-# Build documentation in the docs/ directory with Sphinx
-sphinx:
-  configuration: docs/conf.py
-
 # Optionally build your docs in additional formats such as PDF and ePub
 formats: all
 
@@ -16,7 +12,6 @@ formats: all
 python:
   version: 3.7
   install:
-    - requirements: docs/requirements.txt
     - method: pip
       path: .
     - method: setuptools