Skip to content

Commit

Permalink
Format Version 1: Adds crc32c checks. (#8)
Browse files Browse the repository at this point in the history
* feat: FORMAT_VERSION=1, adds crc32c check value to format

The implementation remains backwards compatible with format version 0:
version 0 buffers can still be read even though the version 1 layout differs.

* docs: describe format version 1

* test: show ValidationError is raised if bits are corrupted

* feat: allow disabling crc checks
  • Loading branch information
william-silversmith authored Nov 16, 2021
1 parent e25a6af commit 8e57a29
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 7 deletions.
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@ The byte string format consists of a 16 byte header, an index, and a series of (
HEADER|INDEX|DATA_REGION
```

| Format Version | description |
|----------------|----------------------------------------|
| 0 | Initial Release |
| 1 | Adds crc32c check values to each item. |

### Header

```
Expand All @@ -93,7 +98,7 @@ The index can be consulted by conducting an Eytzinger binary search over the lab

### Data Region

The data objects are serialized to bytes and compressed individually if the header indicates they should be. They are then concatenated in the same order the index specifies.
The data objects are serialized to bytes and compressed individually if the header indicates they should be. As of format version 1, a four-byte little-endian crc32c check value, computed over each object's stored (serialized and, if applicable, compressed) bytes, is appended to that object. The objects are then concatenated in the same order the index specifies.

## Versus Flexbuffers

Expand Down
21 changes: 20 additions & 1 deletion automated_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import numpy as np

from mapbuffer import MapBuffer, HEADER_LENGTH
from mapbuffer import ValidationError, MapBuffer, HEADER_LENGTH

@pytest.mark.parametrize("compress", (None, "gzip", "br", "zstd", "lzma"))
def test_empty(compress):
Expand Down Expand Up @@ -56,6 +56,25 @@ def test_full(compress):

assert len(mbuf.buffer) > HEADER_LENGTH

@pytest.mark.parametrize("compress", (None, "gzip", "br", "zstd"))
def test_crc32c(compress):
    """Check that corrupting a stored byte makes item access raise ValidationError.

    A MapBuffer is built from two small byte strings, one byte of the
    stored payload for label 1 is overwritten, and reading label 1 is then
    expected to fail its crc32c check.
    """
    data = {
        1: b"hello",
        2: b"world",
    }
    mbuf = MapBuffer(data, compress=compress)

    # This relies on the payload appearing verbatim in the buffer;
    # bytes.index raises ValueError if it does not.
    idx = mbuf.buffer.index(b"hello")

    # Flip the first payload byte ('h' -> 'H') and swap the buffer in place.
    corrupted = bytearray(mbuf.buffer)
    corrupted[idx] = ord(b"H")
    mbuf.buffer = bytes(corrupted)

    # pytest.raises fails the test if no ValidationError is raised, which
    # replaces the manual try / assert False / except pattern.
    with pytest.raises(ValidationError):
        mbuf[1]

@pytest.mark.parametrize("compress", (None, "gzip", "br", "zstd"))
def test_mmap_access(compress):
data = {
Expand Down
25 changes: 20 additions & 5 deletions mapbuffer/mapbuffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,26 @@
from .lib import nvl
from . import compression

import crc32c
import numpy as np

import mapbufferaccel

FORMAT_VERSION = 0
FORMAT_VERSION = 1
MAGIC_NUMBERS = b"mapbufr"
HEADER_LENGTH = 16

class MapBuffer:
"""Represents a usable int->bytes dictionary as a byte string."""
__slots__ = (
"data", "tobytesfn", "frombytesfn",
"dtype", "buffer", "_header",
"_index", "_compress"
"dtype", "buffer", "check_crc",
"_header", "_index", "_compress"
)
def __init__(
self, data=None, compress=None,
tobytesfn=None, frombytesfn=None
tobytesfn=None, frombytesfn=None,
check_crc=True
):
"""
data: dict (int->byte serializable object) or bytes
Expand All @@ -41,6 +43,7 @@ def __init__(
self.frombytesfn = frombytesfn
self.dtype = np.uint64
self.buffer = None
self.check_crc = check_crc

self._header = None
self._index = None
Expand Down Expand Up @@ -133,6 +136,16 @@ def getindex(self, i):
else:
value = self.buffer[offset:]

if self.format_version == 1:
stored_check_value = int.from_bytes(value[-4:], byteorder='little')
value = value[:-4]
if self.check_crc:
retrieved_check_value = crc32c.crc32c(value)
if retrieved_check_value != stored_check_value:
raise ValidationError(
f"Label {i} failed its crc32c check. Stored: {stored_check_value} Computed: {retrieved_check_value}"
)

encoding = self.compress
if encoding:
value = compression.decompress(value, encoding, str(index[i,0]))
Expand Down Expand Up @@ -213,6 +226,8 @@ def dict2buf(self, data, compress=None, tobytesfn=None):
label: compression.compress(tobytesfn(val), method=compress)
for label, val in data.items()
}
for label in bytes_data:
bytes_data[label] += crc32c.crc32c(bytes_data[label]).to_bytes(4, byteorder='little')

data_region = b"".join(
( bytes_data[label] for label in labels )
Expand Down Expand Up @@ -244,7 +259,7 @@ def validate_buffer(buf):
if magic != MAGIC_NUMBERS:
raise ValidationError(f"Magic number mismatch. Expected: {MAGIC_NUMBERS} Got: {magic}")

if mapbuf.format_version not in (0,):
if mapbuf.format_version not in (0,1):
raise ValidationError(f"Unsupported format version. Got: {mapbuf.format_version}")

if mapbuf.compress not in compression.COMPRESSION_TYPES:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
brotli
crc32c
deflate>=0.2.0
numpy
tqdm
Expand Down

0 comments on commit 8e57a29

Please sign in to comment.