Skip to content

Commit

Permalink
Merge pull request #62 from ThomasWaldmann/chunker-params
Browse files Browse the repository at this point in the history
Chunker params, fixes #16
  • Loading branch information
ThomasWaldmann committed Jun 21, 2015
2 parents 44ec864 + 41a37e7 commit a487e16
Show file tree
Hide file tree
Showing 8 changed files with 169 additions and 33 deletions.
30 changes: 17 additions & 13 deletions borg/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@
Manifest, Statistics, decode_dict, st_mtime_ns, make_path_safe, StableDict, int_to_bigint, bigint_to_int

ITEMS_BUFFER = 1024 * 1024
CHUNK_MIN = 1024
CHUNK_MAX = 10 * 1024 * 1024
WINDOW_SIZE = 0xfff
CHUNK_MASK = 0xffff

ZEROS = b'\0' * CHUNK_MAX
CHUNK_MIN_EXP = 10 # 2**10 == 1kiB
CHUNK_MAX_EXP = 23 # 2**23 == 8MiB
HASH_WINDOW_SIZE = 0xfff # 4095B
HASH_MASK_BITS = 16 # results in ~64kiB chunks statistically

# defaults, use --chunker-params to override
CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)

utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
Expand Down Expand Up @@ -69,12 +71,12 @@ def fetch_many(self, ids, is_preloaded=False):
class ChunkBuffer:
BUFFER_SIZE = 1 * 1024 * 1024

def __init__(self, key):
def __init__(self, key, chunker_params=CHUNKER_PARAMS):
self.buffer = BytesIO()
self.packer = msgpack.Packer(unicode_errors='surrogateescape')
self.chunks = []
self.key = key
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX,self.key.chunk_seed)
self.chunker = Chunker(self.key.chunk_seed, *chunker_params)

def add(self, item):
self.buffer.write(self.packer.pack(StableDict(item)))
Expand Down Expand Up @@ -104,8 +106,8 @@ def is_full(self):

class CacheChunkBuffer(ChunkBuffer):

def __init__(self, cache, key, stats):
super(CacheChunkBuffer, self).__init__(key)
def __init__(self, cache, key, stats, chunker_params=CHUNKER_PARAMS):
super(CacheChunkBuffer, self).__init__(key, chunker_params)
self.cache = cache
self.stats = stats

Expand All @@ -127,7 +129,8 @@ class IncompatibleFilesystemEncodingError(Error):


def __init__(self, repository, key, manifest, name, cache=None, create=False,
checkpoint_interval=300, numeric_owner=False, progress=False):
checkpoint_interval=300, numeric_owner=False, progress=False,
chunker_params=CHUNKER_PARAMS):
self.cwd = os.getcwd()
self.key = key
self.repository = repository
Expand All @@ -142,8 +145,8 @@ def __init__(self, repository, key, manifest, name, cache=None, create=False,
self.numeric_owner = numeric_owner
self.pipeline = DownloadPipeline(self.repository, self.key)
if create:
self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed)
self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats, chunker_params)
self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
if name in manifest.archives:
raise self.AlreadyExists(name)
self.last_checkpoint = time.time()
Expand All @@ -158,6 +161,7 @@ def __init__(self, repository, key, manifest, name, cache=None, create=False,
raise self.DoesNotExist(name)
info = self.manifest.archives[name]
self.load(info[b'id'])
self.zeros = b'\0' * (1 << chunker_params[1])

def _load_meta(self, id):
data = self.key.decrypt(id, self.repository.get(id))
Expand Down Expand Up @@ -286,7 +290,7 @@ def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sp
with open(path, 'wb') as fd:
ids = [c[0] for c in item[b'chunks']]
for data in self.pipeline.fetch_many(ids, is_preloaded=True):
if sparse and ZEROS.startswith(data):
if sparse and self.zeros.startswith(data):
# all-zero chunk: create a hole in a sparse file
fd.seek(len(data), 1)
else:
Expand Down
11 changes: 8 additions & 3 deletions borg/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@
import traceback

from . import __version__
from .archive import Archive, ArchiveChecker
from .archive import Archive, ArchiveChecker, CHUNKER_PARAMS
from .repository import Repository
from .cache import Cache
from .key import key_creator
from .helpers import Error, location_validator, format_time, format_file_size, \
format_file_mode, ExcludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \
get_cache_dir, get_keys_dir, format_timedelta, prune_within, prune_split, \
Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \
is_cachedir, bigint_to_int
is_cachedir, bigint_to_int, ChunkerParams
from .remote import RepositoryServer, RemoteRepository


Expand Down Expand Up @@ -104,7 +104,8 @@ def do_create(self, args):
cache = Cache(repository, key, manifest, do_files=args.cache_files)
archive = Archive(repository, key, manifest, args.archive.archive, cache=cache,
create=True, checkpoint_interval=args.checkpoint_interval,
numeric_owner=args.numeric_owner, progress=args.progress)
numeric_owner=args.numeric_owner, progress=args.progress,
chunker_params=args.chunker_params)
# Add cache dir to inode_skip list
skip_inodes = set()
try:
Expand Down Expand Up @@ -625,6 +626,10 @@ def run(self, args=None):
metavar='yyyy-mm-ddThh:mm:ss',
help='manually specify the archive creation date/time (UTC). '
'alternatively, give a reference file/directory.')
subparser.add_argument('--chunker-params', dest='chunker_params',
type=ChunkerParams, default=CHUNKER_PARAMS,
metavar='CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE',
help='specify the chunker parameters. default: %d,%d,%d,%d' % CHUNKER_PARAMS)
subparser.add_argument('archive', metavar='ARCHIVE',
type=location_validator(archive=True),
help='archive to create')
Expand Down
7 changes: 5 additions & 2 deletions borg/chunker.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@ cdef extern from "_chunker.c":
cdef class Chunker:
cdef _Chunker *chunker

def __cinit__(self, window_size, chunk_mask, min_size, max_size, seed):
self.chunker = chunker_init(window_size, chunk_mask, min_size, max_size, seed & 0xffffffff)
def __cinit__(self, seed, chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size):
    # Initialize the underlying C chunker from exponent-based parameters.
    # seed: per-repository chunker seed (only the low 32 bits are used, see mask below).
    # chunk_min_exp / chunk_max_exp: exponents N giving 2**N min/max chunk sizes.
    # hash_mask_bits: number of low rolling-hash bits that must be zero to cut a chunk
    #                 (statistically yields ~2**hash_mask_bits sized chunks).
    # hash_window_size: size in bytes of the rolling-hash window.
    min_size = 1 << chunk_min_exp
    max_size = 1 << chunk_max_exp
    # hash_mask selects the low hash_mask_bits bits of the rolling hash.
    hash_mask = (1 << hash_mask_bits) - 1
    # seed & 0xffffffff: truncate to the 32-bit range expected by the C implementation.
    self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)

def chunkify(self, fd, fh=-1):
"""
Expand Down
5 changes: 5 additions & 0 deletions borg/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,11 @@ def timestamp(s):
raise ValueError


def ChunkerParams(s):
    """Parse a --chunker-params argument string into a 4-tuple of ints.

    The expected format is 'CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE'
    (matching the order of CHUNKER_PARAMS and the Chunker constructor).

    :param s: comma-separated string of four integers
    :return: (chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size)
    :raises ValueError: if s does not contain exactly four comma-separated integers
    """
    # NOTE: the previous local names (window_size, chunk_mask, ...) did not match
    # the actual meaning/order of the fields; renamed for clarity. Behavior unchanged.
    chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size = s.split(',')
    return int(chunk_min_exp), int(chunk_max_exp), int(hash_mask_bits), int(hash_window_size)


def is_cachedir(path):
"""Determines whether the specified path is a cache directory (and
therefore should potentially be excluded from the backup) according to
Expand Down
4 changes: 2 additions & 2 deletions borg/testsuite/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from hashlib import sha256

from .. import xattr
from ..archive import Archive, ChunkBuffer, CHUNK_MAX
from ..archive import Archive, ChunkBuffer, CHUNK_MAX_EXP
from ..archiver import Archiver
from ..cache import Cache
from ..crypto import bytes_to_long, num_aes_blocks
Expand Down Expand Up @@ -213,7 +213,7 @@ def test_sparse_file(self):
sparse_support = sys.platform != 'darwin'
filename = os.path.join(self.input_path, 'sparse')
content = b'foobar'
hole_size = 5 * CHUNK_MAX # 5 full chunker buffers
hole_size = 5 * (1 << CHUNK_MAX_EXP) # 5 full chunker buffers
with open(filename, 'wb') as fd:
# create a file that has a hole at the beginning and end (if the
# OS and filesystem supports sparse files)
Expand Down
26 changes: 13 additions & 13 deletions borg/testsuite/chunker.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
from io import BytesIO

from ..chunker import Chunker, buzhash, buzhash_update
from ..archive import CHUNK_MAX
from ..archive import CHUNK_MAX_EXP
from . import BaseTestCase


class ChunkerTestCase(BaseTestCase):

def test_chunkify(self):
data = b'0' * int(1.5 * CHUNK_MAX) + b'Y'
parts = [bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(data))]
data = b'0' * int(1.5 * (1 << CHUNK_MAX_EXP)) + b'Y'
parts = [bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
self.assert_equal(len(parts), 2)
self.assert_equal(b''.join(parts), data)
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b''))], [])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])

def test_buzhash(self):
self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)
Expand Down
116 changes: 116 additions & 0 deletions docs/misc/create_chunker-params.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
About borg create --chunker-params
==================================

--chunker-params CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE

CHUNK_MIN_EXP and CHUNK_MAX_EXP give the exponent N of the 2^N minimum and
maximum chunk size. Required: CHUNK_MIN_EXP < CHUNK_MAX_EXP.

Defaults: 10 (2^10 == 1KiB) minimum, 23 (2^23 == 8MiB) maximum.

HASH_MASK_BITS is the number of least-significant bits of the rolling hash
that need to be zero to trigger a chunk cut.
Recommended: CHUNK_MIN_EXP + X <= HASH_MASK_BITS <= CHUNK_MAX_EXP - X, X >= 2
(this allows the rolling hash some freedom to make its cut at a place
determined by the window's contents rather than the min/max chunk size).

Default: 16 (statistically, chunks will be about 2^16 == 64kiB in size)

HASH_WINDOW_SIZE: the size of the window used for the rolling hash computation.
Default: 4095B


Trying it out
=============

I backed up a VM directory to demonstrate how different chunker parameters
influence repo size, index size / chunk count, compression, deduplication.

repo-sm: ~64kiB chunks (16 bits chunk mask), min chunk size 1kiB (2^10B)
(these are attic / borg 0.23 internal defaults)

repo-lg: ~1MiB chunks (20 bits chunk mask), min chunk size 64kiB (2^16B)

repo-xl: 8MiB chunks (2^23B max chunk size), min chunk size 64kiB (2^16B).
The chunk mask bits was set to 31, so it (almost) never triggers.
This degrades the rolling hash based dedup to a fixed-offset dedup
as the cutting point is now (almost) always the end of the buffer
(at 2^23B == 8MiB).

The repo index size is an indicator for the RAM needs of Borg.
In this special case, the total RAM needs are about 2.1x the repo index size.
You see index size of repo-sm is 16x larger than of repo-lg, which corresponds
to the ratio of the different target chunk sizes.

Note: RAM needs were not a problem in this specific case (37GB data size).
But just imagine, you have 37TB of such data and much less than 42GB RAM,
then you'd definitely want the "lg" chunker params so you only need
2.6GB RAM. Or even bigger chunks than shown for "lg" (see "xl").

You also see compression works better for larger chunks, as expected.
Deduplication works worse for larger chunks, also as expected.

small chunks
============

$ borg info /extra/repo-sm::1

Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 10,23,16,4095 /extra/repo-sm::1 /home/tw/win
Number of files: 3

Original size Compressed size Deduplicated size
This archive: 37.12 GB 14.81 GB 12.18 GB
All archives: 37.12 GB 14.81 GB 12.18 GB

Unique chunks Total chunks
Chunk index: 378374 487316

$ ls -l /extra/repo-sm/index*

-rw-rw-r-- 1 tw tw 20971538 Jun 20 23:39 index.2308

$ du -sk /extra/repo-sm
11930840 /extra/repo-sm

large chunks
============

$ borg info /extra/repo-lg::1

Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 16,23,20,4095 /extra/repo-lg::1 /home/tw/win
Number of files: 3

Original size Compressed size Deduplicated size
This archive: 37.10 GB 14.60 GB 13.38 GB
All archives: 37.10 GB 14.60 GB 13.38 GB

Unique chunks Total chunks
Chunk index: 25889 29349

$ ls -l /extra/repo-lg/index*

-rw-rw-r-- 1 tw tw 1310738 Jun 20 23:10 index.2264

$ du -sk /extra/repo-lg
13073928 /extra/repo-lg

xl chunks
=========

(borg-env)tw@tux:~/w/borg$ borg info /extra/repo-xl::1
Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 16,23,31,4095 /extra/repo-xl::1 /home/tw/win
Number of files: 3

Original size Compressed size Deduplicated size
This archive: 37.10 GB 14.59 GB 14.59 GB
All archives: 37.10 GB 14.59 GB 14.59 GB

Unique chunks Total chunks
Chunk index: 4319 4434

$ ls -l /extra/repo-xl/index*
-rw-rw-r-- 1 tw tw 327698 Jun 21 00:52 index.2011

$ du -sk /extra/repo-xl/
14253464 /extra/repo-xl/

3 changes: 3 additions & 0 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ Examples
NAME="root-`date +%Y-%m-%d`"
$ borg create /mnt/backup::$NAME / --do-not-cross-mountpoints

# Backup huge files with little chunk management overhead
$ borg create --chunker-params 19,23,21,4095 /mnt/backup::VMs /srv/VMs


.. include:: usage/extract.rst.inc

Expand Down

5 comments on commit a487e16

@STRML
Copy link

@STRML STRML commented on a487e16 Sep 30, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, this is a great new feature over Attic. I hope you can get it backported, but if not, it at least makes a very good line-item for a "Why Borg?" article.

@RonnyPfannschmidt
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If attic was propperly accepting contributions Borg wouldn't exist to begin with

@ThomasWaldmann
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea with that "why borg" article. :)

Maybe it could be a section in the docs, giving a high-level overview with the changes compared to attic.

@STRML
Copy link

@STRML STRML commented on a487e16 Sep 30, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think that would be a good idea. Being a newcomer to the project, the only concrete information I could find at first glance was that you wanted to be able to break backcompat more often. This is not the most enticing thing for someone looking for backup software.

After further reading I understand your motivations, and #5 really shows how far Borg has come. An article like this could take focus off of the clickbaity "he wants to break my backups!" message that has been circulating around and onto the message that Attic is mostly unmaintained and this is taking over development.

@ThomasWaldmann
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #224 .

Please sign in to comment.