Skip to content

Commit

Permalink
Merge pull request #62 from ThomasWaldmann/chunker-params
Browse files Browse the repository at this point in the history
Chunker params, fixes #16
  • Loading branch information
ThomasWaldmann committed Jun 21, 2015
2 parents 44ec864 + 41a37e7 commit a487e16
Show file tree
Hide file tree
Showing 8 changed files with 169 additions and 33 deletions.
30 changes: 17 additions & 13 deletions borg/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@
Manifest, Statistics, decode_dict, st_mtime_ns, make_path_safe, StableDict, int_to_bigint, bigint_to_int

ITEMS_BUFFER = 1024 * 1024
CHUNK_MIN = 1024
CHUNK_MAX = 10 * 1024 * 1024
WINDOW_SIZE = 0xfff
CHUNK_MASK = 0xffff

ZEROS = b'\0' * CHUNK_MAX
CHUNK_MIN_EXP = 10 # 2**10 == 1kiB
CHUNK_MAX_EXP = 23 # 2**23 == 8MiB
HASH_WINDOW_SIZE = 0xfff # 4095B
HASH_MASK_BITS = 16 # results in ~64kiB chunks statistically

# defaults, use --chunker-params to override
CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)

utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
Expand Down Expand Up @@ -69,12 +71,12 @@ def fetch_many(self, ids, is_preloaded=False):
class ChunkBuffer:
BUFFER_SIZE = 1 * 1024 * 1024

def __init__(self, key):
def __init__(self, key, chunker_params=CHUNKER_PARAMS):
self.buffer = BytesIO()
self.packer = msgpack.Packer(unicode_errors='surrogateescape')
self.chunks = []
self.key = key
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX,self.key.chunk_seed)
self.chunker = Chunker(self.key.chunk_seed, *chunker_params)

def add(self, item):
self.buffer.write(self.packer.pack(StableDict(item)))
Expand Down Expand Up @@ -104,8 +106,8 @@ def is_full(self):

class CacheChunkBuffer(ChunkBuffer):

def __init__(self, cache, key, stats):
super(CacheChunkBuffer, self).__init__(key)
def __init__(self, cache, key, stats, chunker_params=CHUNKER_PARAMS):
super(CacheChunkBuffer, self).__init__(key, chunker_params)
self.cache = cache
self.stats = stats

Expand All @@ -127,7 +129,8 @@ class IncompatibleFilesystemEncodingError(Error):


def __init__(self, repository, key, manifest, name, cache=None, create=False,
checkpoint_interval=300, numeric_owner=False, progress=False):
checkpoint_interval=300, numeric_owner=False, progress=False,
chunker_params=CHUNKER_PARAMS):
self.cwd = os.getcwd()
self.key = key
self.repository = repository
Expand All @@ -142,8 +145,8 @@ def __init__(self, repository, key, manifest, name, cache=None, create=False,
self.numeric_owner = numeric_owner
self.pipeline = DownloadPipeline(self.repository, self.key)
if create:
self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed)
self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats, chunker_params)
self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
if name in manifest.archives:
raise self.AlreadyExists(name)
self.last_checkpoint = time.time()
Expand All @@ -158,6 +161,7 @@ def __init__(self, repository, key, manifest, name, cache=None, create=False,
raise self.DoesNotExist(name)
info = self.manifest.archives[name]
self.load(info[b'id'])
self.zeros = b'\0' * (1 << chunker_params[1])

def _load_meta(self, id):
data = self.key.decrypt(id, self.repository.get(id))
Expand Down Expand Up @@ -286,7 +290,7 @@ def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sp
with open(path, 'wb') as fd:
ids = [c[0] for c in item[b'chunks']]
for data in self.pipeline.fetch_many(ids, is_preloaded=True):
if sparse and ZEROS.startswith(data):
if sparse and self.zeros.startswith(data):
# all-zero chunk: create a hole in a sparse file
fd.seek(len(data), 1)
else:
Expand Down
11 changes: 8 additions & 3 deletions borg/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@
import traceback

from . import __version__
from .archive import Archive, ArchiveChecker
from .archive import Archive, ArchiveChecker, CHUNKER_PARAMS
from .repository import Repository
from .cache import Cache
from .key import key_creator
from .helpers import Error, location_validator, format_time, format_file_size, \
format_file_mode, ExcludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \
get_cache_dir, get_keys_dir, format_timedelta, prune_within, prune_split, \
Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \
is_cachedir, bigint_to_int
is_cachedir, bigint_to_int, ChunkerParams
from .remote import RepositoryServer, RemoteRepository


Expand Down Expand Up @@ -104,7 +104,8 @@ def do_create(self, args):
cache = Cache(repository, key, manifest, do_files=args.cache_files)
archive = Archive(repository, key, manifest, args.archive.archive, cache=cache,
create=True, checkpoint_interval=args.checkpoint_interval,
numeric_owner=args.numeric_owner, progress=args.progress)
numeric_owner=args.numeric_owner, progress=args.progress,
chunker_params=args.chunker_params)
# Add cache dir to inode_skip list
skip_inodes = set()
try:
Expand Down Expand Up @@ -625,6 +626,10 @@ def run(self, args=None):
metavar='yyyy-mm-ddThh:mm:ss',
help='manually specify the archive creation date/time (UTC). '
'alternatively, give a reference file/directory.')
subparser.add_argument('--chunker-params', dest='chunker_params',
type=ChunkerParams, default=CHUNKER_PARAMS,
metavar='CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE',
help='specify the chunker parameters. default: %d,%d,%d,%d' % CHUNKER_PARAMS)
subparser.add_argument('archive', metavar='ARCHIVE',
type=location_validator(archive=True),
help='archive to create')
Expand Down
7 changes: 5 additions & 2 deletions borg/chunker.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@ cdef extern from "_chunker.c":
cdef class Chunker:
cdef _Chunker *chunker

def __cinit__(self, window_size, chunk_mask, min_size, max_size, seed):
self.chunker = chunker_init(window_size, chunk_mask, min_size, max_size, seed & 0xffffffff)
def __cinit__(self, seed, chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size):
    # Initialize the underlying C chunker from exponent-based parameters.
    # seed: per-repository chunker seed (only the low 32 bits are used, see mask below).
    # chunk_min_exp / chunk_max_exp: exponents N giving 2**N min/max chunk sizes.
    # hash_mask_bits: number of low rolling-hash bits that must be zero to cut a chunk
    #                 (statistically yields ~2**hash_mask_bits sized chunks).
    # hash_window_size: size in bytes of the rolling-hash window.
    min_size = 1 << chunk_min_exp
    max_size = 1 << chunk_max_exp
    # hash_mask selects the low hash_mask_bits bits of the rolling hash.
    hash_mask = (1 << hash_mask_bits) - 1
    # seed & 0xffffffff: truncate to the 32-bit range expected by the C implementation.
    self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)

def chunkify(self, fd, fh=-1):
"""
Expand Down
5 changes: 5 additions & 0 deletions borg/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,11 @@ def timestamp(s):
raise ValueError


def ChunkerParams(s):
    """Parse a --chunker-params argument string into a 4-tuple of ints.

    The expected format is 'CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE'
    (matching the order of CHUNKER_PARAMS and the Chunker constructor).

    :param s: comma-separated string of four integers
    :return: (chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size)
    :raises ValueError: if s does not contain exactly four comma-separated integers
    """
    # NOTE: the previous local names (window_size, chunk_mask, ...) did not match
    # the actual meaning/order of the fields; renamed for clarity. Behavior unchanged.
    chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size = s.split(',')
    return int(chunk_min_exp), int(chunk_max_exp), int(hash_mask_bits), int(hash_window_size)


def is_cachedir(path):
"""Determines whether the specified path is a cache directory (and
therefore should potentially be excluded from the backup) according to
Expand Down
4 changes: 2 additions & 2 deletions borg/testsuite/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from hashlib import sha256

from .. import xattr
from ..archive import Archive, ChunkBuffer, CHUNK_MAX
from ..archive import Archive, ChunkBuffer, CHUNK_MAX_EXP
from ..archiver import Archiver
from ..cache import Cache
from ..crypto import bytes_to_long, num_aes_blocks
Expand Down Expand Up @@ -213,7 +213,7 @@ def test_sparse_file(self):
sparse_support = sys.platform != 'darwin'
filename = os.path.join(self.input_path, 'sparse')
content = b'foobar'
hole_size = 5 * CHUNK_MAX # 5 full chunker buffers
hole_size = 5 * (1 << CHUNK_MAX_EXP) # 5 full chunker buffers
with open(filename, 'wb') as fd:
# create a file that has a hole at the beginning and end (if the
# OS and filesystem supports sparse files)
Expand Down
26 changes: 13 additions & 13 deletions borg/testsuite/chunker.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
from io import BytesIO

from ..chunker import Chunker, buzhash, buzhash_update
from ..archive import CHUNK_MAX
from ..archive import CHUNK_MAX_EXP
from . import BaseTestCase


class ChunkerTestCase(BaseTestCase):

def test_chunkify(self):
data = b'0' * int(1.5 * CHUNK_MAX) + b'Y'
parts = [bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(data))]
data = b'0' * int(1.5 * (1 << CHUNK_MAX_EXP)) + b'Y'
parts = [bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
self.assert_equal(len(parts), 2)
self.assert_equal(b''.join(parts), data)
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b''))], [])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])

def test_buzhash(self):
self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)
Expand Down
116 changes: 116 additions & 0 deletions docs/misc/create_chunker-params.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
About borg create --chunker-params
==================================

--chunker-params CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE

CHUNK_MIN_EXP and CHUNK_MAX_EXP give the exponent N of the 2^N minimum and
maximum chunk size. Required: CHUNK_MIN_EXP < CHUNK_MAX_EXP.

Defaults: 10 (2^10 == 1KiB) minimum, 23 (2^23 == 8MiB) maximum.

HASH_MASK_BITS is the number of least-significant bits of the rolling hash
that need to be zero to trigger a chunk cut.
Recommended: CHUNK_MIN_EXP + X <= HASH_MASK_BITS <= CHUNK_MAX_EXP - X, X >= 2
(this allows the rolling hash some freedom to make its cut at a place
determined by the window's contents rather than the min/max chunk size).

Default: 16 (statistically, chunks will be about 2^16 == 64kiB in size)

HASH_WINDOW_SIZE: the size of the window used for the rolling hash computation.
Default: 4095B


Trying it out
=============

I backed up a VM directory to demonstrate how different chunker parameters
influence repo size, index size / chunk count, compression, deduplication.

repo-sm: ~64kiB chunks (16 bits chunk mask), min chunk size 1kiB (2^10B)
(these are attic / borg 0.23 internal defaults)

repo-lg: ~1MiB chunks (20 bits chunk mask), min chunk size 64kiB (2^16B)

repo-xl: 8MiB chunks (2^23B max chunk size), min chunk size 64kiB (2^16B).
The chunk mask bits was set to 31, so it (almost) never triggers.
This degrades the rolling hash based dedup to a fixed-offset dedup
as the cutting point is now (almost) always the end of the buffer
(at 2^23B == 8MiB).

The repo index size is an indicator for the RAM needs of Borg.
In this special case, the total RAM needs are about 2.1x the repo index size.
You see index size of repo-sm is 16x larger than of repo-lg, which corresponds
to the ratio of the different target chunk sizes.

Note: RAM needs were not a problem in this specific case (37GB data size).
But just imagine, you have 37TB of such data and much less than 42GB RAM,
then you'd definitely want the "lg" chunker params so you only need
2.6GB RAM. Or even bigger chunks than shown for "lg" (see "xl").

You also see compression works better for larger chunks, as expected.
Deduplication works worse for larger chunks, also as expected.

small chunks
============

$ borg info /extra/repo-sm::1

Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 10,23,16,4095 /extra/repo-sm::1 /home/tw/win
Number of files: 3

Original size Compressed size Deduplicated size
This archive: 37.12 GB 14.81 GB 12.18 GB
All archives: 37.12 GB 14.81 GB 12.18 GB

Unique chunks Total chunks
Chunk index: 378374 487316

$ ls -l /extra/repo-sm/index*

-rw-rw-r-- 1 tw tw 20971538 Jun 20 23:39 index.2308

$ du -sk /extra/repo-sm
11930840 /extra/repo-sm

large chunks
============

$ borg info /extra/repo-lg::1

Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 16,23,20,4095 /extra/repo-lg::1 /home/tw/win
Number of files: 3

Original size Compressed size Deduplicated size
This archive: 37.10 GB 14.60 GB 13.38 GB
All archives: 37.10 GB 14.60 GB 13.38 GB

Unique chunks Total chunks
Chunk index: 25889 29349

$ ls -l /extra/repo-lg/index*

-rw-rw-r-- 1 tw tw 1310738 Jun 20 23:10 index.2264

$ du -sk /extra/repo-lg
13073928 /extra/repo-lg

xl chunks
=========

(borg-env)tw@tux:~/w/borg$ borg info /extra/repo-xl::1
Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 16,23,31,4095 /extra/repo-xl::1 /home/tw/win
Number of files: 3

Original size Compressed size Deduplicated size
This archive: 37.10 GB 14.59 GB 14.59 GB
All archives: 37.10 GB 14.59 GB 14.59 GB

Unique chunks Total chunks
Chunk index: 4319 4434

$ ls -l /extra/repo-xl/index*
-rw-rw-r-- 1 tw tw 327698 Jun 21 00:52 index.2011

$ du -sk /extra/repo-xl/
14253464 /extra/repo-xl/

3 changes: 3 additions & 0 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ Examples
NAME="root-`date +%Y-%m-%d`"
$ borg create /mnt/backup::$NAME / --do-not-cross-mountpoints

# Backup huge files with little chunk management overhead
$ borg create --chunker-params 19,23,21,4095 /mnt/backup::VMs /srv/VMs


.. include:: usage/extract.rst.inc

Expand Down

5 comments on commit a487e16

@STRML
Copy link

@STRML STRML commented on a487e16 Sep 30, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, this is a great new feature over Attic. I hope you can get it backported, but if not, it at least makes a very good line-item for a "Why Borg?" article.

@RonnyPfannschmidt
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If attic was propperly accepting contributions Borg wouldn't exist to begin with

@ThomasWaldmann
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea with that "why borg" article. :)

Maybe it could be a section in the docs, giving a high-level overview with the changes compared to attic.

@STRML
Copy link

@STRML STRML commented on a487e16 Sep 30, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think that would be a good idea. Being a newcomer to the project, the only concrete information I could find at first glance was that you wanted to be able to break backcompat more often. This is not the most enticing thing for someone looking for backup software.

After further reading I understand your motivations, and #5 really shows how far Borg has come. An article like this could take focus off of the clickbaity "he wants to break my backups!" message that has been circulating around and onto the message that Attic is mostly unmaintained and this is taking over development.

@ThomasWaldmann
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #224 .

Please sign in to comment.