-
Notifications
You must be signed in to change notification settings - Fork 20
/
huffman.hpp
1188 lines (998 loc) · 34.5 KB
/
huffman.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// ================================================================================================
// -*- C++ -*-
// File: huffman.hpp
// Author: Guilherme R. Lampert
// Created on: 15/02/16
// Brief: Minimal implementation of Huffman encoding/decoding in C++11.
// ================================================================================================
#ifndef HUFFMAN_HPP
#define HUFFMAN_HPP
// ---------
// LICENSE
// ---------
// This software is in the public domain. Where that dedication is not recognized,
// you are granted a perpetual, irrevocable license to copy, distribute, and modify
// this file as you see fit.
//
// The source code is provided "as is", without warranty of any kind, express or implied.
// No attribution is required, but a mention about the author is appreciated.
//
// -------------
// QUICK SETUP
// -------------
// In *one* C++ source file, *before* including this file, do this:
//
// #define HUFFMAN_IMPLEMENTATION
//
// To enable the implementation. Further includes of this
// file *should not* redefine HUFFMAN_IMPLEMENTATION.
// Example:
//
// In my_program.cpp:
//
// #define HUFFMAN_IMPLEMENTATION
// #include "huffman.hpp"
//
// In my_program.hpp:
//
// #include "huffman.hpp"
//
// ----------
// OVERVIEW
// ----------
// As-simple-as-possible Huffman encoding and decoding.
//
// The huffman::Encoder and huffman::Decoder classes do
// all the Huffman related work, the rest is just support
// code for bit streams and integer bit manipulation.
//
// easyEncode()/easyDecode() functions are provided
// for quick-n'-easy compression/decompression of raw data
// buffers.
//
// The size of a Huffman code is limited to 64 bits (to fit
// inside a uint64) and no additional handing is done if this
// size is overflowed. It will just log an error and ignore
// further bits.
//
// Symbols are byte-sized so that we can limit the number of leaf
// nodes to 256. There is an extra of 512 nodes for inner nodes,
// but again if that size is exceeded we just log an error and
// ignore.
//
// You can override the HUFFMAN_ERROR() macro to supply your
// own error handling strategy. The default simply writes to
// stderr and calls std::abort().
//
// Memory allocated explicitly by the bit streams will be sourced
// from HUFFMAN_MALLOC/HUFFMAN_MFREE, so you can override the macros
// to add custom memory management. Current that's all the memory
// we allocate directly, but we also use std::priority_queue<> to
// build the Huffman tree and the queue will allocate memory from
// the global heap, so you won't be able to catch that, unfortunately.
//
// The huffman::Node struct is not very optimized for size.
// We use full signed integers for the value and child indexes,
// while the 32-bits range is not strictly needed. All we need
// is the ability to set them to -1 (used as a sentinel value),
// so a signed short would do for the value and child links.
// If you're interested in optimizing for memory usage/size,
// start from there.
//
// The same could also probably be done for huffman::Code.
// If you're sure you'll never need more than 32 bits for
// a code, you could replace the code long-word by a uint32.
// The accompanying code length is an int, which is also overkill,
// since it will never be > 64 (or 32), so if you care about
// saving space in the structure members, you might replace it
// with a byte or short and pack the structure.
//
// --------------
// USEFUL LINKS
// --------------
// Wikipedia:
// https://en.wikipedia.org/wiki/Huffman_coding
//
// Nice quick video tutorial on Huffman coding:
// https://youtu.be/apcCVfXfcqE
#include <cstdint>
#include <cstdlib>
#include <array>
#include <queue>
#include <vector>
// Disable the bit stream => std::string dumping methods.
#ifndef HUFFMAN_NO_STD_STRING
#include <string>
#endif // HUFFMAN_NO_STD_STRING
// If you provide a custom malloc(), you must also provide a custom free().
// Note: We never check HUFFMAN_MALLOC's return for null. A custom implementation
// should just abort with a fatal error if the program runs out of memory.
#ifndef HUFFMAN_MALLOC
#define HUFFMAN_MALLOC std::malloc
#define HUFFMAN_MFREE std::free
#endif // HUFFMAN_MALLOC
namespace huffman
{
// ========================================================
// The default fatalError() function writes to stderr and aborts.
#ifndef HUFFMAN_ERROR
void fatalError(const char * message);
#define HUFFMAN_USING_DEFAULT_ERROR_HANDLER
#define HUFFMAN_ERROR(message) ::huffman::fatalError(message)
#endif // HUFFMAN_ERROR
// ========================================================
// class Code:
// ========================================================
class Code final
{
public:
static constexpr int MaxBits = 64;
Code()
{
clear();
}
void clear()
{
bits = 0;
length = 0;
}
void appendBit(const int bit)
{
if (length == MaxBits)
{
HUFFMAN_ERROR("Max code length exceeded!");
return;
}
// Using one of the Stanford bit-hacks to set/clear the bit:
// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching
const std::uint64_t mask = std::uint64_t(1) << length;
bits = (bits & ~mask) | (-bit & mask);
++length;
}
void appendCode(const Code other)
{
for (int b = 0; b < other.getLength(); ++b)
{
appendBit(other.getBit(b));
}
}
int getBit(const int index) const
{
const std::uint64_t mask = std::uint64_t(1) << index;
return !!(bits & mask);
}
#ifndef HUFFMAN_NO_STD_STRING
std::string toBitString() const // Useful for debugging.
{
std::string bitString;
for (int b = 0; b < getLength(); ++b)
{
bitString += (getBit(b) ? '1' : '0');
}
return bitString;
}
#endif // HUFFMAN_NO_STD_STRING
int getLength() const { return length; }
void setLength(const int lenInBits) { length = lenInBits; }
std::uint64_t getAsU64() const { return bits; }
void setAsU64(const std::uint64_t num) { bits = num; }
bool operator == (const Code other) const { return bits == other.bits && length == other.length; }
bool operator != (const Code other) const { return !(*this == other); }
private:
std::uint64_t bits; // The long word storing our code bits, from right to left.
int length; // Bit position within the bits word to write next. Same as the current length in bits.
};
// ========================================================
// class BitStreamWriter:
// ========================================================
class BitStreamWriter final
{
public:
// No copy/assignment.
BitStreamWriter(const BitStreamWriter &) = delete;
BitStreamWriter & operator = (const BitStreamWriter &) = delete;
BitStreamWriter();
explicit BitStreamWriter(int initialSizeInBits, int growthGranularity = 2);
void allocate(int bitsWanted);
void setGranularity(int growthGranularity);
std::uint8_t * release();
void appendBit(int bit);
void appendBitsU64(std::uint64_t num, int bitCount);
void appendCode(Code code);
#ifndef HUFFMAN_NO_STD_STRING
std::string toBitString() const; // Useful for debugging.
void appendBitString(const std::string & bitStr);
#endif // HUFFMAN_NO_STD_STRING
int getByteCount() const;
int getBitCount() const;
const std::uint8_t * getBitStream() const;
~BitStreamWriter();
private:
void internalInit();
static std::uint8_t * allocBytes(int bytesWanted, std::uint8_t * oldPtr, int oldSize);
std::uint8_t * stream; // Growable buffer to store our bits. Heap allocated & owned by the class instance.
int bytesAllocated; // Current size of heap-allocated stream buffer *in bytes*.
int granularity; // Amount bytesAllocated multiplies by when auto-resizing in appendBit().
int currBytePos; // Current byte being written to, from 0 to bytesAllocated-1.
int nextBitPos; // Bit position within the current byte to access next. 0 to 7.
int numBitsWritten; // Number of bits in use from the stream buffer, not including byte-rounding padding.
};
// ========================================================
// class BitStreamReader:
// ========================================================
class BitStreamReader final
{
public:
// No copy/assignment.
BitStreamReader(const BitStreamReader &) = delete;
BitStreamReader & operator = (const BitStreamReader &) = delete;
BitStreamReader(const BitStreamWriter & bitStreamWriter);
BitStreamReader(const std::uint8_t * bitStream, int byteCount, int bitCount);
void reset();
bool readNextBit();
std::uint64_t readBitsU64(int bitCount);
// Basic stream info:
int getByteCount() const { return sizeInBytes; }
int getBitCount() const { return sizeInBits; }
const std::uint8_t * getBitStream() const { return stream; }
// Current Huffman code being read from the stream:
void clearCode() { currCode.clear(); }
Code getCode() const { return currCode; }
int getCodeLength() const { return currCode.getLength(); }
private:
const std::uint8_t * stream; // Pointer to the external bit stream. Not owned by the reader.
const int sizeInBytes; // Size of the stream *in bytes*. Might include padding.
const int sizeInBits; // Size of the stream *in bits*, padding *not* include.
int currBytePos; // Current byte being read in the stream.
int nextBitPos; // Bit position within the current byte to access next. 0 to 7.
int numBitsRead; // Total bits read from the stream so far. Never includes byte-rounding padding.
Code currCode; // Current Huffman code being built from the input bit stream.
};
// ========================================================
// Huffman Tree Node:
// ========================================================
constexpr int Nil = -1;
constexpr int MaxSymbols = 256;
constexpr int MaxNodes = MaxSymbols + 512;
struct Node final
{
int frequency = Nil; // Occurrence count; Nil if not in use.
int leftChild = Nil; // Left gets code 0 assigned to it; Nil initially
int rightChild = Nil; // Right gets code 1 assigned to it; Nil initially.
int value = Nil; // Symbol value of this node. Interpreted as a byte.
Code code; // Huffman code that will be assigned to this node.
bool isValid() const { return frequency != Nil; }
bool isLeaf() const { return leftChild == Nil && rightChild == Nil; }
};
// ========================================================
// Huffman encoder class:
// ========================================================
class Encoder final
{
public:
// No copy/assignment.
Encoder(const Encoder &) = delete;
Encoder & operator = (const Encoder &) = delete;
// Constructor will start the encoding process,
// building the Huffman tree and creating the output stream.
// Call getBitStreamWriter() to fetch the results.
Encoder(const std::uint8_t * data, int dataSizeBytes, bool prependTreeToBitStream);
// Find node can be used by a decoder to reconstruct
// the original data from a bit stream of Huffman codes.
const Node * findNodeForCode(Code code) const;
// Get the bit stream generated from the data.
// The stream will be prefixed with the Huffman tree codes
// if prependTreeToBitStream was set in the constructor.
const BitStreamWriter & getBitStreamWriter() const;
BitStreamWriter & getBitStreamWriter();
// Get the length in bits of the tree data prefixed
// to the stream if prependTreeToBitStream was true.
// Always rounded to a byte boundary.
int getTreePrefixBits() const;
private:
// Priority queue comparator.
// Places nodes with lower frequency first.
struct NodeCmp
{
bool operator()(const Node * a, const Node * b) const
{
return a->frequency > b->frequency;
}
};
// Queue stores pointers into the nodes[] array below.
using PQueue = std::priority_queue<Node *, std::vector<Node *>, NodeCmp>;
// Internal helpers:
void buildHuffmanTree();
void writeTreeBitStream();
void writeDataBitStream(const std::uint8_t * data, int dataSizeBytes);
void countFrequencies(const std::uint8_t * data, int dataSizeBytes);
void recursiveAssignCodes(Node * node, const Node * parent, int bit);
const Node * recursiveFindLeaf(const Node * node, Code code) const;
Node * addInnerNode(int frequency, int child0, int child1);
private:
// Output bit stream (will allocate some heap memory).
BitStreamWriter bitStream;
Node * treeRoot;
int treePrefixBits;
// Fixed-size pool of nodes. We don't explicitly allocate memory in the encoder.
std::array<Node, MaxNodes> nodes;
};
// ========================================================
// Huffman decoder class:
// ========================================================
class Decoder final
{
public:
// No copy/assignment.
Decoder(const Decoder &) = delete;
Decoder & operator = (const Decoder &) = delete;
// Start the decoder from a bit stream:
explicit Decoder(const BitStreamWriter & encodedBitStream);
Decoder(const std::uint8_t * encodedData, int encodedSizeBytes, int encodedSizeBits);
// Runs the decoding loop, writing to the user buffer.
// Returns the number of *bytes* decoded, which might differ
// from dataSizeBytes if there is an error or size mismatch.
int decode(std::uint8_t * data, int dataSizeBytes);
private:
// Internal helpers:
void readPrefixData();
int findMatchingCode(const Code code) const;
// Helps us manipulate the external raw buffer.
BitStreamReader bitStream;
// Assume the output is only storing the leaf nodes.
// We also don't need to store a full Node here, just
// its code, since the value/symbol is implicit by the
// position within the array.
std::array<Code, MaxSymbols> codes;
};
// ========================================================
// easyEncode() / easyDecode():
// ========================================================
// Quick Huffman data compression.
// Output compressed data is heap allocated with HUFFMAN_MALLOC()
// and should be later freed with HUFFMAN_MFREE().
void easyEncode(const std::uint8_t * uncompressed, int uncompressedSizeBytes,
std::uint8_t ** compressed, int * compressedSizeBytes, int * compressedSizeBits);
// Decompress back the output of easyEncode().
// The uncompressed output buffer is assumed to be big enough to hold the uncompressed data,
// if it happens to be smaller, the decoder will return a partial output and the return value
// of this function will be less than uncompressedSizeBytes.
int easyDecode(const std::uint8_t * compressed, int compressedSizeBytes, int compressedSizeBits,
std::uint8_t * uncompressed, int uncompressedSizeBytes);
} // namespace huffman {}
// ================== End of header file ==================
#endif // HUFFMAN_HPP
// ================== End of header file ==================
// ================================================================================================
//
// Huffman Implementation
//
// ================================================================================================
#ifdef HUFFMAN_IMPLEMENTATION
#ifdef HUFFMAN_USING_DEFAULT_ERROR_HANDLER
#include <cstdio> // For the default error handler
#endif // HUFFMAN_USING_DEFAULT_ERROR_HANDLER
#include <cassert>
#include <cstring>
namespace huffman
{
// ========================================================
// Local helpers:
// ========================================================
// Round up to the next power-of-two number, e.g. 37 => 64
static int nextPowerOfTwo(int num)
{
--num;
for (std::size_t i = 1; i < sizeof(num) * 8; i <<= 1)
{
num = num | num >> i;
}
return ++num;
}
// ========================================================
// Count the minimum number of bits required to
// represent the integer 'num', AKA its log2.
static int bitsForInteger(int num)
{
int bits = 0;
while (num > 0)
{
num = num >> 1;
++bits;
}
return bits;
}
// ========================================================
#ifdef HUFFMAN_USING_DEFAULT_ERROR_HANDLER
// Prints a fatal error to stderr and aborts the process.
// This is the default method used by HUFFMAN_ERROR(), but
// you can override the macro to use other error handling
// mechanisms, such as C++ exceptions.
void fatalError(const char * const message)
{
std::fprintf(stderr, "Huffman encoder/decoder error: %s\n", message);
std::abort();
}
#endif // HUFFMAN_USING_DEFAULT_ERROR_HANDLER
// ========================================================
// class BitStreamWriter:
// ========================================================
BitStreamWriter::BitStreamWriter()
{
// 8192 bits for a start (1024 bytes). It will resize if needed.
// Default granularity is 2.
internalInit();
allocate(8192);
}
BitStreamWriter::BitStreamWriter(const int initialSizeInBits, const int growthGranularity)
{
internalInit();
setGranularity(growthGranularity);
allocate(initialSizeInBits);
}
BitStreamWriter::~BitStreamWriter()
{
if (stream != nullptr)
{
HUFFMAN_MFREE(stream);
}
}
void BitStreamWriter::internalInit()
{
stream = nullptr;
bytesAllocated = 0;
granularity = 2;
currBytePos = 0;
nextBitPos = 0;
numBitsWritten = 0;
}
void BitStreamWriter::allocate(int bitsWanted)
{
// Require at least a byte.
if (bitsWanted <= 0)
{
bitsWanted = 8;
}
// Round upwards if needed:
if ((bitsWanted % 8) != 0)
{
bitsWanted = nextPowerOfTwo(bitsWanted);
}
// We might already have the required count.
const int sizeInBytes = bitsWanted / 8;
if (sizeInBytes <= bytesAllocated)
{
return;
}
stream = allocBytes(sizeInBytes, stream, bytesAllocated);
bytesAllocated = sizeInBytes;
}
void BitStreamWriter::appendBit(const int bit)
{
const std::uint32_t mask = std::uint32_t(1) << nextBitPos;
stream[currBytePos] = (stream[currBytePos] & ~mask) | (-bit & mask);
++numBitsWritten;
if (++nextBitPos == 8)
{
nextBitPos = 0;
if (++currBytePos == bytesAllocated)
{
allocate(bytesAllocated * granularity * 8);
}
}
}
void BitStreamWriter::appendBitsU64(const std::uint64_t num, const int bitCount)
{
assert(bitCount <= 64);
for (int b = 0; b < bitCount; ++b)
{
const std::uint64_t mask = std::uint64_t(1) << b;
const int bit = !!(num & mask);
appendBit(bit);
}
}
void BitStreamWriter::appendCode(const Code code)
{
for (int b = 0; b < code.getLength(); ++b)
{
appendBit(code.getBit(b));
}
}
#ifndef HUFFMAN_NO_STD_STRING
void BitStreamWriter::appendBitString(const std::string & bitStr)
{
for (std::size_t i = 0; i < bitStr.length(); ++i)
{
appendBit(bitStr[i] == '0' ? 0 : 1);
}
}
std::string BitStreamWriter::toBitString() const
{
std::string bitString;
int usedBytes = numBitsWritten / 8;
int leftovers = numBitsWritten % 8;
if (leftovers != 0)
{
++usedBytes;
}
assert(usedBytes <= bytesAllocated);
for (int i = 0; i < usedBytes; ++i)
{
const int nBits = (leftovers == 0) ? 8 : (i == usedBytes - 1) ? leftovers : 8;
for (int j = 0; j < nBits; ++j)
{
bitString += (stream[i] & (1 << j) ? '1' : '0');
}
}
return bitString;
}
#endif // HUFFMAN_NO_STD_STRING
std::uint8_t * BitStreamWriter::release()
{
std::uint8_t * oldPtr = stream;
internalInit();
return oldPtr;
}
void BitStreamWriter::setGranularity(const int growthGranularity)
{
granularity = (growthGranularity >= 2) ? growthGranularity : 2;
}
int BitStreamWriter::getByteCount() const
{
int usedBytes = numBitsWritten / 8;
int leftovers = numBitsWritten % 8;
if (leftovers != 0)
{
++usedBytes;
}
assert(usedBytes <= bytesAllocated);
return usedBytes;
}
int BitStreamWriter::getBitCount() const
{
return numBitsWritten;
}
const std::uint8_t * BitStreamWriter::getBitStream() const
{
return stream;
}
std::uint8_t * BitStreamWriter::allocBytes(const int bytesWanted, std::uint8_t * oldPtr, const int oldSize)
{
std::uint8_t * newMemory = static_cast<std::uint8_t *>(HUFFMAN_MALLOC(bytesWanted));
std::memset(newMemory, 0, bytesWanted);
if (oldPtr != nullptr)
{
std::memcpy(newMemory, oldPtr, oldSize);
HUFFMAN_MFREE(oldPtr);
}
return newMemory;
}
// ========================================================
// class BitStreamReader:
// ========================================================
BitStreamReader::BitStreamReader(const BitStreamWriter & bitStreamWriter)
: stream(bitStreamWriter.getBitStream())
, sizeInBytes(bitStreamWriter.getByteCount())
, sizeInBits(bitStreamWriter.getBitCount())
{
reset();
}
BitStreamReader::BitStreamReader(const std::uint8_t * bitStream, const int byteCount, const int bitCount)
: stream(bitStream)
, sizeInBytes(byteCount)
, sizeInBits(bitCount)
{
reset();
}
bool BitStreamReader::readNextBit()
{
if (numBitsRead >= sizeInBits)
{
return false; // We are done.
}
const std::uint32_t mask = std::uint32_t(1) << nextBitPos;
const int bit = !!(stream[currBytePos] & mask);
++numBitsRead;
if (++nextBitPos == 8)
{
nextBitPos = 0;
++currBytePos;
}
currCode.appendBit(bit);
return true;
}
std::uint64_t BitStreamReader::readBitsU64(const int bitCount)
{
assert(bitCount <= 64);
// We can reuse the Code reading infrastructure for this.
// This is arguably a little hackish, but gets the job done...
currCode.clear();
for (int b = 0; b < bitCount; ++b)
{
if (!readNextBit())
{
HUFFMAN_ERROR("Failed to read bits from stream! Unexpected end.");
break;
}
}
return currCode.getAsU64();
}
void BitStreamReader::reset()
{
currBytePos = 0;
nextBitPos = 0;
numBitsRead = 0;
currCode.clear();
}
// ========================================================
// class Encoder:
// ========================================================
Encoder::Encoder(const std::uint8_t * data, const int dataSizeBytes, const bool prependTreeToBitStream)
: treeRoot(nullptr)
, treePrefixBits(0)
{
countFrequencies(data, dataSizeBytes);
buildHuffmanTree();
if (prependTreeToBitStream)
{
writeTreeBitStream();
}
writeDataBitStream(data, dataSizeBytes);
}
void Encoder::buildHuffmanTree()
{
PQueue pQueue;
// Put each symbol node into a priority queue:
for (int s = 0; s < MaxSymbols; ++s)
{
if (nodes[s].isValid())
{
pQueue.push(&nodes[s]);
}
}
// Build the tree using the following algorithm:
//
// While there is more than one node in the queue:
// - Remove the two nodes of highest priority (lowest frequency) from the queue;
// - Create a new internal node with these two nodes as children
// and with frequency equal to the sum of the two nodes' frequencies;
// - Add the new node to the queue;
// Repeat;
//
// The remaining node is the root node and the tree is complete.
//
while (pQueue.size() > 1)
{
const auto child0 = pQueue.top();
pQueue.pop();
const auto child1 = pQueue.top();
pQueue.pop();
pQueue.push(addInnerNode(child0->frequency + child1->frequency, child0->value, child1->value));
}
// Now we can assign the binary codes, starting from 0 at the root:
assert(!pQueue.empty());
treeRoot = pQueue.top();
recursiveAssignCodes(treeRoot, nullptr, 0);
}
Node * Encoder::addInnerNode(const int frequency, const int leftChild, const int rightChild)
{
// Find a free slot:
// First 256 nodes are reserved for the data symbols,
// (leaf nodes) with the inner nodes following.
for (int n = MaxSymbols; n < MaxNodes; ++n)
{
if (!nodes[n].isValid())
{
nodes[n].frequency = frequency;
nodes[n].leftChild = leftChild;
nodes[n].rightChild = rightChild;
nodes[n].value = n;
return &nodes[n];
}
}
HUFFMAN_ERROR("No more free node slots!");
return &nodes[MaxNodes - 1];
}
void Encoder::recursiveAssignCodes(Node * node, const Node * parent, const int bit)
{
// Inherit the parent code if not the root:
if (parent != nullptr)
{
node->code.appendCode(parent->code);
}
// Add the bit for this node:
node->code.appendBit(bit);
// And continue with it's children:
if (node->leftChild != Nil)
{
// Bit zero to the left.
recursiveAssignCodes(&nodes[node->leftChild], node, 0);
}
if (node->rightChild != Nil)
{
// Bit one to the right.
recursiveAssignCodes(&nodes[node->rightChild], node, 1);
}
}
void Encoder::countFrequencies(const std::uint8_t * data, int dataSizeBytes)
{
for (; dataSizeBytes > 0; --dataSizeBytes, ++data)
{
// We'll use the value of each byte as the symbol index, since our table has 256+ entries.
const int nodeIndex = *data;
// First occurrence?
if (!nodes[nodeIndex].isValid())
{
nodes[nodeIndex].frequency = 1;
nodes[nodeIndex].value = *data;
}
else // Recurrent symbol:
{
nodes[nodeIndex].frequency++;
}
}
}
void Encoder::writeDataBitStream(const std::uint8_t * data, int dataSizeBytes)
{
for (; dataSizeBytes > 0; --dataSizeBytes, ++data)
{
// We can index the nodes directly from each byte of data
// since the first 256 slots are reserved for the symbols,
// so Node::value is the same as its index in th array for
// the first 256 leaf nodes.
const int nodeIndex = *data;
bitStream.appendCode(nodes[nodeIndex].code);
}
}
void Encoder::writeTreeBitStream()
{
assert(treeRoot != nullptr);
//
// The length used for the codes will be the shortest one possible.
// We only write the leaf nodes, that's enough to reconstruct
// the data later, so we will output 256 codes (MaxSymbols). The
// index of each code is the implicit value/symbol of that node.
//
// The stream will be prefixed with two 16-bits words, the
// number of codes (always 256/MaxSymbols for now) and the
// width in bits of the code_length field that prefixes
// every code. 256 code instances follow in the format:
//
// +-------------+---------------+
// | code_length | code_bits ... |
// +-------------+---------------+
// ^-- Fixed ^-- Varying width
// width (code_length bits)
//
// Find the longest code so we know the max number
// of bits we will need to represent that value.
int maxCodeLengthInBits = 0;
for (int s = 0; s < MaxSymbols; ++s)
{
if (nodes[s].isValid() && nodes[s].code.getLength() > maxCodeLengthInBits)
{
maxCodeLengthInBits = nodes[s].code.getLength();
}
}
// Code length is currently limited to uint64!
if (maxCodeLengthInBits <= 0 || maxCodeLengthInBits > Code::MaxBits)
{
HUFFMAN_ERROR("Unexpected code length! Should be <= Code::MaxBits.");
return;
}
// Write the counts:
const int numberOfCodes = MaxSymbols;
const int codeLengthWidth = bitsForInteger(maxCodeLengthInBits);
bitStream.appendBitsU64(numberOfCodes, 16);
bitStream.appendBitsU64(codeLengthWidth, 16);
treePrefixBits = 32; // 16 bits each.
// Output the 256 nodes, in order:
for (int s = 0; s < MaxSymbols; ++s)
{
// Write the length of the code using a fixed bit-width:
const int codeLen = nodes[s].code.getLength();
bitStream.appendBitsU64(codeLen, codeLengthWidth);
// Write the code bits themselves, using a varying bit-width:
const std::uint64_t codeNum = nodes[s].code.getAsU64();
bitStream.appendBitsU64(codeNum, codeLen);
// Keep track of the number of bits written so far for later padding.
treePrefixBits += (codeLengthWidth + codeLen);
}
// Pad to a full byte if needed:
while ((treePrefixBits % 8) != 0)
{
bitStream.appendBit(0);
++treePrefixBits;
}
}
const Node * Encoder::recursiveFindLeaf(const Node * node, const Code code) const
{
const Node * result = nullptr;
if (node->leftChild != Nil)
{
result = recursiveFindLeaf(&nodes[node->leftChild], code);
if (result != nullptr)
{
return result;
}
}
if (node->rightChild != Nil)
{
result = recursiveFindLeaf(&nodes[node->rightChild], code);
if (result != nullptr)
{
return result;
}
}
if (node->isLeaf() && node->code == code)
{
result = node;
}
return result;
}
const Node * Encoder::findNodeForCode(const Code code) const
{
return recursiveFindLeaf(treeRoot, code);
}
const BitStreamWriter & Encoder::getBitStreamWriter() const
{
return bitStream;