Skip to content

Commit

Permalink
Experimental ROLZ dicts (1 byte context)
Browse files Browse the repository at this point in the history
```
BEFORE:

8912 files, 51253563 -> 15870944 (30.97%) - 3970.22 bytes saved/file

ROLZ after LIT:

8912 files, 51253563 -> 15866609 (30.96%) - 3970.71 bytes saved/file

ROLZ after LIT+COPY:

8912 files, 51253563 -> 15861266 (30.95%) - 3971.31 bytes saved/file

```
  • Loading branch information
klauspost committed Jun 24, 2024
1 parent 8411e1d commit edacb43
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 4 deletions.
36 changes: 35 additions & 1 deletion s2/dict.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ package s2
import (
"bytes"
"encoding/binary"
"fmt"
"sync"
)

Expand Down Expand Up @@ -34,6 +35,9 @@ type Dict struct {

bestTableShort *[1 << 16]uint32
bestTableLong *[1 << 19]uint32

rolzTab [256][8]uint16
rolzVals [256][8]uint32
}

// NewDict will read a dictionary.
Expand All @@ -60,9 +64,38 @@ func NewDict(dict []byte) *Dict {
if d.repeat > len(dict) {
return nil
}
d.initROLZ()
return &d
}

// initROLZ populates the ROLZ lookup tables from d.dict.
//
// For every byte value c, up to 8 dictionary positions whose first byte is c
// are recorded in order of appearance:
//   - rolzVals[c][n] holds the 4 bytes at that position, read little-endian.
//   - rolzTab[c][n] holds the position plus one.
//
// A position is skipped when its first 3 bytes equal the first 3 bytes of an
// entry already stored for c. Unused slots are left at zero.
//
// Positions in the last 4 bytes of the dictionary are never indexed, and a
// dictionary too short to hold any indexable position yields empty tables.
func (d *Dict) initROLZ() {
	// Only the low 3 bytes of a value take part in duplicate detection.
	const matchMask = (1 << (3 * 8)) - 1
	// Set to true to print the fill level of every context byte.
	const debugROLZ = false

	// Start from empty tables so that calling this again after d.dict
	// changed cannot leave stale entries behind.
	d.rolzTab = [256][8]uint16{}
	d.rolzVals = [256][8]uint32{}

	// limit is exclusive; when it is <= 0 the loop does not run at all,
	// which also protects against slicing a too-short dictionary.
	limit := len(d.dict) - 4
	var filled [256]int
	for i := 0; i < limit; i++ {
		c := d.dict[i]
		n := filled[c]
		if n == len(d.rolzVals[c]) {
			// All slots for this context are taken.
			continue
		}
		val := binary.LittleEndian.Uint32(d.dict[i:])

		// Don't fill the same 3 bytes several times.
		dup := false
		for _, prev := range d.rolzVals[c][:n] {
			if prev&matchMask == val&matchMask {
				dup = true
				break
			}
		}
		if dup {
			continue
		}

		d.rolzVals[c][n] = val
		// NOTE(review): assumes i+1 fits in 16 bits, i.e. the dictionary
		// size is capped elsewhere - confirm against the size checks.
		d.rolzTab[c][n] = uint16(i + 1)
		filled[c] = n + 1
	}

	if debugROLZ {
		for c, n := range filled {
			if n > 0 {
				fmt.Println(c, "filled", n, "at index", d.rolzTab[c][n-1], "of", len(d.dict))
			}
		}
	}
}

// Bytes will return a serialized version of the dictionary.
// The output can be sent to NewDict.
func (d *Dict) Bytes() []byte {
Expand Down Expand Up @@ -102,6 +135,7 @@ func MakeDict(data []byte, searchStart []byte) *Dict {
break
}
}
d.initROLZ()

return &d
}
Expand All @@ -120,7 +154,7 @@ func MakeDictManual(data []byte, firstIdx uint16) *Dict {
if cap(d.dict) < len(d.dict)+16 {
d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
}

d.initROLZ()
d.repeat = int(firstIdx)
return &d
}
Expand Down
9 changes: 6 additions & 3 deletions s2/dict_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -311,9 +311,9 @@ func TestDictBest2(t *testing.T) {
func TestDictSize(t *testing.T) {
//f, err := os.Open("testdata/xlmeta.tar.s2")
//f, err := os.Open("testdata/broken.tar.s2")
f, err := os.Open("testdata/github_users_sample_set.tar.s2")
//f, err := os.Open("testdata/github_users_sample_set.tar.s2")
//f, err := os.Open("testdata/gofiles2.tar.s2")
//f, err := os.Open("testdata/gosrc.tar.s2")
f, err := os.Open("testdata/gosrc.tar.s2")
if err != nil {
t.Skip(err)
}
Expand Down Expand Up @@ -389,6 +389,9 @@ func TestDictSize(t *testing.T) {
}
totalOut += res
encoded = encoded[:res]
if true {
return
}
//t.Log("encoded", len(data), "->", res, "saved", len(data)-res, "bytes")
decoded := make([]byte, len(data))
res = s2DecodeDict(decoded, encoded, d)
Expand All @@ -402,7 +405,7 @@ func TestDictSize(t *testing.T) {
}
})
}
t.Logf("%d files, %d -> %d (%.2f%%) - %.02f bytes saved/file\n", totalCount, totalIn, totalOut, float64(totalOut*100)/float64(totalIn), float64(totalIn-totalOut)/float64(totalCount))
fmt.Printf("%d files, %d -> %d (%.2f%%) - %.02f bytes saved/file\n", totalCount, totalIn, totalOut, float64(totalOut*100)/float64(totalIn), float64(totalIn-totalOut)/float64(totalCount))
}

func FuzzDictBlocks(f *testing.F) {
Expand Down
81 changes: 81 additions & 0 deletions s2/encode_best.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {
s int
length int
score int
rolzIdx uint8
rep, dict bool
rolz bool
}
var best match
for {
Expand Down Expand Up @@ -111,10 +113,15 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {
score := func(m match) int {
// Matches that are longer forward are penalized since we must emit it as a literal.
score := m.length - m.s
if m.rolz {
// One byte to emit.
return score - 1 - emitRepeatSize(65536, m.length-10)
}
if nextEmit == m.s {
// If we do not have to emit literals, we save 1 byte
score++
}

offset := m.s - m.offset
if m.rep {
return score - emitRepeatSize(offset, m.length)
Expand Down Expand Up @@ -215,6 +222,54 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {
}
return m
}
// matchDictROLZ looks for a dictionary match through the ROLZ tables.
//
// s is the position in src of the context byte and first holds 4 bytes of
// input; the low byte of first selects which of the 256 table rows is searched.
// On a hit, the returned match starts at s+1 (the byte after the context byte),
// has dict and rolz set, and carries the table slot in rolzIdx.
// If nothing usable is found the returned match has a zero length.
matchDictROLZ := func(s int, first uint32) match {
// Debug switch: change to true to disable ROLZ matching entirely.
if false {
return match{s: s}
}
// The row of up to 8 stored 4-byte values for this context byte.
tab := &dict.rolzVals[first&255]
idx := -1
for i, v := range tab {
if v == first {
// A zero value is indistinguishable from an unused slot, so treat it as a miss.
if v == 0 {
return match{s: s}
}
idx = i
break
}
}
if idx < 0 {
return match{s: s}
}
// rolzTab stores the dictionary position plus one, i.e. the index of the
// byte following the context byte inside the dictionary.
offset := int(dict.rolzTab[first&255][idx])
// m.length is temporarily used as a read index into dict.dict: offset+3 is
// the first dictionary byte after the 4 bytes already known to be equal.
m := match{rolzIdx: uint8(idx), s: s + 1, length: offset + 3, dict: true, rolz: true, offset: -len(dict.dict) + offset}
// Skip the 4 input bytes that matched the table value.
s += 4
// Extend the match forward. NOTE(review): sLimitDict is defined outside this
// view; presumably it bounds how far into src dictionary matches may reach.
for s < sLimitDict && m.length < len(dict.dict) {
// Fewer than 8 bytes left on either side: compare one byte at a time.
if len(src)-s < 8 || len(dict.dict)-m.length < 8 {
if src[s] == dict.dict[m.length] {
m.length++
s++
continue
}
break
}
// Compare 8 bytes at once; on mismatch count the equal leading bytes.
if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 {
m.length += bits.TrailingZeros64(diff) >> 3
break
}
s += 8
m.length += 8
}
// Turn the dictionary read index back into the number of matched bytes.
m.length -= offset
m.score = score(m)
if false {
fmt.Println("ROLZ", m.length, "SCORE", m.score+m.s)
}
if m.score < -m.s {
// Eliminate if no savings, we might find a better one.
m.length = 0
}
return m
}

bestOf := func(a, b match) match {
if b.length == 0 {
Expand Down Expand Up @@ -243,6 +298,11 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {
best = bestOf(best, matchDict(int(candidateL>>16), s, uint32(cv), false))
best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false))
best = bestOf(best, matchDict(int(candidateS>>16), s, uint32(cv), false))
//if nextEmit < s {
if s >= 1 {
best = bestOf(best, matchDictROLZ(s-1, load32(src, s-1)))
}
best = bestOf(best, matchDictROLZ(s, uint32(cv)))
}
{
if (dict == nil || repeat <= s) && repeat > 0 {
Expand Down Expand Up @@ -274,6 +334,7 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {

best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false))
best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false))
best = bestOf(best, matchDictROLZ(s, uint32(cv)))
}

// s+2
Expand Down Expand Up @@ -306,6 +367,7 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {

best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false))
best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false))
best = bestOf(best, matchDictROLZ(s, uint32(cv)))
}
}
// Search for a match at best match end, see if that is better.
Expand Down Expand Up @@ -405,6 +467,13 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {
}
d += emitCopy(dst[d:], offset, best.length)
}
} else if best.rolz {
if false {
fmt.Println("ROLZ, length", best.length, "idx", best.rolzIdx, "offset:", offset, "s-after:", s)
}

d += emitROLZ(dst[d:], best.rolzIdx, best.length)
offset = best.offset
} else {
if debug {
fmt.Println("COPY, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best)
Expand Down Expand Up @@ -447,6 +516,15 @@ emitRemainder:
return d
}

// emitROLZ writes a ROLZ operation to d and returns the number of bytes written.
//
// The first byte holds idx in its top 3 bits and a length code in bits 2-4.
// Lengths up to 10 are stored as length-3 and take a single byte.
// Longer lengths store the code 7 and hand the remaining length-10 bytes to
// emitRepeat, whose output follows directly after the first byte.
func emitROLZ(d []byte, idx uint8, length int) int {
	slot := idx << 5
	if length <= 10 {
		d[0] = slot | uint8(length-3)<<2
		return 1
	}
	// Code 7 with the remainder emitted as a repeat.
	d[0] = slot | 0x7<<2
	return 1 + emitRepeat(d[1:], -1, length-10)
}

// encodeBlockBestSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
Expand Down Expand Up @@ -774,6 +852,9 @@ func emitCopyNoRepeatSize(offset, length int) int {
// Length must be at least 4 and < 1<<24
func emitRepeatSize(offset, length int) int {
// Repeat offset, make length cheaper
if length <= 0 {
return 0
}
if length <= 4+4 || (length < 8+4 && offset < 2048) {
return 2
}
Expand Down

0 comments on commit edacb43

Please sign in to comment.