Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experimental ROLZ dicts (1 byte context) #973

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion s2/dict.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ package s2
import (
"bytes"
"encoding/binary"
"fmt"
"sync"
)

Expand Down Expand Up @@ -34,6 +35,9 @@ type Dict struct {

bestTableShort *[1 << 16]uint32
bestTableLong *[1 << 19]uint32

rolzTab [256][8]uint16
rolzVals [256][8]uint32
}

// NewDict will read a dictionary.
Expand All @@ -60,9 +64,38 @@ func NewDict(dict []byte) *Dict {
if d.repeat > len(dict) {
return nil
}
d.initROLZ()
return &d
}

// initROLZ builds the ROLZ lookup tables from d.dict.
//
// For every context byte c it records up to 8 positions i with d.dict[i] == c:
// rolzVals[c][k] holds the 4 little-endian bytes starting at i (context byte
// included) and rolzTab[c][k] holds i+1, the dictionary index of the byte
// following the context byte. Positions are visited in increasing order, and a
// position is skipped when its first 3 bytes equal those of an entry already
// stored for c, so slots are not spent on duplicates.
//
// Only positions in d.dict[:len(d.dict)-4] are considered. Dictionaries of
// 4 bytes or fewer contain no such position and leave the tables untouched.
func (d *Dict) initROLZ() {
	// Mask selecting the context byte and the two bytes following it.
	const matchMask = (1 << (3 * 8)) - 1
	// Flip to true to print per-context table statistics while experimenting.
	const debugROLZ = false

	if len(d.dict) <= 4 {
		return
	}

	// One pass over the dictionary; filled[c] counts the slots used for context c.
	var filled [256]int
nextEntry:
	for i := range d.dict[:len(d.dict)-4] {
		c := d.dict[i]
		n := filled[c]
		if n == len(d.rolzVals[c]) {
			// All slots for this context byte are taken.
			continue
		}
		v := binary.LittleEndian.Uint32(d.dict[i:])
		// Don't fill the same 3 bytes several times.
		// The comparison must be against the bytes at i, not the index itself.
		for _, seen := range d.rolzVals[c][:n] {
			if seen&matchMask == v&matchMask {
				continue nextEntry
			}
		}
		d.rolzVals[c][n] = v
		d.rolzTab[c][n] = uint16(i + 1)
		filled[c] = n + 1
	}

	if debugROLZ {
		for c, n := range filled {
			if n > 0 {
				fmt.Println(c, "filled", n, "at index", d.rolzTab[c][n-1], "of", len(d.dict))
			}
		}
	}
}

// Bytes will return a serialized version of the dictionary.
// The output can be sent to NewDict.
func (d *Dict) Bytes() []byte {
Expand Down Expand Up @@ -102,6 +135,7 @@ func MakeDict(data []byte, searchStart []byte) *Dict {
break
}
}
d.initROLZ()

return &d
}
Expand All @@ -120,7 +154,7 @@ func MakeDictManual(data []byte, firstIdx uint16) *Dict {
if cap(d.dict) < len(d.dict)+16 {
d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
}

d.initROLZ()
d.repeat = int(firstIdx)
return &d
}
Expand Down
9 changes: 6 additions & 3 deletions s2/dict_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -311,9 +311,9 @@ func TestDictBest2(t *testing.T) {
func TestDictSize(t *testing.T) {
//f, err := os.Open("testdata/xlmeta.tar.s2")
//f, err := os.Open("testdata/broken.tar.s2")
f, err := os.Open("testdata/github_users_sample_set.tar.s2")
//f, err := os.Open("testdata/github_users_sample_set.tar.s2")
//f, err := os.Open("testdata/gofiles2.tar.s2")
//f, err := os.Open("testdata/gosrc.tar.s2")
f, err := os.Open("testdata/gosrc.tar.s2")
if err != nil {
t.Skip(err)
}
Expand Down Expand Up @@ -389,6 +389,9 @@ func TestDictSize(t *testing.T) {
}
totalOut += res
encoded = encoded[:res]
if true {
return
}
//t.Log("encoded", len(data), "->", res, "saved", len(data)-res, "bytes")
decoded := make([]byte, len(data))
res = s2DecodeDict(decoded, encoded, d)
Expand All @@ -402,7 +405,7 @@ func TestDictSize(t *testing.T) {
}
})
}
t.Logf("%d files, %d -> %d (%.2f%%) - %.02f bytes saved/file\n", totalCount, totalIn, totalOut, float64(totalOut*100)/float64(totalIn), float64(totalIn-totalOut)/float64(totalCount))
fmt.Printf("%d files, %d -> %d (%.2f%%) - %.02f bytes saved/file\n", totalCount, totalIn, totalOut, float64(totalOut*100)/float64(totalIn), float64(totalIn-totalOut)/float64(totalCount))
}

func FuzzDictBlocks(f *testing.F) {
Expand Down
81 changes: 81 additions & 0 deletions s2/encode_best.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {
s int
length int
score int
rolzIdx uint8
rep, dict bool
rolz bool
}
var best match
for {
Expand Down Expand Up @@ -111,10 +113,15 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {
score := func(m match) int {
// Matches that are longer forward are penalized since we must emit it as a literal.
score := m.length - m.s
if m.rolz {
// One byte to emit.
return score - 1 - emitRepeatSize(65536, m.length-10)
}
if nextEmit == m.s {
// If we do not have to emit literals, we save 1 byte
score++
}

offset := m.s - m.offset
if m.rep {
return score - emitRepeatSize(offset, m.length)
Expand Down Expand Up @@ -215,6 +222,54 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {
}
return m
}
// matchDictROLZ looks for a ROLZ dictionary match for the 4 bytes in first.
// The visible callers pass the 4 bytes of src at position s.
//
// The low byte of first is the context byte and selects a table in
// dict.rolzVals; a hit requires one table entry to equal first exactly.
// On a hit the returned match starts at s+1 (the byte after the context byte),
// has dict and rolz set, rolzIdx set to the table slot, and length counting
// the matched bytes after the context byte (at least 3).
// On a miss the returned match has length 0; a hit whose score shows no
// savings is also returned with length 0.
matchDictROLZ := func(s int, first uint32) match {
// Debug switch: change to true to disable ROLZ matching entirely.
if false {
return match{s: s}
}
// Candidate values recorded for this context byte.
tab := &dict.rolzVals[first&255]
idx := -1
for i, v := range tab {
if v == first {
// Zero is also the value of a never-written table slot, so a zero
// hit cannot be trusted and is treated as a miss.
if v == 0 {
return match{s: s}
}
idx = i
break
}
}
if idx < 0 {
return match{s: s}
}
// rolzTab stores the dictionary index of the byte following the context byte.
offset := int(dict.rolzTab[first&255][idx])
// m.length temporarily holds an absolute dictionary position: offset+3 is
// the first dictionary byte after the 4 bytes already known to match.
// It is converted to a real length after the extension loop.
// NOTE(review): m.offset is the dictionary position minus len(dict.dict);
// presumably the same convention matchDict uses — confirm.
m := match{rolzIdx: uint8(idx), s: s + 1, length: offset + 3, dict: true, rolz: true, offset: -len(dict.dict) + offset}
// Skip the 4 source bytes already matched via the table.
s += 4
// Extend the match while source and dictionary bytes agree.
for s < sLimitDict && m.length < len(dict.dict) {
// Fewer than 8 bytes left on either side: compare one byte at a time
// instead of using the 8-byte loads below.
if len(src)-s < 8 || len(dict.dict)-m.length < 8 {
if src[s] == dict.dict[m.length] {
m.length++
s++
continue
}
break
}
if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 {
// Trailing zero bits / 8 = number of equal bytes before the first
// difference (assumes load64 is little-endian — TODO confirm).
m.length += bits.TrailingZeros64(diff) >> 3
break
}
s += 8
m.length += 8
}
// Convert the absolute dictionary position back into a match length.
m.length -= offset
m.score = score(m)
if false {
fmt.Println("ROLZ", m.length, "SCORE", m.score+m.s)
}
// score(m) starts from m.length - m.s, so m.score + m.s < 0 means the
// encoding would cost more than the bytes it covers.
if m.score < -m.s {
// Eliminate if no savings, we might find a better one.
m.length = 0
}
return m
}

bestOf := func(a, b match) match {
if b.length == 0 {
Expand Down Expand Up @@ -243,6 +298,11 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {
best = bestOf(best, matchDict(int(candidateL>>16), s, uint32(cv), false))
best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false))
best = bestOf(best, matchDict(int(candidateS>>16), s, uint32(cv), false))
//if nextEmit < s {
if s >= 1 {
best = bestOf(best, matchDictROLZ(s-1, load32(src, s-1)))
}
best = bestOf(best, matchDictROLZ(s, uint32(cv)))
}
{
if (dict == nil || repeat <= s) && repeat > 0 {
Expand Down Expand Up @@ -274,6 +334,7 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {

best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false))
best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false))
best = bestOf(best, matchDictROLZ(s, uint32(cv)))
}

// s+2
Expand Down Expand Up @@ -306,6 +367,7 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {

best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false))
best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false))
best = bestOf(best, matchDictROLZ(s, uint32(cv)))
}
}
// Search for a match at best match end, see if that is better.
Expand Down Expand Up @@ -405,6 +467,13 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {
}
d += emitCopy(dst[d:], offset, best.length)
}
} else if best.rolz {
if false {
fmt.Println("ROLZ, length", best.length, "idx", best.rolzIdx, "offset:", offset, "s-after:", s)
}

d += emitROLZ(dst[d:], best.rolzIdx, best.length)
offset = best.offset
} else {
if debug {
fmt.Println("COPY, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best)
Expand Down Expand Up @@ -447,6 +516,15 @@ emitRemainder:
return d
}

// emitROLZ writes a ROLZ match tag to d and returns the number of bytes written.
//
// The tag byte carries idx in its top 3 bits and a length code in bits 2-4.
// Lengths up to 10 are stored directly as length-3 and take a single byte.
// Longer matches store length code 7 and append the remaining length-10
// through emitRepeat (called with offset -1).
//
// NOTE(review): length == 10 also produces length code 7, the same bits as
// the long-form marker — confirm the intended decoder can tell them apart.
func emitROLZ(d []byte, idx uint8, length int) int {
	const (
		idxShift    = 5
		lengthShift = 2
		longMarker  = 0x7
		maxInline   = 10
	)

	tag := idx << idxShift
	if length <= maxInline {
		// Short form: everything fits in the tag byte.
		d[0] = tag | uint8(length-3)<<lengthShift
		return 1
	}

	// Long form: marker in the tag, remainder emitted after it.
	d[0] = tag | longMarker<<lengthShift
	return 1 + emitRepeat(d[1:], -1, length-maxInline)
}

// encodeBlockBestSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
Expand Down Expand Up @@ -774,6 +852,9 @@ func emitCopyNoRepeatSize(offset, length int) int {
// Length must be at least 4 and < 1<<24
func emitRepeatSize(offset, length int) int {
// Repeat offset, make length cheaper
if length <= 0 {
return 0
}
if length <= 4+4 || (length < 8+4 && offset < 2048) {
return 2
}
Expand Down
Loading