Skip to content

Commit

Permalink
zstd: Improve better/best compression
Browse files Browse the repository at this point in the history
* Improve filling of hash table.
* Skip similar checks.
* Improve bit estimate precision.

0.02% -> 0.3% improvement observed in compressed output size relative to input (the relative gain is larger than the absolute difference suggests)
  • Loading branch information
klauspost committed Oct 23, 2023
1 parent 77a82e9 commit 5ab4ab2
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 26 deletions.
53 changes: 36 additions & 17 deletions zstd/enc_best.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ func (m *match) estBits(bitsPerByte int32) {
if m.rep < 0 {
ofc = ofCode(uint32(m.s-m.offset) + 3)
} else {
ofc = ofCode(uint32(m.rep))
ofc = ofCode(uint32(m.rep) & 3)
}
// Cost, excluding
ofTT, mlTT := fsePredefEnc[tableOffsets].ct.symbolTT[ofc], fsePredefEnc[tableMatchLengths].ct.symbolTT[mlc]
Expand Down Expand Up @@ -201,6 +201,15 @@ encodeLoop:
if delta >= e.maxMatchOff || delta <= 0 || load3232(src, offset) != first {
return
}
if m.length > 0 && delta == m.s-m.offset {
// If we have already checked offset,
// skip, but update with rep if it matches.
if m.rep < rep {
m.rep = rep
m.estBits(bitsPerByte)
}
return
}
if debugAsserts {
if offset >= s {
panic(fmt.Sprintf("offset: %d - s:%d - rep: %d - cur :%d - max: %d", offset, s, rep, e.cur, e.maxMatchOff))
Expand All @@ -227,7 +236,7 @@ encodeLoop:
}
}
l := 4 + e.matchlen(s+4, offset+4, src)
if rep < 0 {
if true {
// Extend candidate match backwards as far as possible.
tMin := s - e.maxMatchOff
if tMin < 0 {
Expand Down Expand Up @@ -282,6 +291,7 @@ encodeLoop:
// Load next and check...
e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: candidateL.offset}
e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: candidateS.offset}
index0 := s + 1

// Look far ahead, unless we have a really long match already...
if best.length < goodEnough {
Expand Down Expand Up @@ -357,19 +367,16 @@ encodeLoop:
blk.sequences = append(blk.sequences, seq)

// Index old s + 1 -> s - 1
index0 := s + 1
s = best.s + best.length

nextEmit = s
if s >= sLimit {
if debugEncoder {
println("repeat ended", s, best.length)
}
break encodeLoop
}

// Index skipped...
end := s
if s > sLimit+4 {
end = sLimit + 4
}
off := index0 + e.cur
for index0 < s {
for index0 < end {
cv0 := load6432(src, index0)
h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
Expand All @@ -378,6 +385,7 @@ encodeLoop:
off++
index0++
}

switch best.rep {
case 2, 4 | 1:
offset1, offset2 = offset2, offset1
Expand All @@ -386,12 +394,17 @@ encodeLoop:
case 4 | 3:
offset1, offset2, offset3 = offset1-1, offset1, offset2
}
if s >= sLimit {
if debugEncoder {
println("repeat ended", s, best.length)
}
break encodeLoop
}
continue
}

// A 4-byte match has been found. Update recent offsets.
// We'll later see if more than 4 bytes.
index0 := s + 1
s = best.s
t := best.offset
offset1, offset2, offset3 = s-t, offset1, offset2
Expand Down Expand Up @@ -419,19 +432,25 @@ encodeLoop:
}
blk.sequences = append(blk.sequences, seq)
nextEmit = s
if s >= sLimit {
break encodeLoop

// Index old s + 1 -> s - 1 or sLimit
end := s
if s > sLimit-4 {
end = sLimit - 4
}

// Index old s + 1 -> s - 1
for index0 < s {
off := index0 + e.cur
for index0 < end {
cv0 := load6432(src, index0)
h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
off := index0 + e.cur
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
index0++
off++
}
if s >= sLimit {
break encodeLoop
}
}

Expand Down
17 changes: 8 additions & 9 deletions zstd/enc_better.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ encodeLoop:
var t int32
// We allow the encoder to optionally turn off repeat offsets across blocks
canRepeat := len(blk.sequences) > 2
var matched int32
var matched, index0 int32

for {
if debugAsserts && canRepeat && offset1 == 0 {
Expand All @@ -162,6 +162,7 @@ encodeLoop:
off := s + e.cur
e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
index0 = s + 1

if canRepeat {
if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
Expand Down Expand Up @@ -258,7 +259,6 @@ encodeLoop:
}
blk.sequences = append(blk.sequences, seq)

index0 := s + repOff2
s += lenght + repOff2
nextEmit = s
if s >= sLimit {
Expand Down Expand Up @@ -498,15 +498,15 @@ encodeLoop:
}

// Index match start+1 (long) -> s - 1
index0 := s - l + 1
off := index0 + e.cur
for index0 < s-1 {
cv0 := load6432(src, index0)
cv1 := cv0 >> 8
h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
off := index0 + e.cur
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
index0 += 2
off += 2
}

cv = load6432(src, s)
Expand Down Expand Up @@ -672,7 +672,7 @@ encodeLoop:
var t int32
// We allow the encoder to optionally turn off repeat offsets across blocks
canRepeat := len(blk.sequences) > 2
var matched int32
var matched, index0 int32

for {
if debugAsserts && canRepeat && offset1 == 0 {
Expand All @@ -691,6 +691,7 @@ encodeLoop:
e.markLongShardDirty(nextHashL)
e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
e.markShortShardDirty(nextHashS)
index0 = s + 1

if canRepeat {
if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
Expand Down Expand Up @@ -726,7 +727,6 @@ encodeLoop:
blk.sequences = append(blk.sequences, seq)

// Index match start+1 (long) -> s - 1
index0 := s + repOff
s += lenght + repOff

nextEmit = s
Expand Down Expand Up @@ -790,7 +790,6 @@ encodeLoop:
}
blk.sequences = append(blk.sequences, seq)

index0 := s + repOff2
s += lenght + repOff2
nextEmit = s
if s >= sLimit {
Expand Down Expand Up @@ -1024,18 +1023,18 @@ encodeLoop:
}

// Index match start+1 (long) -> s - 1
index0 := s - l + 1
off := index0 + e.cur
for index0 < s-1 {
cv0 := load6432(src, index0)
cv1 := cv0 >> 8
h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
off := index0 + e.cur
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
e.markLongShardDirty(h0)
h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
e.markShortShardDirty(h1)
index0 += 2
off += 2
}

cv = load6432(src, s)
Expand Down

0 comments on commit 5ab4ab2

Please sign in to comment.