From 6a59dbb3768e170b626153d7bde7cce20be8da3e Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Fri, 10 Nov 2023 02:20:07 -0800
Subject: [PATCH] zstd: Improve better/best compression (#877)

* zstd: Improve better/best compression

* Improve filling of hash table.
* Skip similar checks.
* Improve bit estimate precision.

0.02 -> 0.3% improvement observed (input -> output size difference - relative is bigger)
---
 zstd/enc_best.go   | 44 +++++++++++++++++++++++++++-----------------
 zstd/enc_better.go | 17 ++++++++---------
 2 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/zstd/enc_best.go b/zstd/enc_best.go
index 858f8f43a5..c81a15357a 100644
--- a/zstd/enc_best.go
+++ b/zstd/enc_best.go
@@ -43,7 +43,7 @@ func (m *match) estBits(bitsPerByte int32) {
 	if m.rep < 0 {
 		ofc = ofCode(uint32(m.s-m.offset) + 3)
 	} else {
-		ofc = ofCode(uint32(m.rep))
+		ofc = ofCode(uint32(m.rep) & 3)
 	}
 	// Cost, excluding
 	ofTT, mlTT := fsePredefEnc[tableOffsets].ct.symbolTT[ofc], fsePredefEnc[tableMatchLengths].ct.symbolTT[mlc]
@@ -227,7 +227,7 @@ encodeLoop:
 				}
 			}
 			l := 4 + e.matchlen(s+4, offset+4, src)
-			if rep < 0 {
+			if true {
 				// Extend candidate match backwards as far as possible.
 				tMin := s - e.maxMatchOff
 				if tMin < 0 {
@@ -282,6 +282,7 @@ encodeLoop:
 		// Load next and check...
 		e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: candidateL.offset}
 		e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: candidateS.offset}
+		index0 := s + 1
 
 		// Look far ahead, unless we have a really long match already...
 		if best.length < goodEnough {
@@ -357,19 +358,16 @@ encodeLoop:
 			blk.sequences = append(blk.sequences, seq)
 
 			// Index old s + 1 -> s - 1
-			index0 := s + 1
 			s = best.s + best.length
-
 			nextEmit = s
-			if s >= sLimit {
-				if debugEncoder {
-					println("repeat ended", s, best.length)
-				}
-				break encodeLoop
-			}
+
 			// Index skipped...
+			end := s
+			if s > sLimit+4 {
+				end = sLimit + 4
+			}
 			off := index0 + e.cur
-			for index0 < s {
+			for index0 < end {
 				cv0 := load6432(src, index0)
 				h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
 				h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
@@ -378,6 +376,7 @@ encodeLoop:
 				off++
 				index0++
 			}
+
 			switch best.rep {
 			case 2, 4 | 1:
 				offset1, offset2 = offset2, offset1
@@ -386,12 +385,17 @@ encodeLoop:
 			case 4 | 3:
 				offset1, offset2, offset3 = offset1-1, offset1, offset2
 			}
+			if s >= sLimit {
+				if debugEncoder {
+					println("repeat ended", s, best.length)
+				}
+				break encodeLoop
+			}
 			continue
 		}
 
 		// A 4-byte match has been found. Update recent offsets.
 		// We'll later see if more than 4 bytes.
-		index0 := s + 1
 		s = best.s
 		t := best.offset
 		offset1, offset2, offset3 = s-t, offset1, offset2
@@ -419,19 +423,25 @@ encodeLoop:
 		}
 		blk.sequences = append(blk.sequences, seq)
 		nextEmit = s
-		if s >= sLimit {
-			break encodeLoop
+
+		// Index old s + 1 -> s - 1 or sLimit
+		end := s
+		if s > sLimit-4 {
+			end = sLimit - 4
 		}
 
-		// Index old s + 1 -> s - 1
-		for index0 < s {
+		off := index0 + e.cur
+		for index0 < end {
 			cv0 := load6432(src, index0)
 			h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
 			h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
-			off := index0 + e.cur
 			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 			e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
 			index0++
+			off++
+		}
+		if s >= sLimit {
+			break encodeLoop
 		}
 	}
 
diff --git a/zstd/enc_better.go b/zstd/enc_better.go
index 8582f31a7c..20d25b0e05 100644
--- a/zstd/enc_better.go
+++ b/zstd/enc_better.go
@@ -145,7 +145,7 @@ encodeLoop:
 		var t int32
 		// We allow the encoder to optionally turn off repeat offsets across blocks
 		canRepeat := len(blk.sequences) > 2
-		var matched int32
+		var matched, index0 int32
 
 		for {
 			if debugAsserts && canRepeat && offset1 == 0 {
@@ -162,6 +162,7 @@ encodeLoop:
 			off := s + e.cur
 			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
 			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
+			index0 = s + 1
 
 			if canRepeat {
 				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
@@ -258,7 +259,6 @@ encodeLoop:
 					}
 					blk.sequences = append(blk.sequences, seq)
 
-					index0 := s + repOff2
 					s += lenght + repOff2
 					nextEmit = s
 					if s >= sLimit {
@@ -498,15 +498,15 @@ encodeLoop:
 		}
 
 		// Index match start+1 (long) -> s - 1
-		index0 := s - l + 1
+		off := index0 + e.cur
 		for index0 < s-1 {
 			cv0 := load6432(src, index0)
 			cv1 := cv0 >> 8
 			h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
-			off := index0 + e.cur
 			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 			e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
 			index0 += 2
+			off += 2
 		}
 
 		cv = load6432(src, s)
@@ -672,7 +672,7 @@ encodeLoop:
 		var t int32
 		// We allow the encoder to optionally turn off repeat offsets across blocks
 		canRepeat := len(blk.sequences) > 2
-		var matched int32
+		var matched, index0 int32
 
 		for {
 			if debugAsserts && canRepeat && offset1 == 0 {
@@ -691,6 +691,7 @@ encodeLoop:
 			e.markLongShardDirty(nextHashL)
 			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
 			e.markShortShardDirty(nextHashS)
+			index0 = s + 1
 
 			if canRepeat {
 				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
@@ -726,7 +727,6 @@ encodeLoop:
 					blk.sequences = append(blk.sequences, seq)
 
 					// Index match start+1 (long) -> s - 1
-					index0 := s + repOff
 					s += lenght + repOff
 
 					nextEmit = s
@@ -790,7 +790,6 @@ encodeLoop:
 					}
 					blk.sequences = append(blk.sequences, seq)
 
-					index0 := s + repOff2
 					s += lenght + repOff2
 					nextEmit = s
 					if s >= sLimit {
@@ -1024,18 +1023,18 @@ encodeLoop:
 		}
 
 		// Index match start+1 (long) -> s - 1
-		index0 := s - l + 1
+		off := index0 + e.cur
 		for index0 < s-1 {
 			cv0 := load6432(src, index0)
 			cv1 := cv0 >> 8
 			h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
-			off := index0 + e.cur
 			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 			e.markLongShardDirty(h0)
 			h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
 			e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
 			e.markShortShardDirty(h1)
 			index0 += 2
+			off += 2
 		}
 
 		cv = load6432(src, s)