From e8fce38954efd6bc58a5b0ec72dd26f41c8365e6 Mon Sep 17 00:00:00 2001 From: Ilya Tokar Date: Wed, 18 Sep 2024 17:36:37 -0400 Subject: [PATCH 01/10] Optimize compression by avoiding unpredictable branches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid unpredictable branch. Use conditional move to generate the address that is guaranteed to be safe and compare unconditionally. Instead of if (idx < limit && x[idx] == val ) // mispredicted idx < limit branch Do addr = cmov(safe,x+idx) if (*addr == val && idx < limit) // almost always false so well predicted Using microbenchmarks from https://github.com/google/fleetbench, I get about ~10% speed-up: name old cpu/op new cpu/op delta BM_ZSTD_COMPRESS_Fleet/compression_level:-7/window_log:15 1.46ns ± 3% 1.31ns ± 7% -9.88% (p=0.000 n=35+38) BM_ZSTD_COMPRESS_Fleet/compression_level:-7/window_log:16 1.41ns ± 3% 1.28ns ± 3% -9.56% (p=0.000 n=36+39) BM_ZSTD_COMPRESS_Fleet/compression_level:-5/window_log:15 1.61ns ± 1% 1.43ns ± 3% -10.70% (p=0.000 n=30+39) BM_ZSTD_COMPRESS_Fleet/compression_level:-5/window_log:16 1.54ns ± 2% 1.39ns ± 3% -9.21% (p=0.000 n=37+39) BM_ZSTD_COMPRESS_Fleet/compression_level:-3/window_log:15 1.82ns ± 2% 1.61ns ± 3% -11.31% (p=0.000 n=37+40) BM_ZSTD_COMPRESS_Fleet/compression_level:-3/window_log:16 1.73ns ± 3% 1.56ns ± 3% -9.50% (p=0.000 n=38+39) BM_ZSTD_COMPRESS_Fleet/compression_level:-1/window_log:15 2.12ns ± 2% 1.79ns ± 3% -15.55% (p=0.000 n=34+39) BM_ZSTD_COMPRESS_Fleet/compression_level:-1/window_log:16 1.99ns ± 3% 1.72ns ± 3% -13.70% (p=0.000 n=38+38) BM_ZSTD_COMPRESS_Fleet/compression_level:0/window_log:15 3.22ns ± 3% 2.94ns ± 3% -8.67% (p=0.000 n=38+40) BM_ZSTD_COMPRESS_Fleet/compression_level:0/window_log:16 3.19ns ± 4% 2.86ns ± 4% -10.55% (p=0.000 n=40+38) BM_ZSTD_COMPRESS_Fleet/compression_level:1/window_log:15 2.60ns ± 3% 2.22ns ± 3% -14.53% (p=0.000 n=40+39) BM_ZSTD_COMPRESS_Fleet/compression_level:1/window_log:16 2.46ns ± 3% 2.13ns ± 2% 
-13.67% (p=0.000 n=39+36) BM_ZSTD_COMPRESS_Fleet/compression_level:2/window_log:15 2.69ns ± 3% 2.46ns ± 3% -8.63% (p=0.000 n=37+39) BM_ZSTD_COMPRESS_Fleet/compression_level:2/window_log:16 2.63ns ± 3% 2.36ns ± 3% -10.47% (p=0.000 n=40+40) BM_ZSTD_COMPRESS_Fleet/compression_level:3/window_log:15 3.20ns ± 2% 2.95ns ± 3% -7.94% (p=0.000 n=35+40) BM_ZSTD_COMPRESS_Fleet/compression_level:3/window_log:16 3.20ns ± 4% 2.87ns ± 4% -10.33% (p=0.000 n=40+40) I've also measured the impact on internal workloads and saw similar ~10% improvement in performance, measured by cpu usage/byte of data. --- lib/compress/zstd_compress_internal.h | 17 ++++++++++++ lib/compress/zstd_double_fast.c | 37 +++++++++++++++++---------- lib/compress/zstd_fast.c | 32 ++++++++++++++--------- 3 files changed, 61 insertions(+), 25 deletions(-) diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index ba1450852e..cbcded28db 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -557,6 +557,23 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) return 1; } +/* ZSTD_selectAddr: + * @return a >= b ? trueAddr : falseAddr, + * tries to force branchless codegen. */ +MEM_STATIC const BYTE* ZSTD_selectAddr(U32 a, U32 b, const BYTE* trueAddr, const BYTE* falseAddr) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ( + "cmp %1, %2\n" + "cmova %3, %0\n" + : "+r"(trueAddr) + : "r"(a), "r"(b), "r"(falseAddr) + ); + return trueAddr; +#else + return a >= b ? trueAddr : falseAddr; +#endif +} + /* ZSTD_noCompressBlock() : * Writes uncompressed block to dst buffer from given src. 
* Returns the size of the block */ diff --git a/lib/compress/zstd_double_fast.c b/lib/compress/zstd_double_fast.c index 1b5f64f20b..819658caf7 100644 --- a/lib/compress/zstd_double_fast.c +++ b/lib/compress/zstd_double_fast.c @@ -140,11 +140,17 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( U32 idxl1; /* the long match index for ip1 */ const BYTE* matchl0; /* the long match for ip */ + const BYTE* matchl0_safe; /* matchl0 or safe address */ const BYTE* matchs0; /* the short match for ip */ const BYTE* matchl1; /* the long match for ip1 */ + const BYTE* matchs0_safe; /* matchs0 or safe address */ const BYTE* ip = istart; /* the current position */ const BYTE* ip1; /* the next position */ + /* Array of ~random data, should have low probability of matching data + * we load from here instead of from tables, if matchl0/matchl1 are + * invalid indices. Used to avoid unpredictable branches. */ + const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); @@ -191,24 +197,29 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( hl1 = ZSTD_hashPtr(ip1, hBitsL, 8); - if (idxl0 > prefixLowestIndex) { - /* check prefix long match */ - if (MEM_read64(matchl0) == MEM_read64(ip)) { - mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; - offset = (U32)(ip-matchl0); - while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ - goto _match_found; - } + /* idxl0 > prefixLowestIndex is a (somewhat) unpredictable branch. + * However expression below complies into conditional move. Since + * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex + * if there is a match, all branches become predictable. 
*/ + matchl0_safe = ZSTD_selectAddr(prefixLowestIndex, idxl0, &dummy[0], matchl0); + + /* check prefix long match */ + if (MEM_read64(matchl0_safe) == MEM_read64(ip) && matchl0_safe == matchl0) { + mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; + offset = (U32)(ip-matchl0); + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ + goto _match_found; } idxl1 = hashLong[hl1]; matchl1 = base + idxl1; - if (idxs0 > prefixLowestIndex) { - /* check prefix short match */ - if (MEM_read32(matchs0) == MEM_read32(ip)) { - goto _search_next_long; - } + /* Same optimization as matchl0 above */ + matchs0_safe = ZSTD_selectAddr(prefixLowestIndex, idxs0, &dummy[0], matchs0); + + /* check prefix short match */ + if(MEM_read32(matchs0_safe) == MEM_read32(ip) && matchs0_safe == matchs0) { + goto _search_next_long; } if (ip1 >= nextStep) { diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index c6baa49d4d..838a18ee5c 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -162,6 +162,11 @@ size_t ZSTD_compressBlock_fast_noDict_generic( const BYTE* const prefixStart = base + prefixStartIndex; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; + /* Array of ~random data, should have low probability of matching data + * we load from here instead of from tables, if the index is invalid. + * Used to avoid unpredictable branches. */ + const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; + const BYTE *mvalAddr; const BYTE* anchor = istart; const BYTE* ip0 = istart; @@ -246,15 +251,18 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _match; } + /* idx >= prefixStartIndex is a (somewhat) unpredictable branch. + * However expression below complies into conditional move. Since + * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex + * if there is a match, all branches become predictable. 
*/ + mvalAddr = base + idx; + mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, mvalAddr, &dummy[0]); + /* load match for ip[0] */ - if (idx >= prefixStartIndex) { - mval = MEM_read32(base + idx); - } else { - mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ - } + mval = MEM_read32(mvalAddr); /* check match at ip[0] */ - if (MEM_read32(ip0) == mval) { + if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { /* found a match! */ /* First write next hash table entry; we've already calculated it. @@ -281,15 +289,15 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; + mvalAddr = base + idx; + mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, mvalAddr, &dummy[0]); + /* load match for ip[0] */ - if (idx >= prefixStartIndex) { - mval = MEM_read32(base + idx); - } else { - mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ - } + mval = MEM_read32(mvalAddr); + /* check match at ip[0] */ - if (MEM_read32(ip0) == mval) { + if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { /* found a match! */ /* first write next hash table entry; we've already calculated it */ From 1e7fa242f4aa71c3aec5e1e39ec69ad54e117051 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 7 Oct 2024 11:22:40 -0700 Subject: [PATCH 02/10] minor refactor zstd_fast make hot variables more local --- lib/compress/zstd_compress_internal.h | 4 +- lib/compress/zstd_fast.c | 71 +++++++++++++-------------- 2 files changed, 36 insertions(+), 39 deletions(-) diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index cbcded28db..53c3bdee8b 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -560,7 +560,9 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) /* ZSTD_selectAddr: * @return a >= b ? trueAddr : falseAddr, * tries to force branchless codegen. 
*/ -MEM_STATIC const BYTE* ZSTD_selectAddr(U32 a, U32 b, const BYTE* trueAddr, const BYTE* falseAddr) { +MEM_STATIC const BYTE* +ZSTD_selectAddr(U32 a, U32 b, const BYTE* trueAddr, const BYTE* falseAddr) +{ #if defined(__GNUC__) && defined(__x86_64__) __asm__ ( "cmp %1, %2\n" diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 838a18ee5c..0b6230150a 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -166,7 +166,6 @@ size_t ZSTD_compressBlock_fast_noDict_generic( * we load from here instead of from tables, if the index is invalid. * Used to avoid unpredictable branches. */ const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; - const BYTE *mvalAddr; const BYTE* anchor = istart; const BYTE* ip0 = istart; @@ -182,7 +181,6 @@ size_t ZSTD_compressBlock_fast_noDict_generic( size_t hash0; /* hash for ip0 */ size_t hash1; /* hash for ip1 */ U32 idx; /* match idx for ip0 */ - U32 mval; /* src value at match idx */ U32 offcode; const BYTE* match0; @@ -255,22 +253,21 @@ size_t ZSTD_compressBlock_fast_noDict_generic( * However expression below complies into conditional move. Since * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex * if there is a match, all branches become predictable. */ - mvalAddr = base + idx; - mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, mvalAddr, &dummy[0]); + { const BYTE* mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, base + idx, &dummy[0]); + /* load match for ip[0] */ + U32 const mval = MEM_read32(mvalAddr); - /* load match for ip[0] */ - mval = MEM_read32(mvalAddr); - - /* check match at ip[0] */ - if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { - /* found a match! */ + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { + /* found a match! */ - /* First write next hash table entry; we've already calculated it. 
- * This write is known to be safe because the ip1 == ip0 + 1, so - * we know we will resume searching after ip1 */ - hashTable[hash1] = (U32)(ip1 - base); + /* Write next hash table entry (it's already calculated). + * This write is known to be safe because the ip1 == ip0 + 1, + * so searching will resume after ip1 */ + hashTable[hash1] = (U32)(ip1 - base); - goto _offset; + goto _offset; + } } /* lookup ip[1] */ @@ -289,32 +286,30 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; - mvalAddr = base + idx; - mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, mvalAddr, &dummy[0]); - - /* load match for ip[0] */ - mval = MEM_read32(mvalAddr); + { const BYTE* mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, base + idx, &dummy[0]); + /* load match for ip[0] */ + U32 const mval = MEM_read32(mvalAddr); + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { + /* found a match! */ - /* check match at ip[0] */ - if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { - /* found a match! */ + /* first write next hash table entry; we've already calculated it */ + if (step <= 4) { + /* We need to avoid writing an index into the hash table >= the + * position at which we will pick up our searching after we've + * taken this match. + * + * The minimum possible match has length 4, so the earliest ip0 + * can be after we take this match will be the current ip0 + 4. + * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely + * write this position. + */ + hashTable[hash1] = (U32)(ip1 - base); + } - /* first write next hash table entry; we've already calculated it */ - if (step <= 4) { - /* We need to avoid writing an index into the hash table >= the - * position at which we will pick up our searching after we've - * taken this match. - * - * The minimum possible match has length 4, so the earliest ip0 - * can be after we take this match will be the current ip0 + 4. 
- * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely - * write this position. - */ - hashTable[hash1] = (U32)(ip1 - base); + goto _offset; } - - goto _offset; } /* lookup ip[1] */ @@ -554,7 +549,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); hashTable[hash0] = curr; /* update hash table */ - if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) + if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; From 2cc600bab21657ccf966ceadfeb316e2eacff25c Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 8 Oct 2024 11:10:48 -0700 Subject: [PATCH 03/10] refactor search into an inline function for easier swapping with a parameter --- lib/compress/zstd_compress_internal.h | 4 +- lib/compress/zstd_fast.c | 93 +++++++++++++-------------- 2 files changed, 48 insertions(+), 49 deletions(-) diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 53c3bdee8b..64618848da 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -561,14 +561,14 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) * @return a >= b ? trueAddr : falseAddr, * tries to force branchless codegen. 
*/ MEM_STATIC const BYTE* -ZSTD_selectAddr(U32 a, U32 b, const BYTE* trueAddr, const BYTE* falseAddr) +ZSTD_selectAddr(U32 index, U32 lowLimit, const BYTE* trueAddr, const BYTE* falseAddr) { #if defined(__GNUC__) && defined(__x86_64__) __asm__ ( "cmp %1, %2\n" "cmova %3, %0\n" : "+r"(trueAddr) - : "r"(a), "r"(b), "r"(falseAddr) + : "r"(index), "r"(lowLimit), "r"(falseAddr) ); return trueAddr; #else diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 0b6230150a..94e6300eb4 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -97,6 +97,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, } +static int +ZSTD_findMatch_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress) +{ + /* idx >= prefixStartIndex is a (somewhat) unpredictable branch. + * However expression below complies into conditional move. Since + * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex + * if there is a match, all branches become predictable. */ + const BYTE* mvalAddr = ZSTD_selectAddr(currentIdx, lowLimit, matchAddress, fakeAddress); + return ((MEM_read32(currentPtr) == MEM_read32(mvalAddr)) & (currentIdx >= lowLimit)); +} + + /** * If you squint hard enough (and ignore repcodes), the search operation at any * given position is broken into 4 stages: @@ -148,13 +160,12 @@ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_compressBlock_fast_noDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const mls, U32 const hasStep) + U32 const mls, U32 const unpredictable) { const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; U32 const hlog = cParams->hashLog; - /* support stepSize of 0 */ - size_t const stepSize = hasStep ? 
(cParams->targetLength + !(cParams->targetLength) + 1) : 2; + size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */ const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); @@ -193,6 +204,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( size_t step; const BYTE* nextStep; const size_t kStepIncr = (1 << (kSearchStrength - 1)); + (void)unpredictable; DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); ip0 += (ip0 == prefixStart); @@ -249,25 +261,15 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _match; } - /* idx >= prefixStartIndex is a (somewhat) unpredictable branch. - * However expression below complies into conditional move. Since - * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex - * if there is a match, all branches become predictable. */ - { const BYTE* mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, base + idx, &dummy[0]); - /* load match for ip[0] */ - U32 const mval = MEM_read32(mvalAddr); + if (ZSTD_findMatch_cmov(ip0, base + idx, idx, prefixStartIndex, dummy)) { + /* found a match! */ - /* check match at ip[0] */ - if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { - /* found a match! */ - - /* Write next hash table entry (it's already calculated). - * This write is known to be safe because the ip1 == ip0 + 1, - * so searching will resume after ip1 */ - hashTable[hash1] = (U32)(ip1 - base); + /* Write next hash table entry (it's already calculated). 
+ * This write is known to be safe because the ip1 == ip0 + 1, + * so searching will resume after ip1 */ + hashTable[hash1] = (U32)(ip1 - base); - goto _offset; - } + goto _offset; } /* lookup ip[1] */ @@ -286,30 +288,24 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; - { const BYTE* mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, base + idx, &dummy[0]); - /* load match for ip[0] */ - U32 const mval = MEM_read32(mvalAddr); - - /* check match at ip[0] */ - if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { - /* found a match! */ - - /* first write next hash table entry; we've already calculated it */ - if (step <= 4) { - /* We need to avoid writing an index into the hash table >= the - * position at which we will pick up our searching after we've - * taken this match. - * - * The minimum possible match has length 4, so the earliest ip0 - * can be after we take this match will be the current ip0 + 4. - * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely - * write this position. - */ - hashTable[hash1] = (U32)(ip1 - base); - } - - goto _offset; + if (ZSTD_findMatch_cmov(ip0, base + idx, idx, prefixStartIndex, dummy)) { + /* found a match! */ + + /* first write next hash table entry; it's already calculated */ + if (step <= 4) { + /* We need to avoid writing an index into the hash table >= the + * position at which we will pick up our searching after we've + * taken this match. + * + * The minimum possible match has length 4, so the earliest ip0 + * can be after we take this match will be the current ip0 + 4. + * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely + * write this position. 
+ */ + hashTable[hash1] = (U32)(ip1 - base); } + + goto _offset; } /* lookup ip[1] */ @@ -409,12 +405,12 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _start; } -#define ZSTD_GEN_FAST_FN(dictMode, mls, step) \ - static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \ +#define ZSTD_GEN_FAST_FN(dictMode, mls, cmov) \ + static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##cmov( \ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ void const* src, size_t srcSize) \ { \ - return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \ + return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, cmov); \ } ZSTD_GEN_FAST_FN(noDict, 4, 1) @@ -432,8 +428,10 @@ size_t ZSTD_compressBlock_fast( void const* src, size_t srcSize) { U32 const mls = ms->cParams.minMatch; + /* use cmov instead of branch when the branch is likely unpredictable */ + int const useCmov = 1; assert(ms->dictMatchState == NULL); - if (ms->cParams.targetLength > 1) { + if (useCmov) { switch(mls) { default: /* includes case 3 */ @@ -447,6 +445,7 @@ size_t ZSTD_compressBlock_fast( return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize); } } else { + /* use a branch instead */ switch(mls) { default: /* includes case 3 */ From 186b1324951f2505469a3415aaf8ddc3398b9fca Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 8 Oct 2024 11:43:07 -0700 Subject: [PATCH 04/10] made search strategy switchable between cmov and branch and use a simple heuristic based on wlog to select between them. 
note: performance is not good on clang (yet) --- lib/compress/zstd_compress_internal.h | 12 ++--- lib/compress/zstd_fast.c | 72 +++++++++++++++------------ 2 files changed, 47 insertions(+), 37 deletions(-) diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 64618848da..0be7ae14e7 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -558,21 +558,21 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) } /* ZSTD_selectAddr: - * @return a >= b ? trueAddr : falseAddr, + * @return index >= lowLimit ? candidate : backup, * tries to force branchless codegen. */ MEM_STATIC const BYTE* -ZSTD_selectAddr(U32 index, U32 lowLimit, const BYTE* trueAddr, const BYTE* falseAddr) +ZSTD_selectAddr(U32 index, U32 lowLimit, const BYTE* candidate, const BYTE* backup) { #if defined(__GNUC__) && defined(__x86_64__) __asm__ ( "cmp %1, %2\n" "cmova %3, %0\n" - : "+r"(trueAddr) - : "r"(index), "r"(lowLimit), "r"(falseAddr) + : "+r"(candidate) + : "r"(index), "r"(lowLimit), "r"(backup) ); - return trueAddr; + return candidate; #else - return a >= b ? trueAddr : falseAddr; + return index >= lowLimit ? 
candidate : backup; #endif } diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 94e6300eb4..3c6e751314 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -45,7 +45,7 @@ void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); - } } } } + } } } } } static @@ -97,17 +97,34 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, } +typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress); + static int -ZSTD_findMatch_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress) +ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress) { - /* idx >= prefixStartIndex is a (somewhat) unpredictable branch. - * However expression below complies into conditional move. Since - * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex - * if there is a match, all branches become predictable. */ + /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. + * However expression below compiles into conditional move. 
+ */ const BYTE* mvalAddr = ZSTD_selectAddr(currentIdx, lowLimit, matchAddress, fakeAddress); return ((MEM_read32(currentPtr) == MEM_read32(mvalAddr)) & (currentIdx >= lowLimit)); } +static int +ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress) +{ + /* using a branch instead of a cmov, + * because it's faster in scenarios where currentIdx >= lowLimit is generally true, + * aka almost all candidates are within range */ + U32 mval; (void)fakeAddress; + if (currentIdx >= lowLimit) { + mval = MEM_read32(matchAddress); + } else { + mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */ + } + + return (MEM_read32(currentPtr) == mval); +} + /** * If you squint hard enough (and ignore repcodes), the search operation at any @@ -160,7 +177,7 @@ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_compressBlock_fast_noDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const mls, U32 const unpredictable) + U32 const mls, int useCmov) { const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; @@ -204,7 +221,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( size_t step; const BYTE* nextStep; const size_t kStepIncr = (1 << (kSearchStrength - 1)); - (void)unpredictable; + const ZSTD_match4Found findMatch = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch; DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); ip0 += (ip0 == prefixStart); @@ -253,15 +270,15 @@ size_t ZSTD_compressBlock_fast_noDict_generic( offcode = REPCODE1_TO_OFFBASE; mLength += 4; - /* First write next hash table entry; we've already calculated it. - * This write is known to be safe because the ip1 is before the + /* Write next hash table entry: it's already calculated. + * This write is known to be safe because ip1 is before the * repcode (ip2). 
*/ hashTable[hash1] = (U32)(ip1 - base); goto _match; } - if (ZSTD_findMatch_cmov(ip0, base + idx, idx, prefixStartIndex, dummy)) { + if (findMatch(ip0, base + idx, idx, prefixStartIndex, dummy)) { /* found a match! */ /* Write next hash table entry (it's already calculated). @@ -288,19 +305,13 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; - if (ZSTD_findMatch_cmov(ip0, base + idx, idx, prefixStartIndex, dummy)) { + if (findMatch(ip0, base + idx, idx, prefixStartIndex, dummy)) { /* found a match! */ - /* first write next hash table entry; it's already calculated */ + /* Write next hash table entry; it's already calculated */ if (step <= 4) { - /* We need to avoid writing an index into the hash table >= the - * position at which we will pick up our searching after we've - * taken this match. - * - * The minimum possible match has length 4, so the earliest ip0 - * can be after we take this match will be the current ip0 + 4. - * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely - * write this position. + /* Avoid writing an index if it's >= position where search will resume. + * The minimum possible match has length 4, so search can resume at ip0 + 4. */ hashTable[hash1] = (U32)(ip1 - base); } @@ -331,7 +342,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( } while (ip3 < ilimit); _cleanup: - /* Note that there are probably still a couple positions we could search. + /* Note that there are probably still a couple positions one could search. * However, it seems to be a meaningful performance hit to try to search * them. So let's not. 
*/ @@ -405,12 +416,12 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _start; } -#define ZSTD_GEN_FAST_FN(dictMode, mls, cmov) \ - static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##cmov( \ +#define ZSTD_GEN_FAST_FN(dictMode, mml, cmov) \ + static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov( \ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ void const* src, size_t srcSize) \ { \ - return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, cmov); \ + return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \ } ZSTD_GEN_FAST_FN(noDict, 4, 1) @@ -427,12 +438,12 @@ size_t ZSTD_compressBlock_fast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - U32 const mls = ms->cParams.minMatch; - /* use cmov instead of branch when the branch is likely unpredictable */ - int const useCmov = 1; + U32 const mml = ms->cParams.minMatch; + /* use cmov when "candidate in range" branch is likely unpredictable */ + int const useCmov = ms->cParams.windowLog < 19; assert(ms->dictMatchState == NULL); if (useCmov) { - switch(mls) + switch(mml) { default: /* includes case 3 */ case 4 : @@ -446,7 +457,7 @@ size_t ZSTD_compressBlock_fast( } } else { /* use a branch instead */ - switch(mls) + switch(mml) { default: /* includes case 3 */ case 4 : @@ -458,7 +469,6 @@ size_t ZSTD_compressBlock_fast( case 7 : return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize); } - } } From 197c258a79a94b60ec45020defc5fb6a2f570e80 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 8 Oct 2024 15:54:48 -0700 Subject: [PATCH 05/10] introduce memory barrier to force test order suggested by @terrelln --- lib/compress/zstd_fast.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 3c6e751314..9d0cefeac1 100644 --- a/lib/compress/zstd_fast.c 
+++ b/lib/compress/zstd_fast.c @@ -106,7 +106,15 @@ ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 curr * However expression below compiles into conditional move. */ const BYTE* mvalAddr = ZSTD_selectAddr(currentIdx, lowLimit, matchAddress, fakeAddress); - return ((MEM_read32(currentPtr) == MEM_read32(mvalAddr)) & (currentIdx >= lowLimit)); + /* Note: this used to be written as : return test1 && test2; + * Unfortunately, once inlined, these tests become branches, + * in which case it becomes critical that they are executed in the right order (test1 then test2). + * So we have to write these tests in a specific manner to ensure their ordering. + */ + if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0; + /* force ordering of these tests, which matters once the function is inlined, as they become branches */ + __asm__(""); + return currentIdx >= lowLimit; } static int From 741b860fc1e8171519a72bf3d30cdd20995b00ce Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 8 Oct 2024 16:02:54 -0700 Subject: [PATCH 06/10] store dummy bytes within ZSTD_match4Found_cmov() feels more logical, better contained --- lib/compress/zstd_fast.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 9d0cefeac1..5bac5d9cfe 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -97,15 +97,20 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, } -typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress); +typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit); static int -ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress) +ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit) { + /* 
Array of ~random data, should have low probability of matching data. + * Load from here if the index is invalid. + * Used to avoid unpredictable branches. */ + static const BYTE dummy[] = {0x12,0x34,0x56,0x78 }; + /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. * However expression below compiles into conditional move. */ - const BYTE* mvalAddr = ZSTD_selectAddr(currentIdx, lowLimit, matchAddress, fakeAddress); + const BYTE* mvalAddr = ZSTD_selectAddr(currentIdx, lowLimit, matchAddress, dummy); /* Note: this used to be written as : return test1 && test2; * Unfortunately, once inlined, these tests become branches, * in which case it becomes critical that they are executed in the right order (test1 then test2). @@ -118,12 +123,12 @@ ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 curr } static int -ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress) +ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit) { /* using a branch instead of a cmov, * because it's faster in scenarios where currentIdx >= lowLimit is generally true, * aka almost all candidates are within range */ - U32 mval; (void)fakeAddress; + U32 mval; if (currentIdx >= lowLimit) { mval = MEM_read32(matchAddress); } else { @@ -198,10 +203,6 @@ size_t ZSTD_compressBlock_fast_noDict_generic( const BYTE* const prefixStart = base + prefixStartIndex; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; - /* Array of ~random data, should have low probability of matching data - * we load from here instead of from tables, if the index is invalid. - * Used to avoid unpredictable branches. 
*/ - const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; const BYTE* anchor = istart; const BYTE* ip0 = istart; @@ -286,7 +287,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _match; } - if (findMatch(ip0, base + idx, idx, prefixStartIndex, dummy)) { + if (findMatch(ip0, base + idx, idx, prefixStartIndex)) { /* found a match! */ /* Write next hash table entry (it's already calculated). @@ -313,7 +314,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; - if (findMatch(ip0, base + idx, idx, prefixStartIndex, dummy)) { + if (findMatch(ip0, base + idx, idx, prefixStartIndex)) { /* found a match! */ /* Write next hash table entry; it's already calculated */ From d45aee43f4b1ac8ee0fbb182310b9b7622f85d1c Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 8 Oct 2024 16:38:35 -0700 Subject: [PATCH 07/10] make __asm__ usage __GNUC__-specific --- lib/compress/zstd_fast.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 5bac5d9cfe..d4b8f06722 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -118,7 +118,9 @@ ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 curr */ if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0; /* force ordering of these tests, which matters once the function is inlined, as they become branches */ +#if defined(__GNUC__) __asm__(""); +#endif return currentIdx >= lowLimit; } From fa1fcb08ab447de8cdb4492d663779cad6841379 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 10 Oct 2024 16:07:20 -0700 Subject: [PATCH 08/10] minor: better variable naming --- lib/compress/zstd_fast.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index d4b8f06722..691bedc1e8 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -97,10 +97,10 @@ void 
ZSTD_fillHashTable(ZSTD_matchState_t* ms, } -typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit); +typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit); static int -ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit) +ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) { /* Array of ~random data, should have low probability of matching data. * Load from here if the index is invalid. @@ -110,7 +110,7 @@ ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 curr /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. * However expression below compiles into conditional move. */ - const BYTE* mvalAddr = ZSTD_selectAddr(currentIdx, lowLimit, matchAddress, dummy); + const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy); /* Note: this used to be written as : return test1 && test2; * Unfortunately, once inlined, these tests become branches, * in which case it becomes critical that they are executed in the right order (test1 then test2). 
@@ -121,17 +121,17 @@ ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 curr #if defined(__GNUC__) __asm__(""); #endif - return currentIdx >= lowLimit; + return matchIdx >= idxLowLimit; } static int -ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit) +ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) { /* using a branch instead of a cmov, - * because it's faster in scenarios where currentIdx >= lowLimit is generally true, + * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true, * aka almost all candidates are within range */ U32 mval; - if (currentIdx >= lowLimit) { + if (matchIdx >= idxLowLimit) { mval = MEM_read32(matchAddress); } else { mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */ @@ -219,7 +219,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( size_t hash0; /* hash for ip0 */ size_t hash1; /* hash for ip1 */ - U32 idx; /* match idx for ip0 */ + U32 matchIdx; /* match idx for ip0 */ U32 offcode; const BYTE* match0; @@ -261,7 +261,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( hash0 = ZSTD_hashPtr(ip0, hlog, mls); hash1 = ZSTD_hashPtr(ip1, hlog, mls); - idx = hashTable[hash0]; + matchIdx = hashTable[hash0]; do { /* load repcode match for ip[2]*/ @@ -289,7 +289,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _match; } - if (findMatch(ip0, base + idx, idx, prefixStartIndex)) { + if (findMatch(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { /* found a match! */ /* Write next hash table entry (it's already calculated). 
@@ -301,7 +301,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( } /* lookup ip[1] */ - idx = hashTable[hash1]; + matchIdx = hashTable[hash1]; /* hash ip[2] */ hash0 = hash1; @@ -316,7 +316,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; - if (findMatch(ip0, base + idx, idx, prefixStartIndex)) { + if (findMatch(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { /* found a match! */ /* Write next hash table entry; it's already calculated */ @@ -331,7 +331,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( } /* lookup ip[1] */ - idx = hashTable[hash1]; + matchIdx = hashTable[hash1]; /* hash ip[2] */ hash0 = hash1; @@ -382,7 +382,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( _offset: /* Requires: ip0, idx */ /* Compute the offset code. */ - match0 = base + idx; + match0 = base + matchIdx; rep_offset2 = rep_offset1; rep_offset1 = (U32)(ip0-match0); offcode = OFFSET_TO_OFFBASE(rep_offset1); From 83de00316c29a7f5245a67733f6208c699b686c2 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 11 Oct 2024 15:36:15 -0700 Subject: [PATCH 09/10] fixed parameter ordering in `dfast` noticed by @terrelln --- lib/compress/zstd_double_fast.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/compress/zstd_double_fast.c b/lib/compress/zstd_double_fast.c index 819658caf7..c04010b11d 100644 --- a/lib/compress/zstd_double_fast.c +++ b/lib/compress/zstd_double_fast.c @@ -201,7 +201,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( * However expression below complies into conditional move. Since * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex * if there is a match, all branches become predictable. 
*/ - matchl0_safe = ZSTD_selectAddr(prefixLowestIndex, idxl0, &dummy[0], matchl0); + matchl0_safe = ZSTD_selectAddr(idxl0, prefixLowestIndex, matchl0, &dummy[0]); /* check prefix long match */ if (MEM_read64(matchl0_safe) == MEM_read64(ip) && matchl0_safe == matchl0) { @@ -215,7 +215,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( matchl1 = base + idxl1; /* Same optimization as matchl0 above */ - matchs0_safe = ZSTD_selectAddr(prefixLowestIndex, idxs0, &dummy[0], matchs0); + matchs0_safe = ZSTD_selectAddr(idxs0, prefixLowestIndex, matchs0, &dummy[0]); /* check prefix short match */ if(MEM_read32(matchs0_safe) == MEM_read32(ip) && matchs0_safe == matchs0) { @@ -662,7 +662,7 @@ size_t ZSTD_compressBlock_doubleFast_extDict_generic( size_t mLength; hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */ - if (((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) + if (((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; From 8e5823b65c6d0d1eb1c07be8428f427df5892d3a Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 11 Oct 2024 15:38:12 -0700 Subject: [PATCH 10/10] rename variable name findMatch -> matchFound since it's a test, as opposed to an active search operation. suggested by @terrelln --- lib/compress/zstd_fast.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 691bedc1e8..e7edf94902 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -105,7 +105,7 @@ ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matc /* Array of ~random data, should have low probability of matching data. * Load from here if the index is invalid. * Used to avoid unpredictable branches. 
*/ - static const BYTE dummy[] = {0x12,0x34,0x56,0x78 }; + static const BYTE dummy[] = {0x12,0x34,0x56,0x78}; /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. * However expression below compiles into conditional move. @@ -232,7 +232,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( size_t step; const BYTE* nextStep; const size_t kStepIncr = (1 << (kSearchStrength - 1)); - const ZSTD_match4Found findMatch = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch; + const ZSTD_match4Found matchFound = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch; DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); ip0 += (ip0 == prefixStart); @@ -289,9 +289,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _match; } - if (findMatch(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { - /* found a match! */ - + if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { /* Write next hash table entry (it's already calculated). * This write is known to be safe because the ip1 == ip0 + 1, * so searching will resume after ip1 */ @@ -316,17 +314,14 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; - if (findMatch(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { - /* found a match! */ - - /* Write next hash table entry; it's already calculated */ + if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { + /* Write next hash table entry, since it's already calculated */ if (step <= 4) { /* Avoid writing an index if it's >= position where search will resume. * The minimum possible match has length 4, so search can resume at ip0 + 4. */ hashTable[hash1] = (U32)(ip1 - base); } - goto _offset; }