From e8fce38954efd6bc58a5b0ec72dd26f41c8365e6 Mon Sep 17 00:00:00 2001 From: Ilya Tokar Date: Wed, 18 Sep 2024 17:36:37 -0400 Subject: [PATCH 01/10] Optimize compression by avoiding unpredictable branches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid unpredictable branch. Use conditional move to generate the address that is guaranteed to be safe and compare unconditionally. Instead of if (idx < limit && x[idx] == val ) // mispredicted idx < limit branch Do addr = cmov(safe,x+idx) if (*addr == val && idx < limit) // almost always false so well predicted Using microbenchmarks from https://github.com/google/fleetbench, I get about ~10% speed-up: name old cpu/op new cpu/op delta BM_ZSTD_COMPRESS_Fleet/compression_level:-7/window_log:15 1.46ns ± 3% 1.31ns ± 7% -9.88% (p=0.000 n=35+38) BM_ZSTD_COMPRESS_Fleet/compression_level:-7/window_log:16 1.41ns ± 3% 1.28ns ± 3% -9.56% (p=0.000 n=36+39) BM_ZSTD_COMPRESS_Fleet/compression_level:-5/window_log:15 1.61ns ± 1% 1.43ns ± 3% -10.70% (p=0.000 n=30+39) BM_ZSTD_COMPRESS_Fleet/compression_level:-5/window_log:16 1.54ns ± 2% 1.39ns ± 3% -9.21% (p=0.000 n=37+39) BM_ZSTD_COMPRESS_Fleet/compression_level:-3/window_log:15 1.82ns ± 2% 1.61ns ± 3% -11.31% (p=0.000 n=37+40) BM_ZSTD_COMPRESS_Fleet/compression_level:-3/window_log:16 1.73ns ± 3% 1.56ns ± 3% -9.50% (p=0.000 n=38+39) BM_ZSTD_COMPRESS_Fleet/compression_level:-1/window_log:15 2.12ns ± 2% 1.79ns ± 3% -15.55% (p=0.000 n=34+39) BM_ZSTD_COMPRESS_Fleet/compression_level:-1/window_log:16 1.99ns ± 3% 1.72ns ± 3% -13.70% (p=0.000 n=38+38) BM_ZSTD_COMPRESS_Fleet/compression_level:0/window_log:15 3.22ns ± 3% 2.94ns ± 3% -8.67% (p=0.000 n=38+40) BM_ZSTD_COMPRESS_Fleet/compression_level:0/window_log:16 3.19ns ± 4% 2.86ns ± 4% -10.55% (p=0.000 n=40+38) BM_ZSTD_COMPRESS_Fleet/compression_level:1/window_log:15 2.60ns ± 3% 2.22ns ± 3% -14.53% (p=0.000 n=40+39) BM_ZSTD_COMPRESS_Fleet/compression_level:1/window_log:16 2.46ns ± 3% 2.13ns ± 2% 
-13.67% (p=0.000 n=39+36) BM_ZSTD_COMPRESS_Fleet/compression_level:2/window_log:15 2.69ns ± 3% 2.46ns ± 3% -8.63% (p=0.000 n=37+39) BM_ZSTD_COMPRESS_Fleet/compression_level:2/window_log:16 2.63ns ± 3% 2.36ns ± 3% -10.47% (p=0.000 n=40+40) BM_ZSTD_COMPRESS_Fleet/compression_level:3/window_log:15 3.20ns ± 2% 2.95ns ± 3% -7.94% (p=0.000 n=35+40) BM_ZSTD_COMPRESS_Fleet/compression_level:3/window_log:16 3.20ns ± 4% 2.87ns ± 4% -10.33% (p=0.000 n=40+40) I've also measured the impact on internal workloads and saw similar ~10% improvement in performance, measured by cpu usage/byte of data. --- lib/compress/zstd_compress_internal.h | 17 ++++++++++++ lib/compress/zstd_double_fast.c | 37 +++++++++++++++++---------- lib/compress/zstd_fast.c | 32 ++++++++++++++--------- 3 files changed, 61 insertions(+), 25 deletions(-) diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index ba1450852e..cbcded28db 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -557,6 +557,23 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) return 1; } +/* ZSTD_selectAddr: + * @return a >= b ? trueAddr : falseAddr, + * tries to force branchless codegen. */ +MEM_STATIC const BYTE* ZSTD_selectAddr(U32 a, U32 b, const BYTE* trueAddr, const BYTE* falseAddr) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ( + "cmp %1, %2\n" + "cmova %3, %0\n" + : "+r"(trueAddr) + : "r"(a), "r"(b), "r"(falseAddr) + ); + return trueAddr; +#else + return a >= b ? trueAddr : falseAddr; +#endif +} + /* ZSTD_noCompressBlock() : * Writes uncompressed block to dst buffer from given src. 
* Returns the size of the block */ diff --git a/lib/compress/zstd_double_fast.c b/lib/compress/zstd_double_fast.c index 1b5f64f20b..819658caf7 100644 --- a/lib/compress/zstd_double_fast.c +++ b/lib/compress/zstd_double_fast.c @@ -140,11 +140,17 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( U32 idxl1; /* the long match index for ip1 */ const BYTE* matchl0; /* the long match for ip */ + const BYTE* matchl0_safe; /* matchl0 or safe address */ const BYTE* matchs0; /* the short match for ip */ const BYTE* matchl1; /* the long match for ip1 */ + const BYTE* matchs0_safe; /* matchs0 or safe address */ const BYTE* ip = istart; /* the current position */ const BYTE* ip1; /* the next position */ + /* Array of ~random data, should have low probability of matching data + * we load from here instead of from tables, if matchl0/matchl1 are + * invalid indices. Used to avoid unpredictable branches. */ + const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); @@ -191,24 +197,29 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( hl1 = ZSTD_hashPtr(ip1, hBitsL, 8); - if (idxl0 > prefixLowestIndex) { - /* check prefix long match */ - if (MEM_read64(matchl0) == MEM_read64(ip)) { - mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; - offset = (U32)(ip-matchl0); - while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ - goto _match_found; - } + /* idxl0 > prefixLowestIndex is a (somewhat) unpredictable branch. + * However expression below complies into conditional move. Since + * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex + * if there is a match, all branches become predictable. 
*/ + matchl0_safe = ZSTD_selectAddr(prefixLowestIndex, idxl0, &dummy[0], matchl0); + + /* check prefix long match */ + if (MEM_read64(matchl0_safe) == MEM_read64(ip) && matchl0_safe == matchl0) { + mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; + offset = (U32)(ip-matchl0); + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ + goto _match_found; } idxl1 = hashLong[hl1]; matchl1 = base + idxl1; - if (idxs0 > prefixLowestIndex) { - /* check prefix short match */ - if (MEM_read32(matchs0) == MEM_read32(ip)) { - goto _search_next_long; - } + /* Same optimization as matchl0 above */ + matchs0_safe = ZSTD_selectAddr(prefixLowestIndex, idxs0, &dummy[0], matchs0); + + /* check prefix short match */ + if(MEM_read32(matchs0_safe) == MEM_read32(ip) && matchs0_safe == matchs0) { + goto _search_next_long; } if (ip1 >= nextStep) { diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index c6baa49d4d..838a18ee5c 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -162,6 +162,11 @@ size_t ZSTD_compressBlock_fast_noDict_generic( const BYTE* const prefixStart = base + prefixStartIndex; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; + /* Array of ~random data, should have low probability of matching data + * we load from here instead of from tables, if the index is invalid. + * Used to avoid unpredictable branches. */ + const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; + const BYTE *mvalAddr; const BYTE* anchor = istart; const BYTE* ip0 = istart; @@ -246,15 +251,18 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _match; } + /* idx >= prefixStartIndex is a (somewhat) unpredictable branch. + * However expression below complies into conditional move. Since + * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex + * if there is a match, all branches become predictable. 
*/ + mvalAddr = base + idx; + mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, mvalAddr, &dummy[0]); + /* load match for ip[0] */ - if (idx >= prefixStartIndex) { - mval = MEM_read32(base + idx); - } else { - mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ - } + mval = MEM_read32(mvalAddr); /* check match at ip[0] */ - if (MEM_read32(ip0) == mval) { + if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { /* found a match! */ /* First write next hash table entry; we've already calculated it. @@ -281,15 +289,15 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; + mvalAddr = base + idx; + mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, mvalAddr, &dummy[0]); + /* load match for ip[0] */ - if (idx >= prefixStartIndex) { - mval = MEM_read32(base + idx); - } else { - mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ - } + mval = MEM_read32(mvalAddr); + /* check match at ip[0] */ - if (MEM_read32(ip0) == mval) { + if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { /* found a match! */ /* first write next hash table entry; we've already calculated it */ From 1e7fa242f4aa71c3aec5e1e39ec69ad54e117051 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 7 Oct 2024 11:22:40 -0700 Subject: [PATCH 02/10] minor refactor zstd_fast make hot variables more local --- lib/compress/zstd_compress_internal.h | 4 +- lib/compress/zstd_fast.c | 71 +++++++++++++-------------- 2 files changed, 36 insertions(+), 39 deletions(-) diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index cbcded28db..53c3bdee8b 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -560,7 +560,9 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) /* ZSTD_selectAddr: * @return a >= b ? trueAddr : falseAddr, * tries to force branchless codegen. 
*/ -MEM_STATIC const BYTE* ZSTD_selectAddr(U32 a, U32 b, const BYTE* trueAddr, const BYTE* falseAddr) { +MEM_STATIC const BYTE* +ZSTD_selectAddr(U32 a, U32 b, const BYTE* trueAddr, const BYTE* falseAddr) +{ #if defined(__GNUC__) && defined(__x86_64__) __asm__ ( "cmp %1, %2\n" diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 838a18ee5c..0b6230150a 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -166,7 +166,6 @@ size_t ZSTD_compressBlock_fast_noDict_generic( * we load from here instead of from tables, if the index is invalid. * Used to avoid unpredictable branches. */ const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; - const BYTE *mvalAddr; const BYTE* anchor = istart; const BYTE* ip0 = istart; @@ -182,7 +181,6 @@ size_t ZSTD_compressBlock_fast_noDict_generic( size_t hash0; /* hash for ip0 */ size_t hash1; /* hash for ip1 */ U32 idx; /* match idx for ip0 */ - U32 mval; /* src value at match idx */ U32 offcode; const BYTE* match0; @@ -255,22 +253,21 @@ size_t ZSTD_compressBlock_fast_noDict_generic( * However expression below complies into conditional move. Since * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex * if there is a match, all branches become predictable. */ - mvalAddr = base + idx; - mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, mvalAddr, &dummy[0]); + { const BYTE* mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, base + idx, &dummy[0]); + /* load match for ip[0] */ + U32 const mval = MEM_read32(mvalAddr); - /* load match for ip[0] */ - mval = MEM_read32(mvalAddr); - - /* check match at ip[0] */ - if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { - /* found a match! */ + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { + /* found a match! */ - /* First write next hash table entry; we've already calculated it. 
- * This write is known to be safe because the ip1 == ip0 + 1, so - * we know we will resume searching after ip1 */ - hashTable[hash1] = (U32)(ip1 - base); + /* Write next hash table entry (it's already calculated). + * This write is known to be safe because the ip1 == ip0 + 1, + * so searching will resume after ip1 */ + hashTable[hash1] = (U32)(ip1 - base); - goto _offset; + goto _offset; + } } /* lookup ip[1] */ @@ -289,32 +286,30 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; - mvalAddr = base + idx; - mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, mvalAddr, &dummy[0]); - - /* load match for ip[0] */ - mval = MEM_read32(mvalAddr); + { const BYTE* mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, base + idx, &dummy[0]); + /* load match for ip[0] */ + U32 const mval = MEM_read32(mvalAddr); + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { + /* found a match! */ - /* check match at ip[0] */ - if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { - /* found a match! */ + /* first write next hash table entry; we've already calculated it */ + if (step <= 4) { + /* We need to avoid writing an index into the hash table >= the + * position at which we will pick up our searching after we've + * taken this match. + * + * The minimum possible match has length 4, so the earliest ip0 + * can be after we take this match will be the current ip0 + 4. + * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely + * write this position. + */ + hashTable[hash1] = (U32)(ip1 - base); + } - /* first write next hash table entry; we've already calculated it */ - if (step <= 4) { - /* We need to avoid writing an index into the hash table >= the - * position at which we will pick up our searching after we've - * taken this match. - * - * The minimum possible match has length 4, so the earliest ip0 - * can be after we take this match will be the current ip0 + 4. 
- * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely - * write this position. - */ - hashTable[hash1] = (U32)(ip1 - base); + goto _offset; } - - goto _offset; } /* lookup ip[1] */ @@ -554,7 +549,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); hashTable[hash0] = curr; /* update hash table */ - if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) + if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; From 2cc600bab21657ccf966ceadfeb316e2eacff25c Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 8 Oct 2024 11:10:48 -0700 Subject: [PATCH 03/10] refactor search into an inline function for easier swapping with a parameter --- lib/compress/zstd_compress_internal.h | 4 +- lib/compress/zstd_fast.c | 93 +++++++++++++-------------- 2 files changed, 48 insertions(+), 49 deletions(-) diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 53c3bdee8b..64618848da 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -561,14 +561,14 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) * @return a >= b ? trueAddr : falseAddr, * tries to force branchless codegen. 
*/ MEM_STATIC const BYTE* -ZSTD_selectAddr(U32 a, U32 b, const BYTE* trueAddr, const BYTE* falseAddr) +ZSTD_selectAddr(U32 index, U32 lowLimit, const BYTE* trueAddr, const BYTE* falseAddr) { #if defined(__GNUC__) && defined(__x86_64__) __asm__ ( "cmp %1, %2\n" "cmova %3, %0\n" : "+r"(trueAddr) - : "r"(a), "r"(b), "r"(falseAddr) + : "r"(index), "r"(lowLimit), "r"(falseAddr) ); return trueAddr; #else diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 0b6230150a..94e6300eb4 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -97,6 +97,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, } +static int +ZSTD_findMatch_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress) +{ + /* idx >= prefixStartIndex is a (somewhat) unpredictable branch. + * However expression below complies into conditional move. Since + * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex + * if there is a match, all branches become predictable. */ + const BYTE* mvalAddr = ZSTD_selectAddr(currentIdx, lowLimit, matchAddress, fakeAddress); + return ((MEM_read32(currentPtr) == MEM_read32(mvalAddr)) & (currentIdx >= lowLimit)); +} + + /** * If you squint hard enough (and ignore repcodes), the search operation at any * given position is broken into 4 stages: @@ -148,13 +160,12 @@ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_compressBlock_fast_noDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const mls, U32 const hasStep) + U32 const mls, U32 const unpredictable) { const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; U32 const hlog = cParams->hashLog; - /* support stepSize of 0 */ - size_t const stepSize = hasStep ? 
(cParams->targetLength + !(cParams->targetLength) + 1) : 2; + size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */ const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); @@ -193,6 +204,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( size_t step; const BYTE* nextStep; const size_t kStepIncr = (1 << (kSearchStrength - 1)); + (void)unpredictable; DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); ip0 += (ip0 == prefixStart); @@ -249,25 +261,15 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _match; } - /* idx >= prefixStartIndex is a (somewhat) unpredictable branch. - * However expression below complies into conditional move. Since - * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex - * if there is a match, all branches become predictable. */ - { const BYTE* mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, base + idx, &dummy[0]); - /* load match for ip[0] */ - U32 const mval = MEM_read32(mvalAddr); + if (ZSTD_findMatch_cmov(ip0, base + idx, idx, prefixStartIndex, dummy)) { + /* found a match! */ - /* check match at ip[0] */ - if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { - /* found a match! */ - - /* Write next hash table entry (it's already calculated). - * This write is known to be safe because the ip1 == ip0 + 1, - * so searching will resume after ip1 */ - hashTable[hash1] = (U32)(ip1 - base); + /* Write next hash table entry (it's already calculated). 
+ * This write is known to be safe because the ip1 == ip0 + 1, + * so searching will resume after ip1 */ + hashTable[hash1] = (U32)(ip1 - base); - goto _offset; - } + goto _offset; } /* lookup ip[1] */ @@ -286,30 +288,24 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; - { const BYTE* mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, base + idx, &dummy[0]); - /* load match for ip[0] */ - U32 const mval = MEM_read32(mvalAddr); - - /* check match at ip[0] */ - if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) { - /* found a match! */ - - /* first write next hash table entry; we've already calculated it */ - if (step <= 4) { - /* We need to avoid writing an index into the hash table >= the - * position at which we will pick up our searching after we've - * taken this match. - * - * The minimum possible match has length 4, so the earliest ip0 - * can be after we take this match will be the current ip0 + 4. - * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely - * write this position. - */ - hashTable[hash1] = (U32)(ip1 - base); - } - - goto _offset; + if (ZSTD_findMatch_cmov(ip0, base + idx, idx, prefixStartIndex, dummy)) { + /* found a match! */ + + /* first write next hash table entry; it's already calculated */ + if (step <= 4) { + /* We need to avoid writing an index into the hash table >= the + * position at which we will pick up our searching after we've + * taken this match. + * + * The minimum possible match has length 4, so the earliest ip0 + * can be after we take this match will be the current ip0 + 4. + * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely + * write this position. 
+ */ + hashTable[hash1] = (U32)(ip1 - base); } + + goto _offset; } /* lookup ip[1] */ @@ -409,12 +405,12 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _start; } -#define ZSTD_GEN_FAST_FN(dictMode, mls, step) \ - static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \ +#define ZSTD_GEN_FAST_FN(dictMode, mls, cmov) \ + static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##cmov( \ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ void const* src, size_t srcSize) \ { \ - return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \ + return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, cmov); \ } ZSTD_GEN_FAST_FN(noDict, 4, 1) @@ -432,8 +428,10 @@ size_t ZSTD_compressBlock_fast( void const* src, size_t srcSize) { U32 const mls = ms->cParams.minMatch; + /* use cmov instead of branch when the branch is likely unpredictable */ + int const useCmov = 1; assert(ms->dictMatchState == NULL); - if (ms->cParams.targetLength > 1) { + if (useCmov) { switch(mls) { default: /* includes case 3 */ @@ -447,6 +445,7 @@ size_t ZSTD_compressBlock_fast( return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize); } } else { + /* use a branch instead */ switch(mls) { default: /* includes case 3 */ From 186b1324951f2505469a3415aaf8ddc3398b9fca Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 8 Oct 2024 11:43:07 -0700 Subject: [PATCH 04/10] made search strategy switchable between cmov and branch and use a simple heuristic based on wlog to select between them. 
note: performance is not good on clang (yet) --- lib/compress/zstd_compress_internal.h | 12 ++--- lib/compress/zstd_fast.c | 72 +++++++++++++++------------ 2 files changed, 47 insertions(+), 37 deletions(-) diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 64618848da..0be7ae14e7 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -558,21 +558,21 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) } /* ZSTD_selectAddr: - * @return a >= b ? trueAddr : falseAddr, + * @return index >= lowLimit ? candidate : backup, * tries to force branchless codegen. */ MEM_STATIC const BYTE* -ZSTD_selectAddr(U32 index, U32 lowLimit, const BYTE* trueAddr, const BYTE* falseAddr) +ZSTD_selectAddr(U32 index, U32 lowLimit, const BYTE* candidate, const BYTE* backup) { #if defined(__GNUC__) && defined(__x86_64__) __asm__ ( "cmp %1, %2\n" "cmova %3, %0\n" - : "+r"(trueAddr) - : "r"(index), "r"(lowLimit), "r"(falseAddr) + : "+r"(candidate) + : "r"(index), "r"(lowLimit), "r"(backup) ); - return trueAddr; + return candidate; #else - return a >= b ? trueAddr : falseAddr; + return index >= lowLimit ? 
candidate : backup; #endif } diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 94e6300eb4..3c6e751314 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -45,7 +45,7 @@ void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); - } } } } + } } } } } static @@ -97,17 +97,34 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, } +typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress); + static int -ZSTD_findMatch_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress) +ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress) { - /* idx >= prefixStartIndex is a (somewhat) unpredictable branch. - * However expression below complies into conditional move. Since - * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex - * if there is a match, all branches become predictable. */ + /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. + * However expression below compiles into conditional move. 
+ */ const BYTE* mvalAddr = ZSTD_selectAddr(currentIdx, lowLimit, matchAddress, fakeAddress); return ((MEM_read32(currentPtr) == MEM_read32(mvalAddr)) & (currentIdx >= lowLimit)); } +static int +ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress) +{ + /* using a branch instead of a cmov, + * because it's faster in scenarios where currentIdx >= lowLimit is generally true, + * aka almost all candidates are within range */ + U32 mval; (void)fakeAddress; + if (currentIdx >= lowLimit) { + mval = MEM_read32(matchAddress); + } else { + mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */ + } + + return (MEM_read32(currentPtr) == mval); +} + /** * If you squint hard enough (and ignore repcodes), the search operation at any @@ -160,7 +177,7 @@ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_compressBlock_fast_noDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const mls, U32 const unpredictable) + U32 const mls, int useCmov) { const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; @@ -204,7 +221,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( size_t step; const BYTE* nextStep; const size_t kStepIncr = (1 << (kSearchStrength - 1)); - (void)unpredictable; + const ZSTD_match4Found findMatch = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch; DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); ip0 += (ip0 == prefixStart); @@ -253,15 +270,15 @@ size_t ZSTD_compressBlock_fast_noDict_generic( offcode = REPCODE1_TO_OFFBASE; mLength += 4; - /* First write next hash table entry; we've already calculated it. - * This write is known to be safe because the ip1 is before the + /* Write next hash table entry: it's already calculated. + * This write is known to be safe because ip1 is before the * repcode (ip2). 
*/ hashTable[hash1] = (U32)(ip1 - base); goto _match; } - if (ZSTD_findMatch_cmov(ip0, base + idx, idx, prefixStartIndex, dummy)) { + if (findMatch(ip0, base + idx, idx, prefixStartIndex, dummy)) { /* found a match! */ /* Write next hash table entry (it's already calculated). @@ -288,19 +305,13 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; - if (ZSTD_findMatch_cmov(ip0, base + idx, idx, prefixStartIndex, dummy)) { + if (findMatch(ip0, base + idx, idx, prefixStartIndex, dummy)) { /* found a match! */ - /* first write next hash table entry; it's already calculated */ + /* Write next hash table entry; it's already calculated */ if (step <= 4) { - /* We need to avoid writing an index into the hash table >= the - * position at which we will pick up our searching after we've - * taken this match. - * - * The minimum possible match has length 4, so the earliest ip0 - * can be after we take this match will be the current ip0 + 4. - * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely - * write this position. + /* Avoid writing an index if it's >= position where search will resume. + * The minimum possible match has length 4, so search can resume at ip0 + 4. */ hashTable[hash1] = (U32)(ip1 - base); } @@ -331,7 +342,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( } while (ip3 < ilimit); _cleanup: - /* Note that there are probably still a couple positions we could search. + /* Note that there are probably still a couple positions one could search. * However, it seems to be a meaningful performance hit to try to search * them. So let's not. 
*/ @@ -405,12 +416,12 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _start; } -#define ZSTD_GEN_FAST_FN(dictMode, mls, cmov) \ - static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##cmov( \ +#define ZSTD_GEN_FAST_FN(dictMode, mml, cmov) \ + static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov( \ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ void const* src, size_t srcSize) \ { \ - return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, cmov); \ + return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \ } ZSTD_GEN_FAST_FN(noDict, 4, 1) @@ -427,12 +438,12 @@ size_t ZSTD_compressBlock_fast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - U32 const mls = ms->cParams.minMatch; - /* use cmov instead of branch when the branch is likely unpredictable */ - int const useCmov = 1; + U32 const mml = ms->cParams.minMatch; + /* use cmov when "candidate in range" branch is likely unpredictable */ + int const useCmov = ms->cParams.windowLog < 19; assert(ms->dictMatchState == NULL); if (useCmov) { - switch(mls) + switch(mml) { default: /* includes case 3 */ case 4 : @@ -446,7 +457,7 @@ size_t ZSTD_compressBlock_fast( } } else { /* use a branch instead */ - switch(mls) + switch(mml) { default: /* includes case 3 */ case 4 : @@ -458,7 +469,6 @@ size_t ZSTD_compressBlock_fast( case 7 : return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize); } - } } From 197c258a79a94b60ec45020defc5fb6a2f570e80 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 8 Oct 2024 15:54:48 -0700 Subject: [PATCH 05/10] introduce memory barrier to force test order suggested by @terrelln --- lib/compress/zstd_fast.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 3c6e751314..9d0cefeac1 100644 --- a/lib/compress/zstd_fast.c 
+++ b/lib/compress/zstd_fast.c @@ -106,7 +106,15 @@ ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 curr * However expression below compiles into conditional move. */ const BYTE* mvalAddr = ZSTD_selectAddr(currentIdx, lowLimit, matchAddress, fakeAddress); - return ((MEM_read32(currentPtr) == MEM_read32(mvalAddr)) & (currentIdx >= lowLimit)); + /* Note: this used to be written as : return test1 && test2; + * Unfortunately, once inlined, these tests become branches, + * in which case it becomes critical that they are executed in the right order (test1 then test2). + * So we have to write these tests in a specific manner to ensure their ordering. + */ + if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0; + /* force ordering of these tests, which matters once the function is inlined, as they become branches */ + __asm__(""); + return currentIdx >= lowLimit; } static int From 741b860fc1e8171519a72bf3d30cdd20995b00ce Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 8 Oct 2024 16:02:54 -0700 Subject: [PATCH 06/10] store dummy bytes within ZSTD_match4Found_cmov() feels more logical, better contained --- lib/compress/zstd_fast.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 9d0cefeac1..5bac5d9cfe 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -97,15 +97,20 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, } -typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress); +typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit); static int -ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress) +ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit) { + /* 
Array of ~random data, should have low probability of matching data. + * Load from here if the index is invalid. + * Used to avoid unpredictable branches. */ + static const BYTE dummy[] = {0x12,0x34,0x56,0x78 }; + /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. * However expression below compiles into conditional move. */ - const BYTE* mvalAddr = ZSTD_selectAddr(currentIdx, lowLimit, matchAddress, fakeAddress); + const BYTE* mvalAddr = ZSTD_selectAddr(currentIdx, lowLimit, matchAddress, dummy); /* Note: this used to be written as : return test1 && test2; * Unfortunately, once inlined, these tests become branches, * in which case it becomes critical that they are executed in the right order (test1 then test2). @@ -118,12 +123,12 @@ ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 curr } static int -ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit, const BYTE* fakeAddress) +ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit) { /* using a branch instead of a cmov, * because it's faster in scenarios where currentIdx >= lowLimit is generally true, * aka almost all candidates are within range */ - U32 mval; (void)fakeAddress; + U32 mval; if (currentIdx >= lowLimit) { mval = MEM_read32(matchAddress); } else { @@ -198,10 +203,6 @@ size_t ZSTD_compressBlock_fast_noDict_generic( const BYTE* const prefixStart = base + prefixStartIndex; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; - /* Array of ~random data, should have low probability of matching data - * we load from here instead of from tables, if the index is invalid. - * Used to avoid unpredictable branches. 
*/ - const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; const BYTE* anchor = istart; const BYTE* ip0 = istart; @@ -286,7 +287,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _match; } - if (findMatch(ip0, base + idx, idx, prefixStartIndex, dummy)) { + if (findMatch(ip0, base + idx, idx, prefixStartIndex)) { /* found a match! */ /* Write next hash table entry (it's already calculated). @@ -313,7 +314,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; - if (findMatch(ip0, base + idx, idx, prefixStartIndex, dummy)) { + if (findMatch(ip0, base + idx, idx, prefixStartIndex)) { /* found a match! */ /* Write next hash table entry; it's already calculated */ From d45aee43f4b1ac8ee0fbb182310b9b7622f85d1c Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 8 Oct 2024 16:38:35 -0700 Subject: [PATCH 07/10] make __asm__ usage __GNUC__-specific --- lib/compress/zstd_fast.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 5bac5d9cfe..d4b8f06722 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -118,7 +118,9 @@ ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 curr */ if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0; /* force ordering of these tests, which matters once the function is inlined, as they become branches */ +#if defined(__GNUC__) __asm__(""); +#endif return currentIdx >= lowLimit; } From fa1fcb08ab447de8cdb4492d663779cad6841379 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 10 Oct 2024 16:07:20 -0700 Subject: [PATCH 08/10] minor: better variable naming --- lib/compress/zstd_fast.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index d4b8f06722..691bedc1e8 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -97,10 +97,10 @@ void 
ZSTD_fillHashTable(ZSTD_matchState_t* ms, } -typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit); +typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit); static int -ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit) +ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) { /* Array of ~random data, should have low probability of matching data. * Load from here if the index is invalid. @@ -110,7 +110,7 @@ ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 curr /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. * However expression below compiles into conditional move. */ - const BYTE* mvalAddr = ZSTD_selectAddr(currentIdx, lowLimit, matchAddress, dummy); + const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy); /* Note: this used to be written as : return test1 && test2; * Unfortunately, once inlined, these tests become branches, * in which case it becomes critical that they are executed in the right order (test1 then test2). 
@@ -121,17 +121,17 @@ ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 curr #if defined(__GNUC__) __asm__(""); #endif - return currentIdx >= lowLimit; + return matchIdx >= idxLowLimit; } static int -ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 currentIdx, U32 lowLimit) +ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) { /* using a branch instead of a cmov, - * because it's faster in scenarios where currentIdx >= lowLimit is generally true, + * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true, * aka almost all candidates are within range */ U32 mval; - if (currentIdx >= lowLimit) { + if (matchIdx >= idxLowLimit) { mval = MEM_read32(matchAddress); } else { mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */ @@ -219,7 +219,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( size_t hash0; /* hash for ip0 */ size_t hash1; /* hash for ip1 */ - U32 idx; /* match idx for ip0 */ + U32 matchIdx; /* match idx for ip0 */ U32 offcode; const BYTE* match0; @@ -261,7 +261,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( hash0 = ZSTD_hashPtr(ip0, hlog, mls); hash1 = ZSTD_hashPtr(ip1, hlog, mls); - idx = hashTable[hash0]; + matchIdx = hashTable[hash0]; do { /* load repcode match for ip[2]*/ @@ -289,7 +289,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _match; } - if (findMatch(ip0, base + idx, idx, prefixStartIndex)) { + if (findMatch(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { /* found a match! */ /* Write next hash table entry (it's already calculated). 
@@ -301,7 +301,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( } /* lookup ip[1] */ - idx = hashTable[hash1]; + matchIdx = hashTable[hash1]; /* hash ip[2] */ hash0 = hash1; @@ -316,7 +316,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; - if (findMatch(ip0, base + idx, idx, prefixStartIndex)) { + if (findMatch(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { /* found a match! */ /* Write next hash table entry; it's already calculated */ @@ -331,7 +331,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( } /* lookup ip[1] */ - idx = hashTable[hash1]; + matchIdx = hashTable[hash1]; /* hash ip[2] */ hash0 = hash1; @@ -382,7 +382,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( _offset: /* Requires: ip0, idx */ /* Compute the offset code. */ - match0 = base + idx; + match0 = base + matchIdx; rep_offset2 = rep_offset1; rep_offset1 = (U32)(ip0-match0); offcode = OFFSET_TO_OFFBASE(rep_offset1); From 83de00316c29a7f5245a67733f6208c699b686c2 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 11 Oct 2024 15:36:15 -0700 Subject: [PATCH 09/10] fixed parameter ordering in `dfast` noticed by @terrelln --- lib/compress/zstd_double_fast.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/compress/zstd_double_fast.c b/lib/compress/zstd_double_fast.c index 819658caf7..c04010b11d 100644 --- a/lib/compress/zstd_double_fast.c +++ b/lib/compress/zstd_double_fast.c @@ -201,7 +201,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( * However expression below complies into conditional move. Since * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex * if there is a match, all branches become predictable. 
*/ - matchl0_safe = ZSTD_selectAddr(prefixLowestIndex, idxl0, &dummy[0], matchl0); + matchl0_safe = ZSTD_selectAddr(idxl0, prefixLowestIndex, matchl0, &dummy[0]); /* check prefix long match */ if (MEM_read64(matchl0_safe) == MEM_read64(ip) && matchl0_safe == matchl0) { @@ -215,7 +215,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( matchl1 = base + idxl1; /* Same optimization as matchl0 above */ - matchs0_safe = ZSTD_selectAddr(prefixLowestIndex, idxs0, &dummy[0], matchs0); + matchs0_safe = ZSTD_selectAddr(idxs0, prefixLowestIndex, matchs0, &dummy[0]); /* check prefix short match */ if(MEM_read32(matchs0_safe) == MEM_read32(ip) && matchs0_safe == matchs0) { @@ -662,7 +662,7 @@ size_t ZSTD_compressBlock_doubleFast_extDict_generic( size_t mLength; hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */ - if (((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) + if (((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; From 8e5823b65c6d0d1eb1c07be8428f427df5892d3a Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 11 Oct 2024 15:38:12 -0700 Subject: [PATCH 10/10] rename variable name findMatch -> matchFound since it's a test, as opposed to an active search operation. suggested by @terrelln --- lib/compress/zstd_fast.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 691bedc1e8..e7edf94902 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -105,7 +105,7 @@ ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matc /* Array of ~random data, should have low probability of matching data. * Load from here if the index is invalid. * Used to avoid unpredictable branches. 
*/ - static const BYTE dummy[] = {0x12,0x34,0x56,0x78 }; + static const BYTE dummy[] = {0x12,0x34,0x56,0x78}; /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. * However expression below compiles into conditional move. @@ -232,7 +232,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( size_t step; const BYTE* nextStep; const size_t kStepIncr = (1 << (kSearchStrength - 1)); - const ZSTD_match4Found findMatch = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch; + const ZSTD_match4Found matchFound = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch; DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); ip0 += (ip0 == prefixStart); @@ -289,9 +289,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic( goto _match; } - if (findMatch(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { - /* found a match! */ - + if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { /* Write next hash table entry (it's already calculated). * This write is known to be safe because the ip1 == ip0 + 1, * so searching will resume after ip1 */ @@ -316,17 +314,14 @@ size_t ZSTD_compressBlock_fast_noDict_generic( current0 = (U32)(ip0 - base); hashTable[hash0] = current0; - if (findMatch(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { - /* found a match! */ - - /* Write next hash table entry; it's already calculated */ + if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { + /* Write next hash table entry, since it's already calculated */ if (step <= 4) { /* Avoid writing an index if it's >= position where search will resume. * The minimum possible match has length 4, so search can resume at ip0 + 4. */ hashTable[hash1] = (U32)(ip1 - base); } - goto _offset; }