From a3a5dce5e8b370708559c1f82be77876c74196e8 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 12 Jun 2023 02:31:28 -0700 Subject: [PATCH] zstd: Add amd64 match length assembly (#824) * zstd: Add amd64 match length assembly Copied from the S2 implementation. 5-10% faster. --- zstd/blockdec.go | 2 +- zstd/matchlen_amd64.go | 16 ++++++++++ zstd/matchlen_amd64.s | 68 ++++++++++++++++++++++++++++++++++++++++ zstd/matchlen_generic.go | 33 +++++++++++++++++++ zstd/zstd.go | 22 ------------- 5 files changed, 118 insertions(+), 23 deletions(-) create mode 100644 zstd/matchlen_amd64.go create mode 100644 zstd/matchlen_amd64.s create mode 100644 zstd/matchlen_generic.go diff --git a/zstd/blockdec.go b/zstd/blockdec.go index 5f272d87f6..9f17ce601f 100644 --- a/zstd/blockdec.go +++ b/zstd/blockdec.go @@ -592,7 +592,7 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) { } seq.fse.setRLE(symb) if debugDecoder { - printf("RLE set to %+v, code: %v", symb, v) + printf("RLE set to 0x%x, code: %v", symb, v) } case compModeFSE: println("Reading table for", tableIndex(i)) diff --git a/zstd/matchlen_amd64.go b/zstd/matchlen_amd64.go new file mode 100644 index 0000000000..f41932b7a4 --- /dev/null +++ b/zstd/matchlen_amd64.go @@ -0,0 +1,16 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +// Copyright 2019+ Klaus Post. All rights reserved. +// License information can be found in the LICENSE file. + +package zstd + +// matchLen returns how many bytes match in a and b +// +// It assumes that: +// +// len(a) <= len(b) and len(a) > 0 +// +//go:noescape +func matchLen(a []byte, b []byte) int diff --git a/zstd/matchlen_amd64.s b/zstd/matchlen_amd64.s new file mode 100644 index 0000000000..9a7655c0f7 --- /dev/null +++ b/zstd/matchlen_amd64.s @@ -0,0 +1,68 @@ +// Copied from S2 implementation. + +//go:build !appengine && !noasm && gc && !noasm + +#include "textflag.h" + +// func matchLen(a []byte, b []byte) int +// Requires: BMI +TEXT ·matchLen(SB), NOSPLIT, $0-56 + MOVQ a_base+0(FP), AX + MOVQ b_base+24(FP), CX + MOVQ a_len+8(FP), DX + + // matchLen + XORL SI, SI + CMPL DX, $0x08 + JB matchlen_match4_standalone + +matchlen_loopback_standalone: + MOVQ (AX)(SI*1), BX + XORQ (CX)(SI*1), BX + TESTQ BX, BX + JZ matchlen_loop_standalone + +#ifdef GOAMD64_v3 + TZCNTQ BX, BX +#else + BSFQ BX, BX +#endif + SARQ $0x03, BX + LEAL (SI)(BX*1), SI + JMP gen_match_len_end + +matchlen_loop_standalone: + LEAL -8(DX), DX + LEAL 8(SI), SI + CMPL DX, $0x08 + JAE matchlen_loopback_standalone + +matchlen_match4_standalone: + CMPL DX, $0x04 + JB matchlen_match2_standalone + MOVL (AX)(SI*1), BX + CMPL (CX)(SI*1), BX + JNE matchlen_match2_standalone + LEAL -4(DX), DX + LEAL 4(SI), SI + +matchlen_match2_standalone: + CMPL DX, $0x02 + JB matchlen_match1_standalone + MOVW (AX)(SI*1), BX + CMPW (CX)(SI*1), BX + JNE matchlen_match1_standalone + LEAL -2(DX), DX + LEAL 2(SI), SI + +matchlen_match1_standalone: + CMPL DX, $0x01 + JB gen_match_len_end + MOVB (AX)(SI*1), BL + CMPB (CX)(SI*1), BL + JNE gen_match_len_end + INCL SI + +gen_match_len_end: + MOVQ SI, ret+48(FP) + RET diff --git a/zstd/matchlen_generic.go b/zstd/matchlen_generic.go new file mode 100644 index 0000000000..57b9c31c02 --- /dev/null +++ b/zstd/matchlen_generic.go @@ -0,0 +1,33 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +// Copyright 2019+ Klaus Post. All rights reserved. +// License information can be found in the LICENSE file. + +package zstd + +import ( + "encoding/binary" + "math/bits" +) + +// matchLen returns the maximum common prefix length of a and b. +// a must be the shortest of the two. +func matchLen(a, b []byte) (n int) { + for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] { + diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b) + if diff != 0 { + return n + bits.TrailingZeros64(diff)>>3 + } + n += 8 + } + + for i := range a { + if a[i] != b[i] { + break + } + n++ + } + return n + +} diff --git a/zstd/zstd.go b/zstd/zstd.go index 89396673d9..4be7cc7367 100644 --- a/zstd/zstd.go +++ b/zstd/zstd.go @@ -9,7 +9,6 @@ import ( "errors" "log" "math" - "math/bits" ) // enable debug printing @@ -106,27 +105,6 @@ func printf(format string, a ...interface{}) { } } -// matchLen returns the maximum common prefix length of a and b. -// a must be the shortest of the two. -func matchLen(a, b []byte) (n int) { - for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] { - diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b) - if diff != 0 { - return n + bits.TrailingZeros64(diff)>>3 - } - n += 8 - } - - for i := range a { - if a[i] != b[i] { - break - } - n++ - } - return n - -} - func load3232(b []byte, i int32) uint32 { return binary.LittleEndian.Uint32(b[:len(b):len(b)][i:]) }