From 68c93109d0e4b098f0866fb1eebd0c215dfcc55f Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Thu, 25 Feb 2021 01:53:51 -0800 Subject: [PATCH] s2: Add AMD64 assembly for better mode (#315) Blocks: ``` benchmark old ns/op new ns/op delta BenchmarkTwainEncode1e1/better-32 10.7 10.5 -1.87% BenchmarkTwainEncode1e2/better-32 2947 280 -90.50% BenchmarkTwainEncode1e3/better-32 6664 2525 -62.11% BenchmarkTwainEncode1e4/better-32 47401 25461 -46.29% BenchmarkTwainEncode1e5/better-32 528060 417367 -20.96% BenchmarkTwainEncode1e6/better-32 2137499 1554364 -27.28% benchmark old ns/op new ns/op delta BenchmarkRandomEncodeBetterBlock1MB-32 39476 38241 -3.13% BenchmarkEncodeS2Block/0-html/block-better-32 10140 6761 -33.32% BenchmarkEncodeS2Block/1-urls/block-better-32 141170 90141 -36.15% BenchmarkEncodeS2Block/2-jpg/block-better-32 1026 848 -17.35% BenchmarkEncodeS2Block/3-jpg_200b/block-better-32 332 24.3 -92.68% BenchmarkEncodeS2Block/4-pdf/block-better-32 12266 7164 -41.59% BenchmarkEncodeS2Block/5-html4/block-better-32 14229 8134 -42.84% BenchmarkEncodeS2Block/6-txt1/block-better-32 40537 27718 -31.62% BenchmarkEncodeS2Block/7-txt2/block-better-32 35890 24783 -30.95% BenchmarkEncodeS2Block/8-txt3/block-better-32 104525 77463 -25.89% BenchmarkEncodeS2Block/9-txt4/block-better-32 144537 104121 -27.96% BenchmarkEncodeS2Block/10-pb/block-better-32 9017 5427 -39.81% BenchmarkEncodeS2Block/11-gaviota/block-better-32 31386 20973 -33.18% BenchmarkEncodeS2Block/12-txt1_128b/block-better-32 312 16.4 -94.74% BenchmarkEncodeS2Block/13-txt1_1000b/block-better-32 578 136 -76.47% BenchmarkEncodeS2Block/14-txt1_10000b/block-better-32 3278 1293 -60.56% BenchmarkEncodeS2Block/15-txt1_20000b/block-better-32 6469 3820 -40.95% benchmark old MB/s new MB/s speedup BenchmarkRandomEncodeBetterBlock1MB-32 26562.09 27420.04 1.03x BenchmarkEncodeS2Block/0-html/block-better-32 10098.47 15145.41 1.50x BenchmarkEncodeS2Block/1-urls/block-better-32 4973.34 7788.75 1.57x 
BenchmarkEncodeS2Block/2-jpg/block-better-32 119973.57 145200.76 1.21x BenchmarkEncodeS2Block/3-jpg_200b/block-better-32 602.41 8241.97 13.68x BenchmarkEncodeS2Block/4-pdf/block-better-32 8348.31 14293.26 1.71x BenchmarkEncodeS2Block/5-html4/block-better-32 28786.61 50355.67 1.75x BenchmarkEncodeS2Block/6-txt1/block-better-32 3751.82 5486.93 1.46x BenchmarkEncodeS2Block/7-txt2/block-better-32 3487.81 5051.03 1.45x BenchmarkEncodeS2Block/8-txt3/block-better-32 4082.81 5509.15 1.35x BenchmarkEncodeS2Block/9-txt4/block-better-32 3333.82 4627.90 1.39x BenchmarkEncodeS2Block/10-pb/block-better-32 13151.91 21850.98 1.66x BenchmarkEncodeS2Block/11-gaviota/block-better-32 5872.67 8788.25 1.50x BenchmarkEncodeS2Block/12-txt1_128b/block-better-32 410.38 7791.86 18.99x BenchmarkEncodeS2Block/13-txt1_1000b/block-better-32 1729.19 7370.56 4.26x BenchmarkEncodeS2Block/14-txt1_10000b/block-better-32 3050.66 7736.81 2.54x BenchmarkEncodeS2Block/15-txt1_20000b/block-better-32 3091.47 5235.17 1.69x ``` Streams, With/without assembly, 16 cores: ``` github-june-2days-2019.json: Compressing... 6273951764 -> 949146808 [15.13%]; 564ms, 10608.7MB/s Compressing... 6273951764 -> 950079555 [15.14%]; 722ms, 8287.1MB/s github-ranks-backup.bin: Compressing... 1862623243 -> 555069246 [29.80%]; 261ms, 6805.8MB/s Compressing... 1862623243 -> 555617002 [29.83%]; 384ms, 4625.9MB/s enwik9: Compressing... 1000000000 -> 426854233 [42.69%]; 229ms, 4164.5MB/s Compressing... 1000000000 -> 427660256 [42.77%]; 333ms, 2863.9MB/s nyc-taxi-data-10M.csv: Compressing... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s Compressing... 3325605752 -> 960330423 [28.88%]; 608ms, 5216.4MB/s sharnd.out.2gb: Compressing... 2147483647 -> 2147487753 [100.00%]; 174ms, 11770.0MB/s Compressing... 
2147483647 -> 2147487753 [100.00%]; 172ms, 11907.1MB/s ``` --- README.md | 8 +- s2/README.md | 24 +- s2/_generate/gen.go | 691 +++- s2/cmd/s2c/main.go | 67 +- s2/encode_amd64.go | 40 +- s2/encode_better.go | 9 +- s2/encode_go.go | 10 + s2/encodeblock_amd64.go | 42 + s2/encodeblock_amd64.s | 8525 +++++++++++++++++++++++++++++++++------ s2/s2_test.go | 22 +- 10 files changed, 8172 insertions(+), 1266 deletions(-) diff --git a/README.md b/README.md index d9f9249a4e..637ed0ca41 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,13 @@ This package provides various compression algorithms. [![Sourcegraph Badge](https://sourcegraph.com/github.com/klauspost/compress/-/badge.svg)](https://sourcegraph.com/github.com/klauspost/compress?badge) # changelog - +* Feb 25, 2021 (v1.11.8) + * s2: Fixed occasional out-of-bounds write on amd64. Upgrade recommended. + * s2: Add AMD64 assembly for better mode. 25-50% faster. [#315](https://github.com/klauspost/compress/pull/315) + * s2: Less upfront decoder allocation. [#322](https://github.com/klauspost/compress/pull/322) + * zstd: Faster "compression" of incompressible data. [#314](https://github.com/klauspost/compress/pull/314) + * zip: Fix zip64 headers. [#313](https://github.com/klauspost/compress/pull/313) + * Jan 14, 2021 (v1.11.7) * Use Bytes() interface to get bytes across packages. [#309](https://github.com/klauspost/compress/pull/309) * s2: Add 'best' compression option. [#310](https://github.com/klauspost/compress/pull/310) diff --git a/s2/README.md b/s2/README.md index 479e824778..601bd397a4 100644 --- a/s2/README.md +++ b/s2/README.md @@ -8,7 +8,7 @@ Decoding is compatible with Snappy compressed content, but content compressed wi This means that S2 can seamlessly replace Snappy without converting compressed content. S2 is designed to have high throughput on content that cannot be compressed. -This is important so you don't have to worry about spending CPU cycles on already compressed data. 
+This is important, so you don't have to worry about spending CPU cycles on already compressed data. ## Benefits over Snappy @@ -456,33 +456,33 @@ This will compress as much as possible with little regard to CPU usage. Mainly for offline compression, but where decompression speed should still be high and compatible with other S2 compressed data. -Some examples compared on 16 core CPU: +Some examples compared on 16 core CPU, amd64 assembly used: ``` * enwik10 Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s -Better... 10000000000 -> 4225922984 [42.26%]; 2.817s, 3385.4MB/s -Best... 10000000000 -> 3667646858 [36.68%]; 35.995s, 264.9MB/s +Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s +Best... 10000000000 -> 3667646858 [36.68%]; 35.995s, 264.9MB/s * github-june-2days-2019.json Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s -Better... 6273951764 -> 950079555 [15.14%]; 736ms, 8129.5MB/s -Best... 6273951764 -> 846260870 [13.49%]; 8.125s, 736.4MB/s +Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s +Best... 6273951764 -> 846260870 [13.49%]; 8.125s, 736.4MB/s * nyc-taxi-data-10M.csv Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s -Better... 3325605752 -> 960330423 [28.88%]; 602ms, 5268.4MB/s -Best... 3325605752 -> 794873295 [23.90%]; 6.619s, 479.1MB/s +Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s +Best... 3325605752 -> 794873295 [23.90%]; 6.619s, 479.1MB/s * 10gb.tar Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s -Better... 10065157632 -> 5650133605 [56.14%]; 2.172s, 4419.4MB/s -Best... 10065157632 -> 5246578570 [52.13%]; 25.696s, 373.6MB/s +Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s +Best... 10065157632 -> 5246578570 [52.13%]; 25.696s, 373.6MB/s * consensus.db.10gb Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s -Better... 10737418240 -> 4542443833 [42.30%]; 3.3s, 3103.5MB/s -Best... 
10737418240 -> 4272335558 [39.79%]; 38.955s, 262.9MB/s +Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s +Best... 10737418240 -> 4272335558 [39.79%]; 38.955s, 262.9MB/s ``` Decompression speed should be around the same as using the 'better' compression mode. diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index f1c345a027..01830e1ff4 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -35,10 +35,17 @@ func main() { snappy: false, } o.genEncodeBlockAsm("encodeBlockAsm", 14, 6, 6, limit14B) + o.genEncodeBlockAsm("encodeBlockAsm4MB", 14, 6, 6, 4<<20) o.genEncodeBlockAsm("encodeBlockAsm12B", 12, 5, 5, limit12B) o.genEncodeBlockAsm("encodeBlockAsm10B", 10, 5, 4, limit10B) o.genEncodeBlockAsm("encodeBlockAsm8B", 8, 4, 4, limit8B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 16, 7, 7, limit14B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm4MB", 16, 7, 7, 4<<20) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 6, 6, limit12B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 5, 6, limit10B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 4, 6, limit8B) + // Snappy compatible o.snappy = true o.genEncodeBlockAsm("encodeSnappyBlockAsm", 14, 6, 6, limit14B) @@ -103,8 +110,8 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m "It assumes that the varint-encoded length of the decompressed bytes has already been written.", "") Pragma("noescape") - const literalMaxOverhead = 4 o.maxLen = maxLen + var literalMaxOverhead = maxLitOverheadFor(maxLen) var tableSize = 4 * (1 << tableBits) // Memzero needs at least 128 bytes. 
@@ -722,6 +729,648 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m RET() } +func maxLitOverheadFor(n int) int { + switch { + case n == 0: + return 0 + case n < 60: + return 1 + case n < 1<<8: + return 2 + case n < 1<<16: + return 3 + case n < 1<<24: + return 4 + } + return 5 +} + +func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHashBytes, maxLen int) { + TEXT(name, 0, "func(dst, src []byte) int") + Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.", + fmt.Sprintf("Maximum input %d bytes.", maxLen), + "It assumes that the varint-encoded length of the decompressed bytes has already been written.", "") + Pragma("noescape") + + if lHashBytes > 7 || lHashBytes <= 4 { + panic("lHashBytes must be <= 7 and >4") + } + var literalMaxOverhead = maxLitOverheadFor(maxLen) + + var sTableBits = lTableBits - 2 + const sHashBytes = 4 + o.maxLen = maxLen + + var lTableSize = 4 * (1 << lTableBits) + var sTableSize = 4 * (1 << sTableBits) + + // Memzero needs at least 128 bytes. + if (lTableSize + sTableSize) < 128 { + panic("tableSize must be at least 128 bytes") + } + + lenSrcBasic, err := Param("src").Len().Resolve() + if err != nil { + panic(err) + } + lenSrcQ := lenSrcBasic.Addr + + lenDstBasic, err := Param("dst").Len().Resolve() + if err != nil { + panic(err) + } + lenDstQ := lenDstBasic.Addr + + // Bail if we can't compress to at least this. + dstLimitPtrQ := AllocLocal(8) + + // sLimitL is when to stop looking for offset/length copies. + sLimitL := AllocLocal(4) + + // nextEmitL keeps track of the point we have emitted to. + nextEmitL := AllocLocal(4) + + // Repeat stores the last match offset. + repeatL := AllocLocal(4) + + // nextSTempL keeps nextS while other functions are being called. + nextSTempL := AllocLocal(4) + + // Alloc table last, lTab must be before sTab. 
+ lTab := AllocLocal(lTableSize) + sTab := AllocLocal(sTableSize) + + dst := GP64() + { + dstBaseBasic, err := Param("dst").Base().Resolve() + if err != nil { + panic(err) + } + dstBaseQ := dstBaseBasic.Addr + MOVQ(dstBaseQ, dst) + } + + srcBaseBasic, err := Param("src").Base().Resolve() + if err != nil { + panic(err) + } + srcBaseQ := srcBaseBasic.Addr + + // Zero table + { + iReg := GP64() + MOVQ(U32((sTableSize+lTableSize)/8/16), iReg) + tablePtr := GP64() + LEAQ(lTab, tablePtr) + zeroXmm := XMM() + PXOR(zeroXmm, zeroXmm) + + Label("zero_loop_" + name) + for i := 0; i < 8; i++ { + MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16}) + } + ADDQ(U8(16*8), tablePtr) + DECQ(iReg) + JNZ(LabelRef("zero_loop_" + name)) + } + + { + // nextEmit is offset n src where the next emitLiteral should start from. + MOVL(U32(0), nextEmitL) + + const inputMargin = 8 + tmp, tmp2, tmp3 := GP64(), GP64(), GP64() + MOVQ(lenSrcQ, tmp) + LEAQ(Mem{Base: tmp, Disp: -6}, tmp2) + // sLimitL := len(src) - inputMargin + LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3) + + assert(func(ok LabelRef) { + CMPQ(tmp3, lenSrcQ) + JL(ok) + }) + + MOVL(tmp3.As32(), sLimitL) + + // dstLimit := (len(src) - 5 ) - len(src)>>5 + SHRQ(U8(5), tmp) + SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp + + assert(func(ok LabelRef) { + // if len(src) > len(src) - len(src)>>5 - 5: ok + CMPQ(lenSrcQ, tmp2) + JGE(ok) + }) + + LEAQ(Mem{Base: dst, Index: tmp2, Scale: 1}, tmp2) + MOVQ(tmp2, dstLimitPtrQ) + } + + // s = 1 + s := GP32() + MOVL(U32(1), s) + // repeatL = 1 + MOVL(s, repeatL) + + src := GP64() + Load(Param("src").Base(), src) + + // Load cv + Label("search_loop_" + name) + candidate := GP32() + { + assert(func(ok LabelRef) { + // Check if somebody changed src + tmp := GP64() + MOVQ(srcBaseQ, tmp) + CMPQ(tmp, src) + JEQ(ok) + }) + + cv := GP64() + MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv) + nextS := GP32() + // nextS := s + (s-nextEmit)>>skipLog + 1 + { + tmp := GP64() + MOVL(s, tmp.As32()) // tmp = s + 
SUBL(nextEmitL, tmp.As32()) // tmp = s - nextEmit + SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog + LEAL(Mem{Base: s, Disp: 1, Index: tmp, Scale: 1}, nextS) + } + // if nextS > sLimit {goto emitRemainder} + { + CMPL(nextS.As32(), sLimitL) + JGE(LabelRef("emit_remainder_" + name)) + } + assert(func(ok LabelRef) { + // Check if s is valid (we should have jumped above if not) + tmp := GP64() + MOVQ(lenSrcQ, tmp) + CMPQ(tmp, s.As64()) + JG(ok) + }) + // move nextS to stack. + MOVL(nextS.As32(), nextSTempL) + + candidateS := GP32() + lHasher := hashN(lHashBytes, lTableBits) + { + sHasher := hashN(sHashBytes, sTableBits) + hash0, hash1 := GP64(), GP64() + MOVQ(cv, hash0) + MOVQ(cv, hash1) + lHasher.hash(hash0) + sHasher.hash(hash1) + MOVL(lTab.Idx(hash0, 4), candidate) + MOVL(sTab.Idx(hash1, 4), candidateS) + assert(func(ok LabelRef) { + CMPQ(hash0, U32(lTableSize)) + JL(ok) + }) + assert(func(ok LabelRef) { + CMPQ(hash1, U32(sTableSize)) + JL(ok) + }) + + MOVL(s, lTab.Idx(hash0, 4)) + MOVL(s, sTab.Idx(hash1, 4)) + } + + // En/disable repeat matching. + if true { + // Check repeat at offset checkRep + const checkRep = 1 + { + // rep = s - repeat + rep := GP32() + MOVL(s, rep) + SUBL(repeatL, rep) // rep = s - repeat + + // if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + left, right := GP64(), GP64() + MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32()) + MOVQ(cv, left) + SHRQ(U8(checkRep*8), left) + CMPL(left.As32(), right.As32()) + // BAIL, no repeat. + JNE(LabelRef("no_repeat_found_" + name)) + } + // base = s + checkRep + base := GP32() + LEAL(Mem{Base: s, Disp: checkRep}, base) + + // nextEmit before repeat. 
+ nextEmit := GP32() + MOVL(nextEmitL, nextEmit) + + // Extend back + if true { + i := GP32() + MOVL(base, i) + SUBL(repeatL, i) + JZ(LabelRef("repeat_extend_back_end_" + name)) + + Label("repeat_extend_back_loop_" + name) + // if base <= nextemit {exit} + CMPL(base.As32(), nextEmit) + JLE(LabelRef("repeat_extend_back_end_" + name)) + // if src[i-1] == src[base-1] + tmp, tmp2 := GP64(), GP64() + MOVB(Mem{Base: src, Index: i, Scale: 1, Disp: -1}, tmp.As8()) + MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8()) + CMPB(tmp.As8(), tmp2.As8()) + JNE(LabelRef("repeat_extend_back_end_" + name)) + LEAL(Mem{Base: base, Disp: -1}, base) + DECL(i) + JNZ(LabelRef("repeat_extend_back_loop_" + name)) + } + Label("repeat_extend_back_end_" + name) + + // Base is now at start. Emit until base. + // d += emitLiteral(dst[d:], src[nextEmit:base]) + if true { + o.emitLiteralsDstP(nextEmitL, base, src, dst, "repeat_emit_"+name) + } + + // Extend forward + { + // s += 4 + checkRep + ADDL(U8(4+checkRep), s) + + if true { + // candidate := s - repeat + 4 + checkRep + MOVL(s, candidate) + SUBL(repeatL, candidate) // candidate = s - repeat + + // srcLeft = len(src) - s + srcLeft := GP64() + MOVQ(lenSrcQ, srcLeft) + SUBL(s, srcLeft.As32()) + assert(func(ok LabelRef) { + // if srcleft < maxint32: ok + CMPQ(srcLeft, U32(0x7fffffff)) + JL(ok) + }) + // Forward address + forwardStart := GP64() + LEAQ(Mem{Base: src, Index: s, Scale: 1}, forwardStart) + // End address + backStart := GP64() + LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, backStart) + + length := o.matchLen("repeat_extend_"+name, forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name)) + forwardStart, backStart, srcLeft = nil, nil, nil + Label("repeat_extend_forward_end_" + name) + // s+= length + ADDL(length.As32(), s) + } + } + // Emit + if true { + // length = s-base + length := GP32() + MOVL(s, length) + SUBL(base.As32(), length) // length = s - base + + offsetVal := GP32() + MOVL(repeatL, 
offsetVal) + + if !o.snappy { + // if nextEmit == 0 {do copy instead...} + TESTL(nextEmit, nextEmit) + JZ(LabelRef("repeat_as_copy_" + name)) + + // Emit as repeat... + o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name)) + + // Emit as copy instead... + Label("repeat_as_copy_" + name) + } + o.emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name)) + + Label("repeat_end_emit_" + name) + // Store new dst and nextEmit + MOVL(s, nextEmitL) + } + // if s >= sLimit is picked up on next loop. + if false { + CMPL(s.As32(), sLimitL) + JGE(LabelRef("emit_remainder_" + name)) + } + JMP(LabelRef("search_loop_" + name)) + } + Label("no_repeat_found_" + name) + { + // Check candidates are ok. All must be < s and < len(src) + assert(func(ok LabelRef) { + tmp := GP64() + MOVQ(lenSrcQ, tmp) + CMPL(tmp.As32(), candidate) + JG(ok) + }) + assert(func(ok LabelRef) { + CMPL(s, candidate) + JG(ok) + }) + assert(func(ok LabelRef) { + tmp := GP64() + MOVQ(lenSrcQ, tmp) + CMPL(tmp.As32(), candidateS) + JG(ok) + }) + assert(func(ok LabelRef) { + CMPL(s, candidateS) + JG(ok) + }) + + CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) + JEQ(LabelRef("candidate_match_" + name)) + + //if uint32(cv) == load32(src, candidateS) + CMPL(Mem{Base: src, Index: candidateS, Scale: 1}, cv.As32()) + JEQ(LabelRef("candidateS_match_" + name)) + + // No match found, next loop + // s = nextS + MOVL(nextSTempL, s) + JMP(LabelRef("search_loop_" + name)) + + // Short match at s, try a long candidate at s+1 + Label("candidateS_match_" + name) + if true { + hash0 := GP64() + SHRQ(U8(8), cv) + MOVQ(cv, hash0) + lHasher.hash(hash0) + MOVL(lTab.Idx(hash0, 4), candidate) + INCL(s) + assert(func(ok LabelRef) { + CMPQ(hash0, U32(lTableSize)) + JL(ok) + }) + MOVL(s, lTab.Idx(hash0, 4)) + CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) + JEQ(LabelRef("candidate_match_" + name)) + // No match, decrement s again and 
use short match at s... + DECL(s) + } + MOVL(candidateS, candidate) + } + } + + Label("candidate_match_" + name) + // We have a match at 's' with src offset in "candidate" that matches at least 4 bytes. + // Extend backwards + if true { + ne := GP32() + MOVL(nextEmitL, ne) + TESTL(candidate, candidate) + JZ(LabelRef("match_extend_back_end_" + name)) + + // candidate is tested when decremented, so we loop back here. + Label("match_extend_back_loop_" + name) + // if s <= nextEmit {exit} + CMPL(s, ne) + JLE(LabelRef("match_extend_back_end_" + name)) + // if src[candidate-1] == src[s-1] + tmp, tmp2 := GP64(), GP64() + MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8()) + MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8()) + CMPB(tmp.As8(), tmp2.As8()) + JNE(LabelRef("match_extend_back_end_" + name)) + LEAL(Mem{Base: s, Disp: -1}, s) + DECL(candidate) + JZ(LabelRef("match_extend_back_end_" + name)) + JMP(LabelRef("match_extend_back_loop_" + name)) + } + Label("match_extend_back_end_" + name) + + // Bail if we exceed the maximum size. + if true { + // tmp = s-nextEmit + tmp := GP64() + MOVL(s, tmp.As32()) + SUBL(nextEmitL, tmp.As32()) + // tmp = &dst + s-nextEmit + LEAQ(Mem{Base: dst, Index: tmp, Scale: 1, Disp: literalMaxOverhead}, tmp) + CMPQ(tmp, dstLimitPtrQ) + JL(LabelRef("match_dst_size_check_" + name)) + ri, err := ReturnIndex(0).Resolve() + if err != nil { + panic(err) + } + MOVQ(U32(0), ri.Addr) + RET() + } + Label("match_dst_size_check_" + name) + + base := GP32() + MOVL(s, base.As32()) + + // s+=4, candidate+=4 + ADDL(U8(4), s) + ADDL(U8(4), candidate) + // Extend the 4-byte match as long as possible and emit copy. + { + assert(func(ok LabelRef) { + // s must be > candidate cannot be equal. 
+ CMPL(s, candidate) + JG(ok) + }) + // srcLeft = len(src) - s + srcLeft := GP64() + MOVQ(lenSrcQ, srcLeft) + SUBL(s, srcLeft.As32()) + assert(func(ok LabelRef) { + // if srcleft < maxint32: ok + CMPQ(srcLeft, U32(0x7fffffff)) + JL(ok) + }) + + a, b := GP64(), GP64() + LEAQ(Mem{Base: src, Index: s, Scale: 1}, a) + LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, b) + length := o.matchLen("match_nolit_"+name, + a, b, + srcLeft, + LabelRef("match_nolit_end_"+name), + ) + Label("match_nolit_end_" + name) + assert(func(ok LabelRef) { + CMPL(length.As32(), U32(math.MaxInt32)) + JL(ok) + }) + a, b, srcLeft = nil, nil, nil + + // Update repeat + { + // repeat = base - candidate + repeatVal := GP64().As32() + MOVL(s, repeatVal) + SUBL(candidate, repeatVal) + // Check if match is better.. + if o.maxLen > 65535 { + CMPL(length.As32(), U8(1)) + JG(LabelRef("match_length_ok_" + name)) + CMPL(repeatVal, U32(65535)) + JLE(LabelRef("match_length_ok_" + name)) + // Match is equal or worse to the encoding. + MOVL(nextSTempL, s) + INCL(s) + JMP(LabelRef("search_loop_" + name)) + Label("match_length_ok_" + name) + } + // Store updated repeat + MOVL(repeatVal, repeatL) + } + // Emit.... + o.emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_"+name) + // s += length (length is destroyed, use it now) + ADDL(length.As32(), s) + + // Load offset from repeat value. + offset := GP64() + MOVL(repeatL, offset.As32()) + + // length += 4 + ADDL(U8(4), length.As32()) + MOVL(s, nextEmitL) // nextEmit = s + o.emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name)) + Label("match_nolit_emitcopy_end_" + name) + + // if s >= sLimit { end } + { + CMPL(s.As32(), sLimitL) + JGE(LabelRef("emit_remainder_" + name)) + } + + // Bail if we exceed the maximum size. 
+ { + CMPQ(dst, dstLimitPtrQ) + JL(LabelRef("match_nolit_dst_ok_" + name)) + ri, err := ReturnIndex(0).Resolve() + if err != nil { + panic(err) + } + MOVQ(U32(0), ri.Addr) + RET() + } + } + Label("match_nolit_dst_ok_" + name) + // cv must be set to value at base+1 before arriving here + if true { + lHasher := hashN(lHashBytes, lTableBits) + sHasher := hashN(sHashBytes, sTableBits) + + // Index base+1 long, base+2 short... + cv := GP64() + INCL(base) + MOVQ(Mem{Base: src, Index: base, Scale: 1, Disp: 0}, cv) + hash0, hash1 := GP64(), GP64() + MOVQ(cv, hash0) // src[base+1] + MOVQ(cv, hash1) + SHRQ(U8(8), hash1) // src[base+2] + bp1 := GP32() // base+1 + LEAL(Mem{Base: base, Disp: 1}, bp1) + + // Load s-2 early + MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, cv) + + lHasher.hash(hash0) + sHasher.hash(hash1) + assert(func(ok LabelRef) { + CMPQ(hash0, U32(lTableSize)) + JL(ok) + }) + assert(func(ok LabelRef) { + CMPQ(hash1, U32(sTableSize)) + JL(ok) + }) + MOVL(base, lTab.Idx(hash0, 4)) + MOVL(bp1, sTab.Idx(hash1, 4)) + + // Index s-2 long, s-1 short... + MOVQ(cv, hash0) // src[s-2] + MOVQ(cv, hash1) // src[s-1] + SHRQ(U8(8), hash1) + sm1, sm2 := GP32(), GP32() // s -1, s - 2 + LEAL(Mem{Base: s, Disp: -2}, sm2) + LEAL(Mem{Base: s, Disp: -1}, sm1) + lHasher.hash(hash0) + sHasher.hash(hash1) + assert(func(ok LabelRef) { + CMPQ(hash0, U32(lTableSize)) + JL(ok) + }) + assert(func(ok LabelRef) { + CMPQ(hash1, U32(sTableSize)) + JL(ok) + }) + MOVL(sm2, lTab.Idx(hash0, 4)) + MOVL(sm1, sTab.Idx(hash1, 4)) + } + JMP(LabelRef("search_loop_" + name)) + + Label("emit_remainder_" + name) + // Bail if we exceed the maximum size. 
+ // if d+len(src)-nextEmitL > dstLimitPtrQ { return 0 + { + // remain = len(src) - nextEmit + remain := GP64() + MOVQ(lenSrcQ, remain) + SUBL(nextEmitL, remain.As32()) + + dstExpect := GP64() + // dst := dst + (len(src)-nextEmitL) + + LEAQ(Mem{Base: dst, Index: remain, Scale: 1, Disp: literalMaxOverhead}, dstExpect) + CMPQ(dstExpect, dstLimitPtrQ) + JL(LabelRef("emit_remainder_ok_" + name)) + ri, err := ReturnIndex(0).Resolve() + if err != nil { + panic(err) + } + MOVQ(U32(0), ri.Addr) + RET() + Label("emit_remainder_ok_" + name) + } + // emitLiteral(dst[d:], src[nextEmitL:]) + emitEnd := GP64() + MOVQ(lenSrcQ, emitEnd) + + // Emit final literals. + o.emitLiteralsDstP(nextEmitL, emitEnd, src, dst, "emit_remainder_"+name) + + // Assert size is < limit + assert(func(ok LabelRef) { + // if dstBaseQ < dstLimitPtrQ: ok + CMPQ(dst, dstLimitPtrQ) + JL(ok) + }) + + // length := start - base (ptr arithmetic) + length := GP64() + dstBase := Load(Param("dst").Base(), GP64()) + MOVQ(dst, length) + SUBQ(dstBase, length) + + // Assert size is < len(src) + assert(func(ok LabelRef) { + // if len(src) >= length: ok + CMPQ(lenSrcQ, length) + JGE(ok) + }) + // Assert size is < len(dst) + assert(func(ok LabelRef) { + // if len(dst) >= length: ok + CMPQ(lenDstQ, length) + JGE(ok) + }) + Store(length, ReturnIndex(0)) + RET() +} + // emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase. // Checks if base == nextemit. // src & base are untouched. @@ -818,7 +1467,9 @@ func hashN(hashBytes, tablebits int) hashGen { // hash uses multiply to get hash of the value. func (h hashGen) hash(val reg.GPVirtual) { // Move value to top of register. 
- SHLQ(U8(64-8*h.bytes), val) + if h.bytes < 8 { + SHLQ(U8(64-8*h.bytes), val) + } IMULQ(h.mulreg, val) // Move value to bottom SHRQ(U8(64-h.tablebits), val) @@ -965,7 +1616,7 @@ func (o options) emitLiteral(name string, litLen, retval, dstBase, litBase reg.G } JMP(end) - // > 32 bytes + // > 64 bytes Label("memmove_long_" + name) // copy(dst[i:], lit) @@ -1021,6 +1672,7 @@ func (o options) genEmitRepeat() { // Will jump to end label when finished. // Uses 1 GP register. func (o options) emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) { + Comment("emitRepeat") Label("emit_repeat_again_" + name) tmp := GP32() MOVL(length.As32(), tmp) // Copy length @@ -1212,6 +1864,8 @@ const ( // Will jump to end label when finished. // Uses 2 GP registers. func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) { + Comment("emitCopy") + if o.maxLen >= 65536 { //if offset >= 65536 { CMPL(offset.As32(), U32(65536)) @@ -1357,6 +2011,7 @@ func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVir // All passed registers may be updated. // Length must be 1 -> 64 bytes func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, end LabelRef) { + Comment("genMemMoveShort") AX, CX := GP64(), GP64() name += "_memmove_" @@ -1448,6 +2103,7 @@ func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, en // AVX uses 4 GP registers 16 AVX/SSE registers. // All passed registers may be updated. 
func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end LabelRef) { + Comment("genMemMoveLong") name += "large_" assert(func(ok LabelRef) { @@ -1622,8 +2278,9 @@ func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end // Store start and end for sse_tail Label(name + "forward_sse") - X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() - X8, X9, X10, X11 := XMM(), XMM(), XMM(), XMM() + X0, X1, X2, X3, X4, X5 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + // X6, X7 := XMM(), XMM() + //X8, X9, X10, X11 := XMM(), XMM(), XMM(), XMM() MOVOU(Mem{Base: src}, X0) MOVOU(Mem{Base: src, Disp: 16}, X1) @@ -1634,7 +2291,7 @@ func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end dstAlign := GP64() bigLoops := GP64() MOVQ(length, bigLoops) - SHRQ(U8(7), bigLoops) // bigLoops = length / 128 + SHRQ(U8(5), bigLoops) // bigLoops = length / 32 MOVQ(dst, dstAlign) ANDL(U32(31), dstAlign.As32()) @@ -1642,7 +2299,7 @@ func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end MOVQ(U32(64), srcOff) SUBQ(dstAlign, srcOff) - // Move 128 bytes/loop + // Move 32 bytes/loop DECQ(bigLoops) JA(LabelRef(name + "forward_sse_loop_32")) @@ -1656,24 +2313,12 @@ func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end MOVOU(Mem{Disp: 0, Base: srcPos}, X4) MOVOU(Mem{Disp: 16, Base: srcPos}, X5) - MOVOU(Mem{Disp: 32, Base: srcPos}, X6) - MOVOU(Mem{Disp: 48, Base: srcPos}, X7) - MOVOU(Mem{Disp: 64, Base: srcPos}, X8) - MOVOU(Mem{Disp: 80, Base: srcPos}, X9) - MOVOU(Mem{Disp: 96, Base: srcPos}, X10) - MOVOU(Mem{Disp: 112, Base: srcPos}, X11) MOVOA(X4, Mem{Disp: 0, Base: dstPos}) MOVOA(X5, Mem{Disp: 16, Base: dstPos}) - MOVOA(X6, Mem{Disp: 32, Base: dstPos}) - MOVOA(X7, Mem{Disp: 48, Base: dstPos}) - MOVOA(X8, Mem{Disp: 64, Base: dstPos}) - MOVOA(X9, Mem{Disp: 80, Base: dstPos}) - MOVOA(X10, Mem{Disp: 96, Base: dstPos}) - MOVOA(X11, Mem{Disp: 112, 
Base: dstPos}) - ADDQ(U8(128), dstPos) - ADDQ(U8(128), srcPos) - ADDQ(U8(128), srcOff) // This could be outside the loop, but we lose a reg if we do. + ADDQ(U8(32), dstPos) + ADDQ(U8(32), srcPos) + ADDQ(U8(32), srcOff) // This could be outside the loop, but we lose a reg if we do. DECQ(bigLoops) JNA(LabelRef(name + "big_loop_back")) @@ -1720,6 +2365,7 @@ func (o options) genMatchLen() { // Will jump to end when done and returns the length. // Uses 2 GP registers. func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual { + Comment("matchLen") if false { return o.matchLenAlt(name, a, b, len, end) } @@ -1768,6 +2414,7 @@ func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) re // Uses 3 GP registers. // It is better on longer matches. func (o options) matchLenAlt(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual { + Comment("matchLenAlt") tmp, tmp2, matched := GP64(), GP64(), GP32() XORL(matched, matched) diff --git a/s2/cmd/s2c/main.go b/s2/cmd/s2c/main.go index 7bcef824d9..64483fbbb9 100644 --- a/s2/cmd/s2c/main.go +++ b/s2/cmd/s2c/main.go @@ -2,6 +2,7 @@ package main import ( "bufio" + "bytes" "errors" "flag" "fmt" @@ -36,6 +37,7 @@ var ( remove = flag.Bool("rm", false, "Delete source file(s) after successful compression") quiet = flag.Bool("q", false, "Don't write any output to terminal, except errors") bench = flag.Int("bench", 0, "Run benchmark n times. 
No output will be written") + verify = flag.Bool("verify", false, "Verify written files") help = flag.Bool("help", false, "Display help") cpuprofile, memprofile, traceprofile string @@ -137,6 +139,7 @@ Options:`) *quiet = *quiet || *stdout if *bench > 0 { debug.SetGCPercent(10) + dec := s2.NewReader(nil) for _, filename := range files { func() { if !*quiet { @@ -151,8 +154,18 @@ Options:`) _, err = io.ReadFull(file, b) exitErr(err) file.Close() + var buf *bytes.Buffer for i := 0; i < *bench; i++ { - wc := wCounter{out: ioutil.Discard} + w := ioutil.Discard + // Verify with this buffer... + if *verify { + if buf == nil { + buf = bytes.NewBuffer(make([]byte, 0, len(b)+(len(b)>>8))) + } + buf.Reset() + w = buf + } + wc := wCounter{out: w} if !*quiet { fmt.Print("\nCompressing...") } @@ -170,6 +183,27 @@ Options:`) ms := elapsed.Round(time.Millisecond) fmt.Printf(" %d -> %d [%.02f%%]; %v, %.01fMB/s", input, wc.n, pct, ms, mbpersec) } + if *verify { + if !*quiet { + fmt.Print("\nDecompressing.") + } + start := time.Now() + dec.Reset(buf) + n, err := io.Copy(ioutil.Discard, dec) + exitErr(err) + if int(n) != len(b) { + exitErr(fmt.Errorf("unexpected size, want %d, got %d", len(b), n)) + } + if !*quiet { + input := len(b) + elapsed := time.Since(start) + mbpersec := (float64(input) / (1024 * 1024)) / (float64(elapsed) / (float64(time.Second))) + pct := float64(input) * 100 / float64(wc.n) + ms := elapsed.Round(time.Millisecond) + fmt.Printf(" %d -> %d [%.02f%%]; %v, %.01fMB/s", wc.n, n, pct, ms, mbpersec) + } + dec.Reset(nil) + } } fmt.Println("") wr.Close() @@ -218,6 +252,7 @@ Options:`) defer bw.Flush() out = bw } + out, errFn := verifyTo(ioutil.Discard) wc := wCounter{out: out} wr.Reset(&wc) defer wr.Close() @@ -232,6 +267,7 @@ Options:`) pct := float64(wc.n) * 100 / float64(input) fmt.Printf(" %d -> %d [%.02f%%]; %.01fMB/s\n", input, wc.n, pct, mbpersec) } + exitErr(errFn()) if *remove { closeOnce.Do(func() { file.Close() @@ -246,6 +282,35 @@ Options:`) } } +func 
verifyTo(w io.Writer) (io.Writer, func() error) { + if !*verify { + return w, func() error { + return nil + } + } + pr, pw := io.Pipe() + writer := io.MultiWriter(w, pw) + var wg sync.WaitGroup + var err error + wg.Add(1) + go func() { + defer wg.Done() + r := s2.NewReader(pr) + _, err = io.Copy(ioutil.Discard, r) + pr.CloseWithError(fmt.Errorf("verify: %w", err)) + }() + return writer, func() error { + pw.Close() + wg.Wait() + if err == nil { + if !*quiet { + fmt.Print("... Verified ok.") + } + } + return err + } +} + func printErr(err error) { if err != nil { fmt.Fprintln(os.Stderr, "\nERROR:", err.Error()) diff --git a/s2/encode_amd64.go b/s2/encode_amd64.go index 253f84f3c1..e5b47a7a03 100644 --- a/s2/encode_amd64.go +++ b/s2/encode_amd64.go @@ -21,9 +21,12 @@ func encodeBlock(dst, src []byte) (d int) { limit8B = 512 ) - if len(src) >= limit12B { + if len(src) >= 4<<20 { return encodeBlockAsm(dst, src) } + if len(src) >= limit12B { + return encodeBlockAsm4MB(dst, src) + } if len(src) >= limit10B { return encodeBlockAsm12B(dst, src) } @@ -36,6 +39,41 @@ func encodeBlock(dst, src []byte) (d int) { return encodeBlockAsm8B(dst, src) } +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetter(dst, src []byte) (d int) { + const ( + // Use 12 bit table when less than... + limit12B = 16 << 10 + // Use 10 bit table when less than... + limit10B = 4 << 10 + // Use 8 bit table when less than... 
+ limit8B = 512 + ) + + if len(src) > 4<<20 { + return encodeBetterBlockAsm(dst, src) + } + if len(src) >= limit12B { + return encodeBetterBlockAsm4MB(dst, src) + } + if len(src) >= limit10B { + return encodeBetterBlockAsm12B(dst, src) + } + if len(src) >= limit8B { + return encodeBetterBlockAsm10B(dst, src) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBetterBlockAsm8B(dst, src) +} + // encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It // assumes that the varint-encoded length of the decompressed bytes has already // been written. diff --git a/s2/encode_better.go b/s2/encode_better.go index f4c5e04d2f..13e7d4eada 100644 --- a/s2/encode_better.go +++ b/s2/encode_better.go @@ -44,7 +44,7 @@ func hash8(u uint64, h uint8) uint32 { // It also assumes that: // len(dst) >= MaxEncodedLen(len(src)) && // minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize -func encodeBlockBetter(dst, src []byte) (d int) { +func encodeBlockBetterGo(dst, src []byte) (d int) { // Initialize the hash tables. const ( // Long hash matches. @@ -68,7 +68,7 @@ func encodeBlockBetter(dst, src []byte) (d int) { } // Bail if we can't compress to at least this. - dstLimit := len(src) - len(src)>>5 - 5 + dstLimit := len(src) - len(src)>>5 - 6 // nextEmit is where in src the next emitLiteral should start from. nextEmit := 0 @@ -83,9 +83,10 @@ func encodeBlockBetter(dst, src []byte) (d int) { for { candidateL := 0 + nextS := 0 for { // Next src position to check - nextS := s + (s-nextEmit)>>7 + 1 + nextS = s + (s-nextEmit)>>7 + 1 if nextS > sLimit { goto emitRemainder } @@ -185,7 +186,7 @@ func encodeBlockBetter(dst, src []byte) (d int) { if offset > 65535 && s-base <= 5 { // Bail if the match is equal or worse to the encoding. 
- s = base + 3 + s = nextS + 1 if s >= sLimit { goto emitRemainder } diff --git a/s2/encode_go.go b/s2/encode_go.go index 82f0047df6..8be2b8f86f 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -20,6 +20,16 @@ func encodeBlock(dst, src []byte) (d int) { return encodeBlockGo(dst, src) } +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockBetter(dst, src []byte) (d int) { + return encodeBlockBetterGo(dst, src) +} + // emitLiteral writes a literal chunk and returns the number of bytes written. // // It assumes that: diff --git a/s2/encodeblock_amd64.go b/s2/encodeblock_amd64.go index 99e7d68bee..9ab3c7ae74 100644 --- a/s2/encodeblock_amd64.go +++ b/s2/encodeblock_amd64.go @@ -13,6 +13,13 @@ package s2 //go:noescape func encodeBlockAsm(dst []byte, src []byte) int +// encodeBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4194304 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm4MB(dst []byte, src []byte) int + // encodeBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 16383 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. @@ -34,6 +41,41 @@ func encodeBlockAsm10B(dst []byte, src []byte) int //go:noescape func encodeBlockAsm8B(dst []byte, src []byte) int +// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm(dst []byte, src []byte) int + +// encodeBetterBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. 
+// Maximum input 4194304 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm4MB(dst []byte, src []byte) int + +// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm12B(dst []byte, src []byte) int + +// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm10B(dst []byte, src []byte) int + +// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm8B(dst []byte, src []byte) int + // encodeSnappyBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 4294967295 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. 
diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 239c1c7de1..918f57f9e8 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -144,6 +144,8 @@ one_byte_repeat_emit_encodeBlockAsm: memmove_repeat_emit_encodeBlockAsm: LEAQ (AX)(R8*1), BP + + // genMemMoveShort CMPQ R8, $0x03 JB emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_1or2 JE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_3 @@ -205,13 +207,15 @@ memmove_end_copy_repeat_emit_encodeBlockAsm: JMP emit_literal_done_repeat_emit_encodeBlockAsm memmove_long_repeat_emit_encodeBlockAsm: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R8*1), BP + + // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 - SHRQ $0x07, R11 + SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 @@ -224,23 +228,11 @@ memmove_long_repeat_emit_encodeBlockAsm: emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 - MOVOU 32(R10), X6 - MOVOU 48(R10), X7 - MOVOU 64(R10), X8 - MOVOU 80(R10), X9 - MOVOU 96(R10), X10 - MOVOU 112(R10), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R10 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back @@ -266,6 +258,8 @@ emit_literal_done_repeat_emit_encodeBlockAsm: SUBL CX, R8 LEAQ (DX)(CX*1), R9 LEAQ (DX)(BP*1), BP + + // matchLen XORL R11, R11 CMPL R8, $0x08 JL matchlen_single_repeat_extend_encodeBlockAsm @@ -306,6 +300,7 @@ repeat_extend_forward_end_encodeBlockAsm: TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm + // emitRepeat emit_repeat_again_match_repeat_encodeBlockAsm: MOVL BP, DI LEAL -4(BP), BP @@ -373,6 +368,7 @@ repeat_two_offset_match_repeat_encodeBlockAsm: JMP repeat_end_emit_encodeBlockAsm 
repeat_as_copy_encodeBlockAsm: + // emitCopy CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm @@ -386,6 +382,7 @@ four_bytes_loop_back_repeat_as_copy_encodeBlockAsm: CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm + // emitRepeat emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: MOVL BP, DI LEAL -4(BP), BP @@ -471,6 +468,7 @@ two_byte_offset_repeat_as_copy_encodeBlockAsm: LEAL -60(BP), BP ADDQ $0x03, AX + // emitRepeat emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: MOVL BP, DI LEAL -4(BP), BP @@ -608,7 +606,7 @@ match_extend_back_loop_encodeBlockAsm: match_extend_back_end_encodeBlockAsm: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 5(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm MOVQ $0x00000000, ret+48(FP) @@ -667,6 +665,8 @@ one_byte_match_emit_encodeBlockAsm: memmove_match_emit_encodeBlockAsm: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 JB emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_1or2 JE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_3 @@ -728,13 +728,15 @@ memmove_end_copy_match_emit_encodeBlockAsm: JMP emit_literal_done_match_emit_encodeBlockAsm memmove_long_match_emit_encodeBlockAsm: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -747,23 +749,11 @@ memmove_long_match_emit_encodeBlockAsm: emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, 
R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back @@ -792,6 +782,8 @@ match_nolit_loop_encodeBlockAsm: SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BP*1), BP + + // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm @@ -829,6 +821,8 @@ match_nolit_end_encodeBlockAsm: MOVL 16(SP), BP ADDL $0x04, R9 MOVL CX, 12(SP) + + // emitCopy CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm @@ -842,6 +836,7 @@ four_bytes_loop_back_match_nolit_encodeBlockAsm: CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm + // emitRepeat emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: MOVL R9, SI LEAL -4(R9), R9 @@ -927,6 +922,7 @@ two_byte_offset_match_nolit_encodeBlockAsm: LEAL -60(R9), R9 ADDQ $0x03, AX + // emitRepeat emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: MOVL R9, SI LEAL -4(R9), R9 @@ -1049,7 +1045,7 @@ match_nolit_dst_ok_encodeBlockAsm: emit_remainder_encodeBlockAsm: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 5(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBlockAsm MOVQ $0x00000000, ret+48(FP) @@ -1109,6 +1105,8 @@ one_byte_emit_remainder_encodeBlockAsm: memmove_emit_remainder_encodeBlockAsm: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 @@ -1170,14 +1168,16 @@ memmove_end_copy_emit_remainder_encodeBlockAsm: JMP emit_literal_done_emit_remainder_encodeBlockAsm memmove_long_emit_remainder_encodeBlockAsm: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -1190,23 +1190,11 @@ memmove_long_emit_remainder_encodeBlockAsm: 
emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back @@ -1230,15 +1218,15 @@ emit_literal_done_emit_remainder_encodeBlockAsm: MOVQ AX, ret+48(FP) RET -// func encodeBlockAsm12B(dst []byte, src []byte) int +// func encodeBlockAsm4MB(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeBlockAsm12B(SB), $16408-56 +TEXT ·encodeBlockAsm4MB(SB), $65560-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000080, CX + MOVQ $0x00000200, CX LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeBlockAsm12B: +zero_loop_encodeBlockAsm4MB: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -1249,7 +1237,7 @@ zero_loop_encodeBlockAsm12B: MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX - JNZ zero_loop_encodeBlockAsm12B + JNZ zero_loop_encodeBlockAsm4MB MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -1263,25 +1251,25 @@ zero_loop_encodeBlockAsm12B: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX -search_loop_encodeBlockAsm12B: +search_loop_encodeBlockAsm4MB: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP - SHRL $0x05, BP + SHRL $0x06, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeBlockAsm12B + JGE emit_remainder_encodeBlockAsm4MB MOVL BP, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, R9 MOVQ SI, R10 SHRQ $0x08, R10 - SHLQ $0x18, R9 + SHLQ $0x10, R9 IMULQ R8, R9 - SHRQ $0x34, R9 - SHLQ $0x18, R10 + SHRQ $0x32, R9 + SHLQ $0x10, R10 IMULQ R8, R10 - SHRQ $0x34, R10 + SHRQ $0x32, R10 MOVL 24(SP)(R9*4), BP MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) 
@@ -1289,113 +1277,126 @@ search_loop_encodeBlockAsm12B: MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 SHRQ $0x10, R9 - SHLQ $0x18, R9 + SHLQ $0x10, R9 IMULQ R8, R9 - SHRQ $0x34, R9 + SHRQ $0x32, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 - JNE no_repeat_found_encodeBlockAsm12B + JNE no_repeat_found_encodeBlockAsm4MB LEAL 1(CX), SI MOVL 12(SP), DI MOVL SI, BP SUBL 16(SP), BP - JZ repeat_extend_back_end_encodeBlockAsm12B + JZ repeat_extend_back_end_encodeBlockAsm4MB -repeat_extend_back_loop_encodeBlockAsm12B: +repeat_extend_back_loop_encodeBlockAsm4MB: CMPL SI, DI - JLE repeat_extend_back_end_encodeBlockAsm12B + JLE repeat_extend_back_end_encodeBlockAsm4MB MOVB -1(DX)(BP*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE repeat_extend_back_end_encodeBlockAsm12B + JNE repeat_extend_back_end_encodeBlockAsm4MB LEAL -1(SI), SI DECL BP - JNZ repeat_extend_back_loop_encodeBlockAsm12B + JNZ repeat_extend_back_loop_encodeBlockAsm4MB -repeat_extend_back_end_encodeBlockAsm12B: +repeat_extend_back_end_encodeBlockAsm4MB: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B + JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 SUBL BP, R8 LEAL -1(R8), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm12B + JLT one_byte_repeat_emit_encodeBlockAsm4MB CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm12B + JLT two_bytes_repeat_emit_encodeBlockAsm4MB + CMPL BP, $0x00010000 + JLT three_bytes_repeat_emit_encodeBlockAsm4MB + MOVL BP, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW BP, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBlockAsm4MB + +three_bytes_repeat_emit_encodeBlockAsm4MB: MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm12B + JMP memmove_long_repeat_emit_encodeBlockAsm4MB -two_bytes_repeat_emit_encodeBlockAsm12B: +two_bytes_repeat_emit_encodeBlockAsm4MB: MOVB $0xf0, 
(AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeBlockAsm12B - JMP memmove_long_repeat_emit_encodeBlockAsm12B + JL memmove_repeat_emit_encodeBlockAsm4MB + JMP memmove_long_repeat_emit_encodeBlockAsm4MB -one_byte_repeat_emit_encodeBlockAsm12B: +one_byte_repeat_emit_encodeBlockAsm4MB: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeBlockAsm12B: +memmove_repeat_emit_encodeBlockAsm4MB: LEAQ (AX)(R8*1), BP + + // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4through7 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2: +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_1or2: MOVB (R9), R10 MOVB -1(R9)(R8*1), R9 MOVB R10, (AX) MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3: +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_3: MOVW (R9), R10 MOVB 2(R9), R9 MOVW R10, (AX) MOVB R9, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + JMP 
memmove_end_copy_repeat_emit_encodeBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4through7: +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 @@ -1405,65 +1406,55 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_repeat_emit_encodeBlockAsm12B: +memmove_end_copy_repeat_emit_encodeBlockAsm4MB: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB -memmove_long_repeat_emit_encodeBlockAsm12B: - LEAQ (AX)(R8*1), BP +memmove_long_repeat_emit_encodeBlockAsm4MB: + LEAQ (AX)(R8*1), BP + + // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 - SHRQ $0x07, R11 + SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 - JA 
emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(R9)(R12*1), R10 LEAQ -32(AX)(R12*1), R13 -emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: +emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 - MOVOU 32(R10), X6 - MOVOU 48(R10), X7 - MOVOU 64(R10), X8 - MOVOU 80(R10), X9 - MOVOU 96(R10), X10 - MOVOU 112(R10), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R10 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back -emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: +emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(R9)(R12*1), X4 MOVOU -16(R9)(R12*1), X5 MOVOA X4, -32(AX)(R12*1) MOVOA X5, -16(AX)(R12*1) ADDQ $0x20, R12 CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeBlockAsm12B: +emit_literal_done_repeat_emit_encodeBlockAsm4MB: ADDL $0x05, CX MOVL CX, BP SUBL 16(SP), BP @@ -1471,78 +1462,94 @@ emit_literal_done_repeat_emit_encodeBlockAsm12B: SUBL CX, R8 LEAQ (DX)(CX*1), R9 LEAQ (DX)(BP*1), BP + + // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm12B + JL matchlen_single_repeat_extend_encodeBlockAsm4MB -matchlen_loopback_repeat_extend_encodeBlockAsm12B: 
+matchlen_loopback_repeat_extend_encodeBlockAsm4MB: MOVQ (R9)(R11*1), R10 XORQ (BP)(R11*1), R10 TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBlockAsm12B + JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB BSFQ R10, R10 SARQ $0x03, R10 LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm12B + JMP repeat_extend_forward_end_encodeBlockAsm4MB -matchlen_loop_repeat_extend_encodeBlockAsm12B: +matchlen_loop_repeat_extend_encodeBlockAsm4MB: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B + JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB -matchlen_single_repeat_extend_encodeBlockAsm12B: +matchlen_single_repeat_extend_encodeBlockAsm4MB: TESTL R8, R8 - JZ repeat_extend_forward_end_encodeBlockAsm12B + JZ repeat_extend_forward_end_encodeBlockAsm4MB -matchlen_single_loopback_repeat_extend_encodeBlockAsm12B: +matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB: MOVB (R9)(R11*1), R10 CMPB (BP)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm12B + JNE repeat_extend_forward_end_encodeBlockAsm4MB LEAL 1(R11), R11 DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB -repeat_extend_forward_end_encodeBlockAsm12B: +repeat_extend_forward_end_encodeBlockAsm4MB: ADDL R11, CX MOVL CX, BP SUBL SI, BP MOVL 16(SP), SI TESTL DI, DI - JZ repeat_as_copy_encodeBlockAsm12B - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm12B - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm12B + JZ repeat_as_copy_encodeBlockAsm4MB -cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm4MB + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB + CMPL SI, 
$0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB + +cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: CMPL BP, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm12B + JLT repeat_three_match_repeat_encodeBlockAsm4MB + CMPL BP, $0x00010100 + JLT repeat_four_match_repeat_encodeBlockAsm4MB + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_match_repeat_encodeBlockAsm4MB: LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -repeat_three_match_repeat_encodeBlockAsm12B: +repeat_three_match_repeat_encodeBlockAsm4MB: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -repeat_two_match_repeat_encodeBlockAsm12B: +repeat_two_match_repeat_encodeBlockAsm4MB: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -repeat_two_offset_match_repeat_encodeBlockAsm12B: +repeat_two_offset_match_repeat_encodeBlockAsm4MB: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -1551,49 +1558,144 @@ repeat_two_offset_match_repeat_encodeBlockAsm12B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -repeat_as_copy_encodeBlockAsm12B: -two_byte_offset_repeat_as_copy_encodeBlockAsm12B: +repeat_as_copy_encodeBlockAsm4MB: + // emitCopy + CMPL SI, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB + +four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB: CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B + JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(BP), BP + ADDQ $0x05, AX + CMPL BP, $0x04 + JL 
four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB + + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL BP, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB + +four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: + TESTL BP, BP + JZ repeat_end_emit_encodeBlockAsm4MB + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ 
$0x03, AX + + // emitRepeat MOVL BP, DI LEAL -4(BP), BP CMPL DI, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short CMPL DI, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short CMPL SI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: CMPL BP, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL BP, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: XORQ DI, DI 
LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -1602,14 +1704,14 @@ repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB -two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: +two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: CMPL BP, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB CMPL SI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -1618,150 +1720,163 @@ two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -emit_copy_three_repeat_as_copy_encodeBlockAsm12B: +emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeBlockAsm12B: +repeat_end_emit_encodeBlockAsm4MB: MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm12B + JMP search_loop_encodeBlockAsm4MB -no_repeat_found_encodeBlockAsm12B: +no_repeat_found_encodeBlockAsm4MB: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBlockAsm12B + JEQ candidate_match_encodeBlockAsm4MB SHRQ $0x08, SI MOVL 24(SP)(R9*4), BP LEAL 2(CX), R8 CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeBlockAsm12B + JEQ candidate2_match_encodeBlockAsm4MB MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BP*1), SI - JEQ candidate3_match_encodeBlockAsm12B + JEQ candidate3_match_encodeBlockAsm4MB MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm12B + JMP search_loop_encodeBlockAsm4MB -candidate3_match_encodeBlockAsm12B: +candidate3_match_encodeBlockAsm4MB: ADDL $0x02, CX - JMP 
candidate_match_encodeBlockAsm12B + JMP candidate_match_encodeBlockAsm4MB -candidate2_match_encodeBlockAsm12B: +candidate2_match_encodeBlockAsm4MB: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BP -candidate_match_encodeBlockAsm12B: +candidate_match_encodeBlockAsm4MB: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeBlockAsm12B + JZ match_extend_back_end_encodeBlockAsm4MB -match_extend_back_loop_encodeBlockAsm12B: +match_extend_back_loop_encodeBlockAsm4MB: CMPL CX, SI - JLE match_extend_back_end_encodeBlockAsm12B + JLE match_extend_back_end_encodeBlockAsm4MB MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeBlockAsm12B + JNE match_extend_back_end_encodeBlockAsm4MB LEAL -1(CX), CX DECL BP - JZ match_extend_back_end_encodeBlockAsm12B - JMP match_extend_back_loop_encodeBlockAsm12B + JZ match_extend_back_end_encodeBlockAsm4MB + JMP match_extend_back_loop_encodeBlockAsm4MB -match_extend_back_end_encodeBlockAsm12B: +match_extend_back_end_encodeBlockAsm4MB: MOVL CX, SI SUBL 12(SP), SI LEAQ 4(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeBlockAsm12B + JL match_dst_size_check_encodeBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET -match_dst_size_check_encodeBlockAsm12B: +match_dst_size_check_encodeBlockAsm4MB: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBlockAsm12B + JEQ emit_literal_done_match_emit_encodeBlockAsm4MB MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c - JLT one_byte_match_emit_encodeBlockAsm12B + JLT one_byte_match_emit_encodeBlockAsm4MB CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm12B + JLT two_bytes_match_emit_encodeBlockAsm4MB + CMPL DI, $0x00010000 + JLT three_bytes_match_emit_encodeBlockAsm4MB + MOVL DI, R9 + SHRL $0x10, R9 + MOVB $0xf8, (AX) + MOVW DI, 1(AX) + MOVB R9, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBlockAsm4MB + +three_bytes_match_emit_encodeBlockAsm4MB: MOVB $0xf4, (AX) MOVW 
DI, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm12B + JMP memmove_long_match_emit_encodeBlockAsm4MB -two_bytes_match_emit_encodeBlockAsm12B: +two_bytes_match_emit_encodeBlockAsm4MB: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 - JL memmove_match_emit_encodeBlockAsm12B - JMP memmove_long_match_emit_encodeBlockAsm12B + JL memmove_match_emit_encodeBlockAsm4MB + JMP memmove_long_match_emit_encodeBlockAsm4MB -one_byte_match_emit_encodeBlockAsm12B: +one_byte_match_emit_encodeBlockAsm4MB: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX -memmove_match_emit_encodeBlockAsm12B: +memmove_match_emit_encodeBlockAsm4MB: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3 + JB emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4through7 + JB emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2: +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_1or2: MOVB (SI), R9 MOVB -1(SI)(R8*1), SI MOVB R9, (AX) MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3: 
+emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_3: MOVW (SI), R9 MOVB 2(SI), SI MOVW R9, (AX) MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4through7: +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_4through7: MOVL (SI), R9 MOVL -4(SI)(R8*1), SI MOVL R9, (AX) MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -1771,66 +1886,56 @@ emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_match_emit_encodeBlockAsm12B: +memmove_end_copy_match_emit_encodeBlockAsm4MB: MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBlockAsm12B + JMP emit_literal_done_match_emit_encodeBlockAsm4MB -memmove_long_match_emit_encodeBlockAsm12B: - LEAQ (AX)(R8*1), DI +memmove_long_match_emit_encodeBlockAsm4MB: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - 
SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back -emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: +emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX -emit_literal_done_match_emit_encodeBlockAsm12B: -match_nolit_loop_encodeBlockAsm12B: +emit_literal_done_match_emit_encodeBlockAsm4MB: +match_nolit_loop_encodeBlockAsm4MB: MOVL CX, SI SUBL BP, SI MOVL SI, 16(SP) @@ -1840,84 +1945,181 @@ match_nolit_loop_encodeBlockAsm12B: SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BP*1), BP + + // matchLen XORL R9, R9 CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm12B 
+ JL matchlen_single_match_nolit_encodeBlockAsm4MB -matchlen_loopback_match_nolit_encodeBlockAsm12B: +matchlen_loopback_match_nolit_encodeBlockAsm4MB: MOVQ (DI)(R9*1), R8 XORQ (BP)(R9*1), R8 TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeBlockAsm12B + JZ matchlen_loop_match_nolit_encodeBlockAsm4MB BSFQ R8, R8 SARQ $0x03, R8 LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeBlockAsm12B + JMP match_nolit_end_encodeBlockAsm4MB -matchlen_loop_match_nolit_encodeBlockAsm12B: +matchlen_loop_match_nolit_encodeBlockAsm4MB: LEAL -8(SI), SI LEAL 8(R9), R9 CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm12B + JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB -matchlen_single_match_nolit_encodeBlockAsm12B: +matchlen_single_match_nolit_encodeBlockAsm4MB: TESTL SI, SI - JZ match_nolit_end_encodeBlockAsm12B + JZ match_nolit_end_encodeBlockAsm4MB -matchlen_single_loopback_match_nolit_encodeBlockAsm12B: +matchlen_single_loopback_match_nolit_encodeBlockAsm4MB: MOVB (DI)(R9*1), R8 CMPB (BP)(R9*1), R8 - JNE match_nolit_end_encodeBlockAsm12B + JNE match_nolit_end_encodeBlockAsm4MB LEAL 1(R9), R9 DECL SI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm4MB -match_nolit_end_encodeBlockAsm12B: +match_nolit_end_encodeBlockAsm4MB: ADDL R9, CX MOVL 16(SP), BP ADDL $0x04, R9 MOVL CX, 12(SP) -two_byte_offset_match_nolit_encodeBlockAsm12B: + // emitCopy + CMPL BP, $0x00010000 + JL two_byte_offset_match_nolit_encodeBlockAsm4MB + +four_bytes_loop_back_match_nolit_encodeBlockAsm4MB: CMPL R9, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B + JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB + MOVB $0xff, (AX) + MOVL BP, 1(AX) + LEAL -64(R9), R9 + ADDQ $0x05, AX + CMPL R9, $0x04 + JL four_bytes_remain_match_nolit_encodeBlockAsm4MB + + // emitRepeat + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL SI, $0x0c + JGE 
cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: + CMPL R9, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL R9, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy + LEAL -65536(R9), R9 + MOVL R9, BP + MOVW $0x001d, (AX) + MOVW R9, 2(AX) + SARL $0x10, BP + MOVB BP, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB + +four_bytes_remain_match_nolit_encodeBlockAsm4MB: + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBlockAsm4MB + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) + MOVL BP, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +two_byte_offset_match_nolit_encodeBlockAsm4MB: + CMPL R9, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB MOVB $0xee, (AX) MOVW BP, 1(AX) LEAL -60(R9), R9 ADDQ $0x03, AX + + // emitRepeat MOVL R9, SI LEAL -4(R9), R9 CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short + JLE 
repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short CMPL BP, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short -cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: CMPL R9, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short + JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL R9, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short + LEAL -65536(R9), R9 + MOVL R9, BP + MOVW $0x001d, (AX) + MOVW R9, 2(AX) + SARL $0x10, BP + MOVB BP, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: LEAL -256(R9), R9 MOVW $0x0019, (AX) MOVW R9, 2(AX) ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB -repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: LEAL -4(R9), R9 MOVW $0x0015, (AX) MOVB R9, 2(AX) ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB -repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: SHLL $0x02, R9 ORL $0x01, R9 MOVW R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB -repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: XORQ SI, SI LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) @@ -1926,14 +2128,14 @@ repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: ORL BP, 
R9 MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - JMP two_byte_offset_match_nolit_encodeBlockAsm12B + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + JMP two_byte_offset_match_nolit_encodeBlockAsm4MB -two_byte_offset_short_match_nolit_encodeBlockAsm12B: +two_byte_offset_short_match_nolit_encodeBlockAsm4MB: CMPL R9, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm12B + JGE emit_copy_three_match_nolit_encodeBlockAsm4MB CMPL BP, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBlockAsm12B + JGE emit_copy_three_match_nolit_encodeBlockAsm4MB MOVB $0x01, BL LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) @@ -1942,136 +2144,149 @@ two_byte_offset_short_match_nolit_encodeBlockAsm12B: ORL BP, R9 MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB -emit_copy_three_match_nolit_encodeBlockAsm12B: +emit_copy_three_match_nolit_encodeBlockAsm4MB: MOVB $0x02, BL LEAL -4(BX)(R9*4), R9 MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeBlockAsm12B: +match_nolit_emitcopy_end_encodeBlockAsm4MB: CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm12B + JGE emit_remainder_encodeBlockAsm4MB MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm12B + JL match_nolit_dst_ok_encodeBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeBlockAsm12B: - MOVQ $0x000000cf1bbcdcbb, R8 +match_nolit_dst_ok_encodeBlockAsm4MB: + MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, DI SHRQ $0x10, SI MOVQ SI, BP - SHLQ $0x18, DI + SHLQ $0x10, DI IMULQ R8, DI - SHRQ $0x34, DI - SHLQ $0x18, BP + SHRQ $0x32, DI + SHLQ $0x10, BP IMULQ R8, BP - SHRQ $0x34, BP + SHRQ $0x32, BP LEAL -2(CX), R8 LEAQ 24(SP)(BP*4), R9 MOVL (R9), BP MOVL R8, 24(SP)(DI*4) MOVL CX, (R9) CMPL (DX)(BP*1), SI - JEQ match_nolit_loop_encodeBlockAsm12B + JEQ match_nolit_loop_encodeBlockAsm4MB INCL CX - JMP search_loop_encodeBlockAsm12B + JMP search_loop_encodeBlockAsm4MB 
-emit_remainder_encodeBlockAsm12B: +emit_remainder_encodeBlockAsm4MB: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 4(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm12B + JL emit_remainder_ok_encodeBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET -emit_remainder_ok_encodeBlockAsm12B: +emit_remainder_ok_encodeBlockAsm4MB: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B + JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm12B + JLT one_byte_emit_remainder_encodeBlockAsm4MB CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm12B + JLT two_bytes_emit_remainder_encodeBlockAsm4MB + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBlockAsm4MB + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + +three_bytes_emit_remainder_encodeBlockAsm4MB: MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm12B + JMP memmove_long_emit_remainder_encodeBlockAsm4MB -two_bytes_emit_remainder_encodeBlockAsm12B: +two_bytes_emit_remainder_encodeBlockAsm4MB: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm12B - JMP memmove_long_emit_remainder_encodeBlockAsm12B + JL memmove_emit_remainder_encodeBlockAsm4MB + JMP memmove_long_emit_remainder_encodeBlockAsm4MB -one_byte_emit_remainder_encodeBlockAsm12B: +one_byte_emit_remainder_encodeBlockAsm4MB: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeBlockAsm12B: +memmove_emit_remainder_encodeBlockAsm4MB: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 - JE 
emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: 
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -2081,80 +2296,70 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeBlockAsm12B: +memmove_end_copy_emit_remainder_encodeBlockAsm4MB: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB -memmove_long_emit_remainder_encodeBlockAsm12B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX +memmove_long_emit_remainder_encodeBlockAsm4MB: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), 
X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeBlockAsm12B: +emit_literal_done_emit_remainder_encodeBlockAsm4MB: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET -// func encodeBlockAsm10B(dst []byte, src []byte) int +// func encodeBlockAsm12B(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeBlockAsm10B(SB), $4120-56 +TEXT ·encodeBlockAsm12B(SB), $16408-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000020, CX + MOVQ $0x00000080, CX LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeBlockAsm10B: +zero_loop_encodeBlockAsm12B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -2165,7 +2370,7 @@ zero_loop_encodeBlockAsm10B: MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX - JNZ zero_loop_encodeBlockAsm10B + JNZ zero_loop_encodeBlockAsm12B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -2179,25 +2384,25 @@ zero_loop_encodeBlockAsm10B: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX 
-search_loop_encodeBlockAsm10B: +search_loop_encodeBlockAsm12B: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x05, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeBlockAsm10B + JGE emit_remainder_encodeBlockAsm12B MOVL BP, 20(SP) - MOVQ $0x9e3779b1, R8 + MOVQ $0x000000cf1bbcdcbb, R8 MOVQ SI, R9 MOVQ SI, R10 SHRQ $0x08, R10 - SHLQ $0x20, R9 + SHLQ $0x18, R9 IMULQ R8, R9 - SHRQ $0x36, R9 - SHLQ $0x20, R10 + SHRQ $0x34, R9 + SHLQ $0x18, R10 IMULQ R8, R10 - SHRQ $0x36, R10 + SHRQ $0x34, R10 MOVL 24(SP)(R9*4), BP MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) @@ -2205,113 +2410,115 @@ search_loop_encodeBlockAsm10B: MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 SHRQ $0x10, R9 - SHLQ $0x20, R9 + SHLQ $0x18, R9 IMULQ R8, R9 - SHRQ $0x36, R9 + SHRQ $0x34, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 - JNE no_repeat_found_encodeBlockAsm10B + JNE no_repeat_found_encodeBlockAsm12B LEAL 1(CX), SI MOVL 12(SP), DI MOVL SI, BP SUBL 16(SP), BP - JZ repeat_extend_back_end_encodeBlockAsm10B + JZ repeat_extend_back_end_encodeBlockAsm12B -repeat_extend_back_loop_encodeBlockAsm10B: +repeat_extend_back_loop_encodeBlockAsm12B: CMPL SI, DI - JLE repeat_extend_back_end_encodeBlockAsm10B + JLE repeat_extend_back_end_encodeBlockAsm12B MOVB -1(DX)(BP*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE repeat_extend_back_end_encodeBlockAsm10B + JNE repeat_extend_back_end_encodeBlockAsm12B LEAL -1(SI), SI DECL BP - JNZ repeat_extend_back_loop_encodeBlockAsm10B + JNZ repeat_extend_back_loop_encodeBlockAsm12B -repeat_extend_back_end_encodeBlockAsm10B: +repeat_extend_back_end_encodeBlockAsm12B: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B + JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 SUBL BP, R8 LEAL -1(R8), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm10B + JLT one_byte_repeat_emit_encodeBlockAsm12B CMPL BP, $0x00000100 - JLT 
two_bytes_repeat_emit_encodeBlockAsm10B + JLT two_bytes_repeat_emit_encodeBlockAsm12B MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm10B + JMP memmove_long_repeat_emit_encodeBlockAsm12B -two_bytes_repeat_emit_encodeBlockAsm10B: +two_bytes_repeat_emit_encodeBlockAsm12B: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeBlockAsm10B - JMP memmove_long_repeat_emit_encodeBlockAsm10B - -one_byte_repeat_emit_encodeBlockAsm10B: + JL memmove_repeat_emit_encodeBlockAsm12B + JMP memmove_long_repeat_emit_encodeBlockAsm12B + +one_byte_repeat_emit_encodeBlockAsm12B: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeBlockAsm10B: +memmove_repeat_emit_encodeBlockAsm12B: LEAQ (AX)(R8*1), BP + + // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4through7 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2: +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2: MOVB (R9), R10 MOVB -1(R9)(R8*1), R9 MOVB R10, (AX) MOVB R9, -1(AX)(R8*1) - JMP 
memmove_end_copy_repeat_emit_encodeBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3: +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3: MOVW (R9), R10 MOVB 2(R9), R9 MOVW R10, (AX) MOVB R9, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4through7: +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 @@ -2321,65 +2528,55 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_repeat_emit_encodeBlockAsm10B: +memmove_end_copy_repeat_emit_encodeBlockAsm12B: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm10B + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B -memmove_long_repeat_emit_encodeBlockAsm10B: - LEAQ 
(AX)(R8*1), BP +memmove_long_repeat_emit_encodeBlockAsm12B: + LEAQ (AX)(R8*1), BP + + // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 - SHRQ $0x07, R11 + SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(R9)(R12*1), R10 LEAQ -32(AX)(R12*1), R13 -emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: +emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 - MOVOU 32(R10), X6 - MOVOU 48(R10), X7 - MOVOU 64(R10), X8 - MOVOU 80(R10), X9 - MOVOU 96(R10), X10 - MOVOU 112(R10), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R10 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back -emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: +emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(R9)(R12*1), X4 MOVOU -16(R9)(R12*1), X5 MOVOA X4, -32(AX)(R12*1) MOVOA X5, -16(AX)(R12*1) ADDQ $0x20, R12 CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeBlockAsm10B: +emit_literal_done_repeat_emit_encodeBlockAsm12B: ADDL $0x05, CX MOVL CX, BP SUBL 16(SP), BP @@ -2387,78 +2584,82 @@ 
emit_literal_done_repeat_emit_encodeBlockAsm10B: SUBL CX, R8 LEAQ (DX)(CX*1), R9 LEAQ (DX)(BP*1), BP + + // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm10B + JL matchlen_single_repeat_extend_encodeBlockAsm12B -matchlen_loopback_repeat_extend_encodeBlockAsm10B: +matchlen_loopback_repeat_extend_encodeBlockAsm12B: MOVQ (R9)(R11*1), R10 XORQ (BP)(R11*1), R10 TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBlockAsm10B + JZ matchlen_loop_repeat_extend_encodeBlockAsm12B BSFQ R10, R10 SARQ $0x03, R10 LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm10B + JMP repeat_extend_forward_end_encodeBlockAsm12B -matchlen_loop_repeat_extend_encodeBlockAsm10B: +matchlen_loop_repeat_extend_encodeBlockAsm12B: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B + JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B -matchlen_single_repeat_extend_encodeBlockAsm10B: +matchlen_single_repeat_extend_encodeBlockAsm12B: TESTL R8, R8 - JZ repeat_extend_forward_end_encodeBlockAsm10B + JZ repeat_extend_forward_end_encodeBlockAsm12B -matchlen_single_loopback_repeat_extend_encodeBlockAsm10B: +matchlen_single_loopback_repeat_extend_encodeBlockAsm12B: MOVB (R9)(R11*1), R10 CMPB (BP)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm10B + JNE repeat_extend_forward_end_encodeBlockAsm12B LEAL 1(R11), R11 DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm10B + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B -repeat_extend_forward_end_encodeBlockAsm10B: +repeat_extend_forward_end_encodeBlockAsm12B: ADDL R11, CX MOVL CX, BP SUBL SI, BP MOVL 16(SP), SI TESTL DI, DI - JZ repeat_as_copy_encodeBlockAsm10B - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm10B - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm10B + 
JZ repeat_as_copy_encodeBlockAsm12B -cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm12B + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm12B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: CMPL BP, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm10B + JLT repeat_three_match_repeat_encodeBlockAsm12B LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_three_match_repeat_encodeBlockAsm10B: +repeat_three_match_repeat_encodeBlockAsm12B: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_two_match_repeat_encodeBlockAsm10B: +repeat_two_match_repeat_encodeBlockAsm12B: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_two_offset_match_repeat_encodeBlockAsm10B: +repeat_two_offset_match_repeat_encodeBlockAsm12B: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -2467,49 +2668,52 @@ repeat_two_offset_match_repeat_encodeBlockAsm10B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_as_copy_encodeBlockAsm10B: -two_byte_offset_repeat_as_copy_encodeBlockAsm10B: +repeat_as_copy_encodeBlockAsm12B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBlockAsm12B: CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX + + // emitRepeat MOVL BP, DI LEAL -4(BP), BP CMPL DI, $0x08 - JLE 
repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short CMPL DI, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short CMPL SI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: CMPL BP, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: +repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: +repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: +repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -2518,14 +2722,14 @@ repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B 
-two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: +two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: CMPL BP, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B CMPL SI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -2534,150 +2738,152 @@ two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -emit_copy_three_repeat_as_copy_encodeBlockAsm10B: +emit_copy_three_repeat_as_copy_encodeBlockAsm12B: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeBlockAsm10B: +repeat_end_emit_encodeBlockAsm12B: MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm10B + JMP search_loop_encodeBlockAsm12B -no_repeat_found_encodeBlockAsm10B: +no_repeat_found_encodeBlockAsm12B: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBlockAsm10B + JEQ candidate_match_encodeBlockAsm12B SHRQ $0x08, SI MOVL 24(SP)(R9*4), BP LEAL 2(CX), R8 CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeBlockAsm10B + JEQ candidate2_match_encodeBlockAsm12B MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BP*1), SI - JEQ candidate3_match_encodeBlockAsm10B + JEQ candidate3_match_encodeBlockAsm12B MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm10B + JMP search_loop_encodeBlockAsm12B -candidate3_match_encodeBlockAsm10B: +candidate3_match_encodeBlockAsm12B: ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm10B + JMP candidate_match_encodeBlockAsm12B -candidate2_match_encodeBlockAsm10B: +candidate2_match_encodeBlockAsm12B: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BP -candidate_match_encodeBlockAsm10B: +candidate_match_encodeBlockAsm12B: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeBlockAsm10B + JZ match_extend_back_end_encodeBlockAsm12B 
-match_extend_back_loop_encodeBlockAsm10B: +match_extend_back_loop_encodeBlockAsm12B: CMPL CX, SI - JLE match_extend_back_end_encodeBlockAsm10B + JLE match_extend_back_end_encodeBlockAsm12B MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeBlockAsm10B + JNE match_extend_back_end_encodeBlockAsm12B LEAL -1(CX), CX DECL BP - JZ match_extend_back_end_encodeBlockAsm10B - JMP match_extend_back_loop_encodeBlockAsm10B + JZ match_extend_back_end_encodeBlockAsm12B + JMP match_extend_back_loop_encodeBlockAsm12B -match_extend_back_end_encodeBlockAsm10B: +match_extend_back_end_encodeBlockAsm12B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeBlockAsm10B + JL match_dst_size_check_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET -match_dst_size_check_encodeBlockAsm10B: +match_dst_size_check_encodeBlockAsm12B: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBlockAsm10B + JEQ emit_literal_done_match_emit_encodeBlockAsm12B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c - JLT one_byte_match_emit_encodeBlockAsm10B + JLT one_byte_match_emit_encodeBlockAsm12B CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm10B + JLT two_bytes_match_emit_encodeBlockAsm12B MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm10B + JMP memmove_long_match_emit_encodeBlockAsm12B -two_bytes_match_emit_encodeBlockAsm10B: +two_bytes_match_emit_encodeBlockAsm12B: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 - JL memmove_match_emit_encodeBlockAsm10B - JMP memmove_long_match_emit_encodeBlockAsm10B + JL memmove_match_emit_encodeBlockAsm12B + JMP memmove_long_match_emit_encodeBlockAsm12B -one_byte_match_emit_encodeBlockAsm10B: +one_byte_match_emit_encodeBlockAsm12B: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX -memmove_match_emit_encodeBlockAsm10B: 
+memmove_match_emit_encodeBlockAsm12B: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3 + JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_4through7 + JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2: +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2: MOVB (SI), R9 MOVB -1(SI)(R8*1), SI MOVB R9, (AX) MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B + JMP memmove_end_copy_match_emit_encodeBlockAsm12B -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3: +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3: MOVW (SI), R9 MOVB 2(SI), SI MOVW R9, (AX) MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B + JMP memmove_end_copy_match_emit_encodeBlockAsm12B -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_4through7: +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4through7: MOVL (SI), R9 MOVL -4(SI)(R8*1), SI MOVL R9, (AX) MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B + JMP memmove_end_copy_match_emit_encodeBlockAsm12B 
-emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B + JMP memmove_end_copy_match_emit_encodeBlockAsm12B -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B + JMP memmove_end_copy_match_emit_encodeBlockAsm12B -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -2687,66 +2893,56 @@ emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_match_emit_encodeBlockAsm10B: +memmove_end_copy_match_emit_encodeBlockAsm12B: MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBlockAsm10B + JMP emit_literal_done_match_emit_encodeBlockAsm12B -memmove_long_match_emit_encodeBlockAsm10B: - LEAQ (AX)(R8*1), DI +memmove_long_match_emit_encodeBlockAsm12B: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - 
MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back -emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: +emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX -emit_literal_done_match_emit_encodeBlockAsm10B: -match_nolit_loop_encodeBlockAsm10B: +emit_literal_done_match_emit_encodeBlockAsm12B: +match_nolit_loop_encodeBlockAsm12B: MOVL CX, SI SUBL BP, SI MOVL SI, 16(SP) @@ -2756,84 +2952,89 @@ match_nolit_loop_encodeBlockAsm10B: SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BP*1), BP + + // matchLen XORL R9, R9 CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm10B + JL matchlen_single_match_nolit_encodeBlockAsm12B -matchlen_loopback_match_nolit_encodeBlockAsm10B: +matchlen_loopback_match_nolit_encodeBlockAsm12B: MOVQ (DI)(R9*1), R8 XORQ (BP)(R9*1), R8 TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeBlockAsm10B + JZ matchlen_loop_match_nolit_encodeBlockAsm12B BSFQ R8, R8 SARQ $0x03, R8 LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeBlockAsm10B + JMP match_nolit_end_encodeBlockAsm12B -matchlen_loop_match_nolit_encodeBlockAsm10B: +matchlen_loop_match_nolit_encodeBlockAsm12B: LEAL -8(SI), SI 
LEAL 8(R9), R9 CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm10B + JGE matchlen_loopback_match_nolit_encodeBlockAsm12B -matchlen_single_match_nolit_encodeBlockAsm10B: +matchlen_single_match_nolit_encodeBlockAsm12B: TESTL SI, SI - JZ match_nolit_end_encodeBlockAsm10B + JZ match_nolit_end_encodeBlockAsm12B -matchlen_single_loopback_match_nolit_encodeBlockAsm10B: +matchlen_single_loopback_match_nolit_encodeBlockAsm12B: MOVB (DI)(R9*1), R8 CMPB (BP)(R9*1), R8 - JNE match_nolit_end_encodeBlockAsm10B + JNE match_nolit_end_encodeBlockAsm12B LEAL 1(R9), R9 DECL SI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B -match_nolit_end_encodeBlockAsm10B: +match_nolit_end_encodeBlockAsm12B: ADDL R9, CX MOVL 16(SP), BP ADDL $0x04, R9 MOVL CX, 12(SP) -two_byte_offset_match_nolit_encodeBlockAsm10B: + // emitCopy +two_byte_offset_match_nolit_encodeBlockAsm12B: CMPL R9, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B + JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B MOVB $0xee, (AX) MOVW BP, 1(AX) LEAL -60(R9), R9 ADDQ $0x03, AX + + // emitRepeat MOVL R9, SI LEAL -4(R9), R9 CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short + JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short CMPL BP, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short -cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: +cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: CMPL R9, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short + JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short LEAL -256(R9), R9 MOVW $0x0019, (AX) MOVW R9, 2(AX) 
ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B + JMP match_nolit_emitcopy_end_encodeBlockAsm12B -repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: +repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: LEAL -4(R9), R9 MOVW $0x0015, (AX) MOVB R9, 2(AX) ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B + JMP match_nolit_emitcopy_end_encodeBlockAsm12B -repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: +repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: SHLL $0x02, R9 ORL $0x01, R9 MOVW R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B + JMP match_nolit_emitcopy_end_encodeBlockAsm12B -repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: +repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: XORQ SI, SI LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) @@ -2842,14 +3043,14 @@ repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: ORL BP, R9 MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - JMP two_byte_offset_match_nolit_encodeBlockAsm10B + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + JMP two_byte_offset_match_nolit_encodeBlockAsm12B -two_byte_offset_short_match_nolit_encodeBlockAsm10B: +two_byte_offset_short_match_nolit_encodeBlockAsm12B: CMPL R9, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm10B + JGE emit_copy_three_match_nolit_encodeBlockAsm12B CMPL BP, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBlockAsm10B + JGE emit_copy_three_match_nolit_encodeBlockAsm12B MOVB $0x01, BL LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) @@ -2858,136 +3059,138 @@ two_byte_offset_short_match_nolit_encodeBlockAsm10B: ORL BP, R9 MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B + JMP match_nolit_emitcopy_end_encodeBlockAsm12B -emit_copy_three_match_nolit_encodeBlockAsm10B: +emit_copy_three_match_nolit_encodeBlockAsm12B: MOVB $0x02, BL LEAL -4(BX)(R9*4), R9 MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX 
-match_nolit_emitcopy_end_encodeBlockAsm10B: +match_nolit_emitcopy_end_encodeBlockAsm12B: CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm10B + JGE emit_remainder_encodeBlockAsm12B MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm10B + JL match_nolit_dst_ok_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeBlockAsm10B: - MOVQ $0x9e3779b1, R8 +match_nolit_dst_ok_encodeBlockAsm12B: + MOVQ $0x000000cf1bbcdcbb, R8 MOVQ SI, DI SHRQ $0x10, SI MOVQ SI, BP - SHLQ $0x20, DI + SHLQ $0x18, DI IMULQ R8, DI - SHRQ $0x36, DI - SHLQ $0x20, BP + SHRQ $0x34, DI + SHLQ $0x18, BP IMULQ R8, BP - SHRQ $0x36, BP + SHRQ $0x34, BP LEAL -2(CX), R8 LEAQ 24(SP)(BP*4), R9 MOVL (R9), BP MOVL R8, 24(SP)(DI*4) MOVL CX, (R9) CMPL (DX)(BP*1), SI - JEQ match_nolit_loop_encodeBlockAsm10B + JEQ match_nolit_loop_encodeBlockAsm12B INCL CX - JMP search_loop_encodeBlockAsm10B + JMP search_loop_encodeBlockAsm12B -emit_remainder_encodeBlockAsm10B: +emit_remainder_encodeBlockAsm12B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm10B + JL emit_remainder_ok_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET -emit_remainder_ok_encodeBlockAsm10B: +emit_remainder_ok_encodeBlockAsm12B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B + JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm10B + JLT one_byte_emit_remainder_encodeBlockAsm12B CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm10B + JLT two_bytes_emit_remainder_encodeBlockAsm12B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm10B + JMP memmove_long_emit_remainder_encodeBlockAsm12B -two_bytes_emit_remainder_encodeBlockAsm10B: 
+two_bytes_emit_remainder_encodeBlockAsm12B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm10B - JMP memmove_long_emit_remainder_encodeBlockAsm10B + JL memmove_emit_remainder_encodeBlockAsm12B + JMP memmove_long_emit_remainder_encodeBlockAsm12B -one_byte_emit_remainder_encodeBlockAsm10B: +one_byte_emit_remainder_encodeBlockAsm12B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeBlockAsm10B: +memmove_emit_remainder_encodeBlockAsm12B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: 
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -2997,80 +3200,70 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeBlockAsm10B: +memmove_end_copy_emit_remainder_encodeBlockAsm12B: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm10B + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B -memmove_long_emit_remainder_encodeBlockAsm10B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX +memmove_long_emit_remainder_encodeBlockAsm12B: + LEAQ (AX)(BP*1), DX + MOVL 
BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeBlockAsm10B: +emit_literal_done_emit_remainder_encodeBlockAsm12B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET -// func encodeBlockAsm8B(dst []byte, src []byte) int +// func encodeBlockAsm10B(dst []byte, src []byte) int // 
Requires: SSE2 -TEXT ·encodeBlockAsm8B(SB), $1048-56 +TEXT ·encodeBlockAsm10B(SB), $4120-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000008, CX + MOVQ $0x00000020, CX LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeBlockAsm8B: +zero_loop_encodeBlockAsm10B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -3081,7 +3274,7 @@ zero_loop_encodeBlockAsm8B: MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX - JNZ zero_loop_encodeBlockAsm8B + JNZ zero_loop_encodeBlockAsm10B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -3095,14 +3288,14 @@ zero_loop_encodeBlockAsm8B: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX -search_loop_encodeBlockAsm8B: +search_loop_encodeBlockAsm10B: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP - SHRL $0x04, BP + SHRL $0x05, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeBlockAsm8B + JGE emit_remainder_encodeBlockAsm10B MOVL BP, 20(SP) MOVQ $0x9e3779b1, R8 MOVQ SI, R9 @@ -3110,10 +3303,10 @@ search_loop_encodeBlockAsm8B: SHRQ $0x08, R10 SHLQ $0x20, R9 IMULQ R8, R9 - SHRQ $0x38, R9 + SHRQ $0x36, R9 SHLQ $0x20, R10 IMULQ R8, R10 - SHRQ $0x38, R10 + SHRQ $0x36, R10 MOVL 24(SP)(R9*4), BP MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) @@ -3123,111 +3316,113 @@ search_loop_encodeBlockAsm8B: SHRQ $0x10, R9 SHLQ $0x20, R9 IMULQ R8, R9 - SHRQ $0x38, R9 + SHRQ $0x36, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 - JNE no_repeat_found_encodeBlockAsm8B + JNE no_repeat_found_encodeBlockAsm10B LEAL 1(CX), SI MOVL 12(SP), DI MOVL SI, BP SUBL 16(SP), BP - JZ repeat_extend_back_end_encodeBlockAsm8B + JZ repeat_extend_back_end_encodeBlockAsm10B -repeat_extend_back_loop_encodeBlockAsm8B: +repeat_extend_back_loop_encodeBlockAsm10B: CMPL SI, DI - JLE repeat_extend_back_end_encodeBlockAsm8B + JLE repeat_extend_back_end_encodeBlockAsm10B MOVB -1(DX)(BP*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE repeat_extend_back_end_encodeBlockAsm8B + JNE repeat_extend_back_end_encodeBlockAsm10B LEAL -1(SI), SI DECL 
BP - JNZ repeat_extend_back_loop_encodeBlockAsm8B + JNZ repeat_extend_back_loop_encodeBlockAsm10B -repeat_extend_back_end_encodeBlockAsm8B: +repeat_extend_back_end_encodeBlockAsm10B: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B + JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 SUBL BP, R8 LEAL -1(R8), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm8B + JLT one_byte_repeat_emit_encodeBlockAsm10B CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm8B + JLT two_bytes_repeat_emit_encodeBlockAsm10B MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm8B + JMP memmove_long_repeat_emit_encodeBlockAsm10B -two_bytes_repeat_emit_encodeBlockAsm8B: +two_bytes_repeat_emit_encodeBlockAsm10B: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeBlockAsm8B - JMP memmove_long_repeat_emit_encodeBlockAsm8B + JL memmove_repeat_emit_encodeBlockAsm10B + JMP memmove_long_repeat_emit_encodeBlockAsm10B -one_byte_repeat_emit_encodeBlockAsm8B: +one_byte_repeat_emit_encodeBlockAsm10B: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeBlockAsm8B: +memmove_repeat_emit_encodeBlockAsm10B: LEAQ (AX)(R8*1), BP + + // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4through7 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 CMPQ R8, $0x20 - JBE 
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2: +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2: MOVB (R9), R10 MOVB -1(R9)(R8*1), R9 MOVB R10, (AX) MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3: +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3: MOVW (R9), R10 MOVB 2(R9), R9 MOVW R10, (AX) MOVB R9, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4through7: +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: 
+emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 @@ -3237,65 +3432,55 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_repeat_emit_encodeBlockAsm8B: +memmove_end_copy_repeat_emit_encodeBlockAsm10B: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm8B + JMP emit_literal_done_repeat_emit_encodeBlockAsm10B -memmove_long_repeat_emit_encodeBlockAsm8B: - LEAQ (AX)(R8*1), BP +memmove_long_repeat_emit_encodeBlockAsm10B: + LEAQ (AX)(R8*1), BP + + // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 - SHRQ $0x07, R11 + SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(R9)(R12*1), R10 LEAQ -32(AX)(R12*1), R13 -emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: +emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 - MOVOU 32(R10), X6 - MOVOU 48(R10), X7 - MOVOU 64(R10), X8 - MOVOU 80(R10), X9 - MOVOU 96(R10), X10 - MOVOU 112(R10), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R10 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back -emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: +emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(R9)(R12*1), X4 MOVOU -16(R9)(R12*1), 
X5 MOVOA X4, -32(AX)(R12*1) MOVOA X5, -16(AX)(R12*1) ADDQ $0x20, R12 CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeBlockAsm8B: +emit_literal_done_repeat_emit_encodeBlockAsm10B: ADDL $0x05, CX MOVL CX, BP SUBL 16(SP), BP @@ -3303,74 +3488,82 @@ emit_literal_done_repeat_emit_encodeBlockAsm8B: SUBL CX, R8 LEAQ (DX)(CX*1), R9 LEAQ (DX)(BP*1), BP + + // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm8B + JL matchlen_single_repeat_extend_encodeBlockAsm10B -matchlen_loopback_repeat_extend_encodeBlockAsm8B: +matchlen_loopback_repeat_extend_encodeBlockAsm10B: MOVQ (R9)(R11*1), R10 XORQ (BP)(R11*1), R10 TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBlockAsm8B + JZ matchlen_loop_repeat_extend_encodeBlockAsm10B BSFQ R10, R10 SARQ $0x03, R10 LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm8B + JMP repeat_extend_forward_end_encodeBlockAsm10B -matchlen_loop_repeat_extend_encodeBlockAsm8B: +matchlen_loop_repeat_extend_encodeBlockAsm10B: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B + JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B -matchlen_single_repeat_extend_encodeBlockAsm8B: +matchlen_single_repeat_extend_encodeBlockAsm10B: TESTL R8, R8 - JZ repeat_extend_forward_end_encodeBlockAsm8B + JZ repeat_extend_forward_end_encodeBlockAsm10B -matchlen_single_loopback_repeat_extend_encodeBlockAsm8B: +matchlen_single_loopback_repeat_extend_encodeBlockAsm10B: MOVB (R9)(R11*1), R10 CMPB (BP)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm8B + JNE repeat_extend_forward_end_encodeBlockAsm10B LEAL 1(R11), R11 DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B + JNZ 
matchlen_single_loopback_repeat_extend_encodeBlockAsm10B -repeat_extend_forward_end_encodeBlockAsm8B: +repeat_extend_forward_end_encodeBlockAsm10B: ADDL R11, CX MOVL CX, BP SUBL SI, BP MOVL 16(SP), SI TESTL DI, DI - JZ repeat_as_copy_encodeBlockAsm8B - MOVL BP, SI - LEAL -4(BP), BP - CMPL SI, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm8B - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B + JZ repeat_as_copy_encodeBlockAsm10B -cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm10B + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm10B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: CMPL BP, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm8B + JLT repeat_three_match_repeat_encodeBlockAsm10B LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B -repeat_three_match_repeat_encodeBlockAsm8B: +repeat_three_match_repeat_encodeBlockAsm10B: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B -repeat_two_match_repeat_encodeBlockAsm8B: +repeat_two_match_repeat_encodeBlockAsm10B: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_match_repeat_encodeBlockAsm10B: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -3379,45 +3572,52 @@ repeat_two_match_repeat_encodeBlockAsm8B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B -repeat_as_copy_encodeBlockAsm8B: -two_byte_offset_repeat_as_copy_encodeBlockAsm8B: +repeat_as_copy_encodeBlockAsm10B: + // emitCopy 
+two_byte_offset_repeat_as_copy_encodeBlockAsm10B: CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX - MOVL BP, SI + + // emitRepeat + MOVL BP, DI LEAL -4(BP), BP - CMPL SI, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: CMPL BP, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B -repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: +repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B -repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: +repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -3426,12 +3626,14 @@ repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP 
repeat_end_emit_encodeBlockAsm8B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B -two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: +two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: CMPL BP, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B + CMPL SI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -3440,150 +3642,152 @@ two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B -emit_copy_three_repeat_as_copy_encodeBlockAsm8B: +emit_copy_three_repeat_as_copy_encodeBlockAsm10B: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeBlockAsm8B: +repeat_end_emit_encodeBlockAsm10B: MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm8B + JMP search_loop_encodeBlockAsm10B -no_repeat_found_encodeBlockAsm8B: +no_repeat_found_encodeBlockAsm10B: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBlockAsm8B + JEQ candidate_match_encodeBlockAsm10B SHRQ $0x08, SI MOVL 24(SP)(R9*4), BP LEAL 2(CX), R8 CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeBlockAsm8B + JEQ candidate2_match_encodeBlockAsm10B MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BP*1), SI - JEQ candidate3_match_encodeBlockAsm8B + JEQ candidate3_match_encodeBlockAsm10B MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm8B + JMP search_loop_encodeBlockAsm10B -candidate3_match_encodeBlockAsm8B: +candidate3_match_encodeBlockAsm10B: ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm8B + JMP candidate_match_encodeBlockAsm10B -candidate2_match_encodeBlockAsm8B: +candidate2_match_encodeBlockAsm10B: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BP -candidate_match_encodeBlockAsm8B: +candidate_match_encodeBlockAsm10B: MOVL 
12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeBlockAsm8B + JZ match_extend_back_end_encodeBlockAsm10B -match_extend_back_loop_encodeBlockAsm8B: +match_extend_back_loop_encodeBlockAsm10B: CMPL CX, SI - JLE match_extend_back_end_encodeBlockAsm8B + JLE match_extend_back_end_encodeBlockAsm10B MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeBlockAsm8B + JNE match_extend_back_end_encodeBlockAsm10B LEAL -1(CX), CX DECL BP - JZ match_extend_back_end_encodeBlockAsm8B - JMP match_extend_back_loop_encodeBlockAsm8B + JZ match_extend_back_end_encodeBlockAsm10B + JMP match_extend_back_loop_encodeBlockAsm10B -match_extend_back_end_encodeBlockAsm8B: +match_extend_back_end_encodeBlockAsm10B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeBlockAsm8B + JL match_dst_size_check_encodeBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET -match_dst_size_check_encodeBlockAsm8B: +match_dst_size_check_encodeBlockAsm10B: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBlockAsm8B + JEQ emit_literal_done_match_emit_encodeBlockAsm10B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c - JLT one_byte_match_emit_encodeBlockAsm8B + JLT one_byte_match_emit_encodeBlockAsm10B CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm8B + JLT two_bytes_match_emit_encodeBlockAsm10B MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm8B + JMP memmove_long_match_emit_encodeBlockAsm10B -two_bytes_match_emit_encodeBlockAsm8B: +two_bytes_match_emit_encodeBlockAsm10B: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 - JL memmove_match_emit_encodeBlockAsm8B - JMP memmove_long_match_emit_encodeBlockAsm8B + JL memmove_match_emit_encodeBlockAsm10B + JMP memmove_long_match_emit_encodeBlockAsm10B -one_byte_match_emit_encodeBlockAsm8B: 
+one_byte_match_emit_encodeBlockAsm10B: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX -memmove_match_emit_encodeBlockAsm8B: +memmove_match_emit_encodeBlockAsm10B: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3 + JB emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_4through7 + JB emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2: +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2: MOVB (SI), R9 MOVB -1(SI)(R8*1), SI MOVB R9, (AX) MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B + JMP memmove_end_copy_match_emit_encodeBlockAsm10B -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3: +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3: MOVW (SI), R9 MOVB 2(SI), SI MOVW R9, (AX) MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B + JMP memmove_end_copy_match_emit_encodeBlockAsm10B -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_4through7: +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_4through7: MOVL (SI), R9 MOVL -4(SI)(R8*1), SI MOVL R9, (AX) MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B + JMP 
memmove_end_copy_match_emit_encodeBlockAsm10B -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B + JMP memmove_end_copy_match_emit_encodeBlockAsm10B -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B + JMP memmove_end_copy_match_emit_encodeBlockAsm10B -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -3593,66 +3797,56 @@ emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_match_emit_encodeBlockAsm8B: +memmove_end_copy_match_emit_encodeBlockAsm10B: MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBlockAsm8B + JMP emit_literal_done_match_emit_encodeBlockAsm10B -memmove_long_match_emit_encodeBlockAsm8B: - LEAQ (AX)(R8*1), DI +memmove_long_match_emit_encodeBlockAsm10B: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 
32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back -emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: +emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX -emit_literal_done_match_emit_encodeBlockAsm8B: -match_nolit_loop_encodeBlockAsm8B: +emit_literal_done_match_emit_encodeBlockAsm10B: +match_nolit_loop_encodeBlockAsm10B: MOVL CX, SI SUBL BP, SI MOVL SI, 16(SP) @@ -3662,232 +3856,6202 @@ match_nolit_loop_encodeBlockAsm8B: SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BP*1), BP + + // matchLen XORL R9, R9 CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm8B + JL matchlen_single_match_nolit_encodeBlockAsm10B + +matchlen_loopback_match_nolit_encodeBlockAsm10B: + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 + JZ matchlen_loop_match_nolit_encodeBlockAsm10B + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeBlockAsm10B + +matchlen_loop_match_nolit_encodeBlockAsm10B: + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm10B + +matchlen_single_match_nolit_encodeBlockAsm10B: + 
TESTL SI, SI + JZ match_nolit_end_encodeBlockAsm10B + +matchlen_single_loopback_match_nolit_encodeBlockAsm10B: + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 + JNE match_nolit_end_encodeBlockAsm10B + LEAL 1(R9), R9 + DECL SI + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B + +match_nolit_end_encodeBlockAsm10B: + ADDL R9, CX + MOVL 16(SP), BP + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBlockAsm10B: + CMPL R9, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B + MOVB $0xee, (AX) + MOVW BP, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + + // emitRepeat + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: + CMPL R9, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + JMP two_byte_offset_match_nolit_encodeBlockAsm10B + +two_byte_offset_short_match_nolit_encodeBlockAsm10B: + CMPL R9, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm10B + CMPL BP, 
$0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 + MOVB BP, 1(AX) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +emit_copy_three_match_nolit_encodeBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm10B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBlockAsm10B + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm10B: + MOVQ $0x9e3779b1, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BP + SHLQ $0x20, DI + IMULQ R8, DI + SHRQ $0x36, DI + SHLQ $0x20, BP + IMULQ R8, BP + SHRQ $0x36, BP + LEAL -2(CX), R8 + LEAQ 24(SP)(BP*4), R9 + MOVL (R9), BP + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BP*1), SI + JEQ match_nolit_loop_encodeBlockAsm10B + INCL CX + JMP search_loop_encodeBlockAsm10B + +emit_remainder_encodeBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B + MOVL CX, BP + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, BP + LEAL -1(BP), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm10B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm10B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm10B + +two_bytes_emit_remainder_encodeBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm10B + JMP memmove_long_emit_remainder_encodeBlockAsm10B + +one_byte_emit_remainder_encodeBlockAsm10B: + SHLB $0x02, DL + MOVB DL, 
(AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm10B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: + MOVB (CX), BP + MOVB -1(CX)(BX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), CL + MOVW BP, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: + MOVL (CX), BP + MOVL -4(CX)(BX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (CX), BP + MOVQ -8(CX)(BX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + 
+memmove_end_copy_emit_remainder_encodeBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm10B + +memmove_long_emit_remainder_encodeBlockAsm10B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, SI + SHRQ $0x05, SI + MOVQ AX, BP + ANDL $0x0000001f, BP + MOVQ $0x00000040, DI + SUBQ BP, DI + DECQ SI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(DI*1), BP + LEAQ -32(AX)(DI*1), R8 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (BP), X4 + MOVOU 16(BP), X5 + MOVOA X4, (R8) + MOVOA X5, 16(R8) + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI + DECQ SI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(DI*1), X4 + MOVOU -16(CX)(DI*1), X5 + MOVOA X4, -32(AX)(DI*1) + MOVOA X5, -16(AX)(DI*1) + ADDQ $0x20, DI + CMPQ BX, DI + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm8B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm8B(SB), $1048-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000008, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -5(CX), DX + LEAQ -8(CX), BP + MOVL BP, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, 
(SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm8B: + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x04, BP + LEAL 4(CX)(BP*1), BP + CMPL BP, 8(SP) + JGE emit_remainder_encodeBlockAsm8B + MOVL BP, 20(SP) + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x38, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeBlockAsm8B + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP + JZ repeat_extend_back_end_encodeBlockAsm8B + +repeat_extend_back_loop_encodeBlockAsm8B: + CMPL SI, DI + JLE repeat_extend_back_end_encodeBlockAsm8B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 + JNE repeat_extend_back_end_encodeBlockAsm8B + LEAL -1(SI), SI + DECL BP + JNZ repeat_extend_back_loop_encodeBlockAsm8B + +repeat_extend_back_end_encodeBlockAsm8B: + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R9 + SUBL BP, R8 + LEAL -1(R8), BP + CMPL BP, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm8B + CMPL BP, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm8B + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm8B + +two_bytes_repeat_emit_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_repeat_emit_encodeBlockAsm8B + JMP memmove_long_repeat_emit_encodeBlockAsm8B + +one_byte_repeat_emit_encodeBlockAsm8B: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm8B: + LEAQ (AX)(R8*1), BP + + // 
genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2: + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3: + MOVW (R9), R10 + MOVB 2(R9), R9 + MOVW R10, (AX) + MOVB R9, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm8B: + MOVQ BP, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm8B + +memmove_long_repeat_emit_encodeBlockAsm8B: + LEAQ (AX)(R8*1), BP + + // genMemMoveLong + 
MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BP, AX + +emit_literal_done_repeat_emit_encodeBlockAsm8B: + ADDL $0x05, CX + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeBlockAsm8B + +matchlen_loopback_repeat_extend_encodeBlockAsm8B: + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBlockAsm8B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_loop_repeat_extend_encodeBlockAsm8B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B + +matchlen_single_repeat_extend_encodeBlockAsm8B: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_single_loopback_repeat_extend_encodeBlockAsm8B: + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 + JNE 
repeat_extend_forward_end_encodeBlockAsm8B + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B + +repeat_extend_forward_end_encodeBlockAsm8B: + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI + JZ repeat_as_copy_encodeBlockAsm8B + + // emitRepeat + MOVL BP, SI + LEAL -4(BP), BP + CMPL SI, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm8B + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: + CMPL BP, $0x00000104 + JLT repeat_three_match_repeat_encodeBlockAsm8B + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_match_repeat_encodeBlockAsm8B: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_two_match_repeat_encodeBlockAsm8B: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_as_copy_encodeBlockAsm8B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBlockAsm8B: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BP), BP + ADDQ $0x03, AX + + // emitRepeat + MOVL BP, SI + LEAL -4(BP), BP + CMPL SI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm8B + 
+repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: + CMPL BP, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + +emit_copy_three_repeat_as_copy_encodeBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm8B: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm8B + +no_repeat_found_encodeBlockAsm8B: + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBlockAsm8B + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_encodeBlockAsm8B + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI + JEQ candidate3_match_encodeBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm8B + +candidate3_match_encodeBlockAsm8B: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm8B + +candidate2_match_encodeBlockAsm8B: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BP + +candidate_match_encodeBlockAsm8B: + MOVL 12(SP), SI + TESTL BP, BP + JZ match_extend_back_end_encodeBlockAsm8B + +match_extend_back_loop_encodeBlockAsm8B: + CMPL CX, SI + JLE match_extend_back_end_encodeBlockAsm8B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(CX*1), DI + CMPB BL, DI + JNE match_extend_back_end_encodeBlockAsm8B + LEAL 
-1(CX), CX + DECL BP + JZ match_extend_back_end_encodeBlockAsm8B + JMP match_extend_back_loop_encodeBlockAsm8B + +match_extend_back_end_encodeBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JL match_dst_size_check_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm8B: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeBlockAsm8B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JLT one_byte_match_emit_encodeBlockAsm8B + CMPL DI, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm8B + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm8B + +two_bytes_match_emit_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JL memmove_match_emit_encodeBlockAsm8B + JMP memmove_long_match_emit_encodeBlockAsm8B + +one_byte_match_emit_encodeBlockAsm8B: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm8B: + LEAQ (AX)(R8*1), DI + + // genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2: + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3: + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (AX) + MOVB SI, 2(AX) + JMP 
memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_4through7: + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBlockAsm8B: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeBlockAsm8B + +memmove_long_match_emit_encodeBlockAsm8B: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE 
emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeBlockAsm8B: +match_nolit_loop_encodeBlockAsm8B: + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + + // matchLen + XORL R9, R9 + CMPL SI, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm8B + +matchlen_loopback_match_nolit_encodeBlockAsm8B: + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 + JZ matchlen_loop_match_nolit_encodeBlockAsm8B + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeBlockAsm8B + +matchlen_loop_match_nolit_encodeBlockAsm8B: + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm8B + +matchlen_single_match_nolit_encodeBlockAsm8B: + TESTL SI, SI + JZ match_nolit_end_encodeBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeBlockAsm8B: + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 + JNE match_nolit_end_encodeBlockAsm8B + LEAL 1(R9), R9 + DECL SI + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B + +match_nolit_end_encodeBlockAsm8B: + ADDL R9, CX + MOVL 16(SP), BP + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBlockAsm8B: + CMPL R9, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B + MOVB $0xee, (AX) + MOVW BP, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + + // emitRepeat + MOVL R9, BP + LEAL -4(R9), R9 + CMPL BP, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short + CMPL BP, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: + CMPL R9, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 
2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + JMP two_byte_offset_match_nolit_encodeBlockAsm8B + +two_byte_offset_short_match_nolit_encodeBlockAsm8B: + CMPL R9, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 + MOVB BP, 1(AX) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +emit_copy_three_match_nolit_encodeBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm8B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBlockAsm8B + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm8B: + MOVQ $0x9e3779b1, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BP + SHLQ $0x20, DI + IMULQ R8, DI + SHRQ $0x38, DI + SHLQ $0x20, BP + IMULQ R8, BP + SHRQ $0x38, BP + LEAL -2(CX), R8 + LEAQ 24(SP)(BP*4), R9 + MOVL (R9), BP + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BP*1), SI + JEQ match_nolit_loop_encodeBlockAsm8B + INCL CX + JMP search_loop_encodeBlockAsm8B + +emit_remainder_encodeBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm8B: + MOVQ src_len+32(FP), 
CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B + MOVL CX, BP + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, BP + LEAL -1(BP), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm8B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm8B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm8B + +two_bytes_emit_remainder_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm8B + JMP memmove_long_emit_remainder_encodeBlockAsm8B + +one_byte_emit_remainder_encodeBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm8B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: + MOVB (CX), BP + MOVB -1(CX)(BX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), CL + MOVW BP, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: + MOVL (CX), BP + MOVL -4(CX)(BX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + 
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (CX), BP + MOVQ -8(CX)(BX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm8B + +memmove_long_emit_remainder_encodeBlockAsm8B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, SI + SHRQ $0x05, SI + MOVQ AX, BP + ANDL $0x0000001f, BP + MOVQ $0x00000040, DI + SUBQ BP, DI + DECQ SI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(DI*1), BP + LEAQ -32(AX)(DI*1), R8 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (BP), X4 + MOVOU 16(BP), X5 + MOVOA X4, (R8) + MOVOA X5, 16(R8) + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI + DECQ SI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(DI*1), X4 + MOVOU -16(CX)(DI*1), X5 + MOVOA X4, -32(AX)(DI*1) + MOVOA X5, -16(AX)(DI*1) + ADDQ $0x20, DI + CMPQ BX, DI + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm8B: + 
MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm(SB), $327704-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), BP + MOVL BP, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm: + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x07, BP + LEAL 1(CX)(BP*1), BP + CMPL BP, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm + MOVL BP, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R8 + MOVQ $0x9e3779b1, BP + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ BP, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BP + MOVL 262168(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 262168(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 + MOVQ SI, R9 + SHRQ $0x08, R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP + JZ repeat_extend_back_end_encodeBetterBlockAsm + +repeat_extend_back_loop_encodeBetterBlockAsm: + CMPL SI, DI + JLE repeat_extend_back_end_encodeBetterBlockAsm + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 + JNE repeat_extend_back_end_encodeBetterBlockAsm + LEAL -1(SI), SI + DECL BP + JNZ repeat_extend_back_loop_encodeBetterBlockAsm + +repeat_extend_back_end_encodeBetterBlockAsm: + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm + MOVL SI, R8 + MOVL SI, 12(SP) + 
LEAQ (DX)(BP*1), R9 + SUBL BP, R8 + LEAL -1(R8), BP + CMPL BP, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm + CMPL BP, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm + CMPL BP, $0x00010000 + JLT three_bytes_repeat_emit_encodeBetterBlockAsm + CMPL BP, $0x01000000 + JLT four_bytes_repeat_emit_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL BP, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +four_bytes_repeat_emit_encodeBetterBlockAsm: + MOVL BP, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW BP, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +three_bytes_repeat_emit_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +two_bytes_repeat_emit_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +one_byte_repeat_emit_encodeBetterBlockAsm: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), BP + + // genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + 
+emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_3: + MOVW (R9), R10 + MOVB 2(R9), R9 + MOVW R10, (AX) + MOVB R9, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm: + MOVQ BP, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm + +memmove_long_repeat_emit_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), BP + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back + 
+emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BP, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm: + ADDL $0x05, CX + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm: + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm + +matchlen_loop_repeat_extend_encodeBetterBlockAsm: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm + +matchlen_single_repeat_extend_encodeBetterBlockAsm: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeBetterBlockAsm + +matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm: + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm + +repeat_extend_forward_end_encodeBetterBlockAsm: + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI + JZ repeat_as_copy_encodeBetterBlockAsm + + // emitRepeat +emit_repeat_again_match_repeat_encodeBetterBlockAsm: + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm + CMPL SI, $0x00000800 + JLT 
repeat_two_offset_match_repeat_encodeBetterBlockAsm + +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm: + CMPL BP, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm + CMPL BP, $0x00010100 + JLT repeat_four_match_repeat_encodeBetterBlockAsm + CMPL BP, $0x0100ffff + JLT repeat_five_match_repeat_encodeBetterBlockAsm + LEAL -16842747(BP), BP + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_repeat_encodeBetterBlockAsm + +repeat_five_match_repeat_encodeBetterBlockAsm: + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_four_match_repeat_encodeBetterBlockAsm: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_three_match_repeat_encodeBetterBlockAsm: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_match_repeat_encodeBetterBlockAsm: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_offset_match_repeat_encodeBetterBlockAsm: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_as_copy_encodeBetterBlockAsm: + // emitCopy + CMPL SI, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBetterBlockAsm + +four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm: + CMPL BP, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(BP), BP + ADDQ $0x05, AX + CMPL BP, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 
+ JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy + CMPL BP, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy + CMPL BP, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy + LEAL -16842747(BP), BP + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy + +repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + JMP four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm + +four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm: + TESTL BP, BP + JZ repeat_end_emit_encodeBetterBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + 
MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BP), BP + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + CMPL BP, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + CMPL BP, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + LEAL -16842747(BP), BP + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + +repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP 
repeat_end_emit_encodeBetterBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm: + CMPL BP, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm + CMPL SI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBetterBlockAsm: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm + +no_repeat_found_encodeBetterBlockAsm: + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm + CMPL (DX)(DI*1), SI + JEQ candidateS_match_encodeBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm + +candidateS_match_encodeBetterBlockAsm: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x30, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm + DECL CX + MOVL DI, BP + +candidate_match_encodeBetterBlockAsm: + MOVL 12(SP), SI + TESTL BP, BP + JZ match_extend_back_end_encodeBetterBlockAsm + +match_extend_back_loop_encodeBetterBlockAsm: + CMPL CX, SI + JLE match_extend_back_end_encodeBetterBlockAsm + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(CX*1), DI + CMPB BL, DI + JNE match_extend_back_end_encodeBetterBlockAsm + LEAL -1(CX), CX + DECL BP + JZ match_extend_back_end_encodeBetterBlockAsm + JMP match_extend_back_loop_encodeBetterBlockAsm + 
+match_extend_back_end_encodeBetterBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 5(AX)(SI*1), SI + CMPQ SI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), R9 + + // matchLen + XORL R11, R11 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm + +matchlen_loopback_match_nolit_encodeBetterBlockAsm: + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm + +matchlen_loop_match_nolit_encodeBetterBlockAsm: + LEAL -8(DI), DI + LEAL 8(R11), R11 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm + +matchlen_single_match_nolit_encodeBetterBlockAsm: + TESTL DI, DI + JZ match_nolit_end_encodeBetterBlockAsm + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm + LEAL 1(R11), R11 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm + +match_nolit_end_encodeBetterBlockAsm: + MOVL CX, DI + SUBL BP, DI + CMPL R11, $0x01 + JG match_length_ok_encodeBetterBlockAsm + CMPL DI, $0x0000ffff + JLE match_length_ok_encodeBetterBlockAsm + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeBetterBlockAsm + +match_length_ok_encodeBetterBlockAsm: + MOVL DI, 16(SP) + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm + CMPL BP, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm + CMPL BP, $0x00010000 + JLT three_bytes_match_emit_encodeBetterBlockAsm + CMPL BP, $0x01000000 
+ JLT four_bytes_match_emit_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL BP, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +four_bytes_match_emit_encodeBetterBlockAsm: + MOVL BP, R9 + SHRL $0x10, R9 + MOVB $0xf8, (AX) + MOVW BP, 1(AX) + MOVB R9, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +three_bytes_match_emit_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +two_bytes_match_emit_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm + JMP memmove_long_match_emit_encodeBetterBlockAsm + +one_byte_match_emit_encodeBetterBlockAsm: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(DI*1), BP + + // genMemMoveShort + CMPQ DI, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (AX) + MOVB R8, 2(AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) + JMP 
memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm: + MOVQ BP, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm + +memmove_long_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(DI*1), BP + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BP, AX 
+ +emit_literal_done_match_emit_encodeBetterBlockAsm: + ADDL R11, CX + MOVL 16(SP), BP + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy + CMPL BP, $0x00010000 + JL two_byte_offset_match_nolit_encodeBetterBlockAsm + +four_bytes_loop_back_match_nolit_encodeBetterBlockAsm: + CMPL R11, $0x40 + JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm + MOVB $0xff, (AX) + MOVL BP, 1(AX) + LEAL -64(R11), R11 + ADDQ $0x05, AX + CMPL R11, $0x04 + JL four_bytes_remain_match_nolit_encodeBetterBlockAsm + + // emitRepeat +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: + MOVL R11, DI + LEAL -4(R11), R11 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: + CMPL R11, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R11, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R11, $0x0100ffff + JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy + LEAL -16842747(R11), R11 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -65536(R11), R11 + MOVL R11, BP + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, BP + MOVB BP, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + 
+repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm + +four_bytes_remain_match_nolit_encodeBetterBlockAsm: + TESTL R11, R11 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) + MOVL BP, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +two_byte_offset_match_nolit_encodeBetterBlockAsm: + CMPL R11, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm + MOVB $0xee, (AX) + MOVW BP, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: + MOVL R11, DI + LEAL -4(R11), R11 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: + CMPL R11, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R11, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R11, $0x0100ffff + JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short + LEAL -16842747(R11), R11 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -65536(R11), R11 + MOVL R11, BP + 
MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, BP + MOVB BP, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm: + CMPL R11, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(R11*4), R11 + MOVB BP, 1(AX) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_copy_three_match_nolit_encodeBetterBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm: + MOVQ $0x00cf1bbcdcbfa563, BP + MOVQ $0x9e3779b1, DI + INCL SI + MOVQ (DX)(SI*1), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL 1(SI), R11 + MOVQ 
-2(DX)(CX*1), R8 + SHLQ $0x08, R9 + IMULQ BP, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x32, R10 + MOVL SI, 24(SP)(R9*4) + MOVL R11, 262168(SP)(R10*4) + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL -2(CX), R8 + LEAL -1(CX), SI + SHLQ $0x08, R9 + IMULQ BP, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x32, R10 + MOVL R8, 24(SP)(R9*4) + MOVL SI, 262168(SP)(R10*4) + JMP search_loop_encodeBetterBlockAsm + +emit_remainder_encodeBetterBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 5(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm + MOVL CX, BP + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, BP + LEAL -1(BP), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +four_bytes_emit_remainder_encodeBetterBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +three_bytes_emit_remainder_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +two_bytes_emit_remainder_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +one_byte_emit_remainder_encodeBetterBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + 
+memmove_emit_remainder_encodeBetterBlockAsm: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (CX), BP + MOVB -1(CX)(BX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), CL + MOVW BP, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (CX), BP + MOVL -4(CX)(BX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (CX), BP + MOVQ -8(CX)(BX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + 
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm + +memmove_long_emit_remainder_encodeBetterBlockAsm: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, SI + SHRQ $0x05, SI + MOVQ AX, BP + ANDL $0x0000001f, BP + MOVQ $0x00000040, DI + SUBQ BP, DI + DECQ SI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(CX)(DI*1), BP + LEAQ -32(AX)(DI*1), R8 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (BP), X4 + MOVOU 16(BP), X5 + MOVOA X4, (R8) + MOVOA X5, 16(R8) + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI + DECQ SI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(CX)(DI*1), X4 + MOVOU -16(CX)(DI*1), X5 + MOVOA X4, -32(AX)(DI*1) + MOVOA X5, -16(AX)(DI*1) + ADDQ $0x20, DI + CMPQ BX, DI + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm4MB: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm4MB + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), BP + MOVL BP, 8(SP) + SHRQ 
$0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm4MB: + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x07, BP + LEAL 1(CX)(BP*1), BP + CMPL BP, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm4MB + MOVL BP, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R8 + MOVQ $0x9e3779b1, BP + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ BP, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BP + MOVL 262168(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 262168(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 + MOVQ SI, R9 + SHRQ $0x08, R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm4MB + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP + JZ repeat_extend_back_end_encodeBetterBlockAsm4MB + +repeat_extend_back_loop_encodeBetterBlockAsm4MB: + CMPL SI, DI + JLE repeat_extend_back_end_encodeBetterBlockAsm4MB + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 + JNE repeat_extend_back_end_encodeBetterBlockAsm4MB + LEAL -1(SI), SI + DECL BP + JNZ repeat_extend_back_loop_encodeBetterBlockAsm4MB + +repeat_extend_back_end_encodeBetterBlockAsm4MB: + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm4MB + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R9 + SUBL BP, R8 + LEAL -1(R8), BP + CMPL BP, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm4MB + CMPL BP, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm4MB + CMPL BP, $0x00010000 + JLT three_bytes_repeat_emit_encodeBetterBlockAsm4MB + MOVL BP, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW BP, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm4MB + +three_bytes_repeat_emit_encodeBetterBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm4MB + 
+two_bytes_repeat_emit_encodeBetterBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm4MB + JMP memmove_long_repeat_emit_encodeBetterBlockAsm4MB + +one_byte_repeat_emit_encodeBetterBlockAsm4MB: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm4MB: + LEAQ (AX)(R8*1), BP + + // genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_1or2: + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_3: + MOVW (R9), R10 + MOVB 2(R9), R9 + MOVW R10, (AX) + MOVB R9, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP 
memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB: + MOVQ BP, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm4MB + +memmove_long_repeat_emit_encodeBetterBlockAsm4MB: + LEAQ (AX)(R8*1), BP + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BP, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm4MB: + ADDL $0x05, CX + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm4MB + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm4MB: + MOVQ (R9)(R11*1), R10 
+ XORQ (BP)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm4MB + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm4MB + +matchlen_loop_repeat_extend_encodeBetterBlockAsm4MB: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm4MB + +matchlen_single_repeat_extend_encodeBetterBlockAsm4MB: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeBetterBlockAsm4MB + +matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm4MB: + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm4MB + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm4MB + +repeat_extend_forward_end_encodeBetterBlockAsm4MB: + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI + JZ repeat_as_copy_encodeBetterBlockAsm4MB + + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm4MB + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB + +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB: + CMPL BP, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm4MB + CMPL BP, $0x00010100 + JLT repeat_four_match_repeat_encodeBetterBlockAsm4MB + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_four_match_repeat_encodeBetterBlockAsm4MB: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_three_match_repeat_encodeBetterBlockAsm4MB: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + 
+repeat_two_match_repeat_encodeBetterBlockAsm4MB: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_as_copy_encodeBetterBlockAsm4MB: + // emitCopy + CMPL SI, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBetterBlockAsm4MB + +four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm4MB: + CMPL BP, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm4MB + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(BP), BP + ADDQ $0x05, AX + CMPL BP, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm4MB + + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy + CMPL BP, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + 
+repeat_two_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + JMP four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm4MB + +four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm4MB: + TESTL BP, BP + JZ repeat_end_emit_encodeBetterBlockAsm4MB + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm4MB: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm4MB + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BP), BP + ADDQ $0x03, AX + + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short + CMPL BP, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + 
+repeat_three_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm4MB + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm4MB: + CMPL BP, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm4MB + CMPL SI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm4MB + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm4MB: + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBetterBlockAsm4MB: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm4MB + +no_repeat_found_encodeBetterBlockAsm4MB: + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm4MB + CMPL (DX)(DI*1), SI + JEQ candidateS_match_encodeBetterBlockAsm4MB + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm4MB + +candidateS_match_encodeBetterBlockAsm4MB: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x30, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm4MB + DECL CX + MOVL DI, BP + +candidate_match_encodeBetterBlockAsm4MB: + MOVL 12(SP), SI + TESTL BP, BP + JZ match_extend_back_end_encodeBetterBlockAsm4MB 
+ +match_extend_back_loop_encodeBetterBlockAsm4MB: + CMPL CX, SI + JLE match_extend_back_end_encodeBetterBlockAsm4MB + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(CX*1), DI + CMPB BL, DI + JNE match_extend_back_end_encodeBetterBlockAsm4MB + LEAL -1(CX), CX + DECL BP + JZ match_extend_back_end_encodeBetterBlockAsm4MB + JMP match_extend_back_loop_encodeBetterBlockAsm4MB + +match_extend_back_end_encodeBetterBlockAsm4MB: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm4MB: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), R9 + + // matchLen + XORL R11, R11 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm4MB + +matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm4MB + +matchlen_loop_match_nolit_encodeBetterBlockAsm4MB: + LEAL -8(DI), DI + LEAL 8(R11), R11 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB + +matchlen_single_match_nolit_encodeBetterBlockAsm4MB: + TESTL DI, DI + JZ match_nolit_end_encodeBetterBlockAsm4MB + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm4MB + LEAL 1(R11), R11 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB + +match_nolit_end_encodeBetterBlockAsm4MB: + MOVL CX, DI + SUBL BP, DI + CMPL R11, $0x01 + JG match_length_ok_encodeBetterBlockAsm4MB + CMPL DI, $0x0000ffff + JLE match_length_ok_encodeBetterBlockAsm4MB + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeBetterBlockAsm4MB + +match_length_ok_encodeBetterBlockAsm4MB: + 
MOVL DI, 16(SP) + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm4MB + CMPL BP, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm4MB + CMPL BP, $0x00010000 + JLT three_bytes_match_emit_encodeBetterBlockAsm4MB + MOVL BP, R9 + SHRL $0x10, R9 + MOVB $0xf8, (AX) + MOVW BP, 1(AX) + MOVB R9, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB + +three_bytes_match_emit_encodeBetterBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB + +two_bytes_match_emit_encodeBetterBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm4MB + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB + +one_byte_match_emit_encodeBetterBlockAsm4MB: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm4MB: + LEAQ (AX)(DI*1), BP + + // genMemMoveShort + CMPQ DI, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_3 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW 
R9, (AX) + MOVB R8, 2(AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm4MB: + MOVQ BP, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB + +memmove_long_match_emit_encodeBetterBlockAsm4MB: + LEAQ (AX)(DI*1), BP + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU 
-16(R8)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BP, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm4MB: + ADDL R11, CX + MOVL 16(SP), BP + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy + CMPL BP, $0x00010000 + JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB + +four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB: + CMPL R11, $0x40 + JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB + MOVB $0xff, (AX) + MOVL BP, 1(AX) + LEAL -64(R11), R11 + ADDQ $0x05, AX + CMPL R11, $0x04 + JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB + + // emitRepeat + MOVL R11, DI + LEAL -4(R11), R11 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + CMPL R11, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy + CMPL R11, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy + LEAL -65536(R11), R11 + MOVL R11, BP + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, BP + MOVB BP, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + 
+repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB + +four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: + TESTL R11, R11 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + MOVB $0x03, BL + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) + MOVL BP, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: + CMPL R11, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB + MOVB $0xee, (AX) + MOVW BP, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + + // emitRepeat + MOVL R11, DI + LEAL -4(R11), R11 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + CMPL R11, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + CMPL R11, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + LEAL -65536(R11), R11 + MOVL R11, BP + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, BP + MOVB BP, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + 
+repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: + CMPL R11, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB + MOVB $0x01, BL + LEAL -16(BX)(R11*4), R11 + MOVB BP, 1(AX) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +emit_copy_three_match_nolit_encodeBetterBlockAsm4MB: + MOVB $0x02, BL + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm4MB + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm4MB: + MOVQ $0x00cf1bbcdcbfa563, BP + MOVQ $0x9e3779b1, DI + INCL SI + MOVQ (DX)(SI*1), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL 1(SI), R11 + MOVQ -2(DX)(CX*1), R8 + SHLQ $0x08, R9 + IMULQ BP, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x32, R10 + MOVL SI, 24(SP)(R9*4) + MOVL R11, 262168(SP)(R10*4) + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL -2(CX), R8 + LEAL -1(CX), SI + SHLQ $0x08, R9 + IMULQ BP, R9 + SHRQ 
$0x30, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x32, R10 + MOVL R8, 24(SP)(R9*4) + MOVL SI, 262168(SP)(R10*4) + JMP search_loop_encodeBetterBlockAsm4MB + +emit_remainder_encodeBetterBlockAsm4MB: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 4(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm4MB: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB + MOVL CX, BP + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, BP + LEAL -1(BP), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm4MB + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm4MB + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBetterBlockAsm4MB + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB + +three_bytes_emit_remainder_encodeBetterBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB + +two_bytes_emit_remainder_encodeBetterBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm4MB + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB + +one_byte_emit_remainder_encodeBetterBlockAsm4MB: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm4MB: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16 + 
CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: + MOVB (CX), BP + MOVB -1(CX)(BX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), CL + MOVW BP, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (CX), BP + MOVL -4(CX)(BX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (CX), BP + MOVQ -8(CX)(BX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB + +memmove_long_emit_remainder_encodeBetterBlockAsm4MB: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, SI + SHRQ $0x05, SI + MOVQ AX, BP + ANDL $0x0000001f, BP + MOVQ $0x00000040, DI + SUBQ BP, 
DI + DECQ SI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(CX)(DI*1), BP + LEAQ -32(AX)(DI*1), R8 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (BP), X4 + MOVOU 16(BP), X5 + MOVOA X4, (R8) + MOVOA X5, 16(R8) + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI + DECQ SI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(CX)(DI*1), X4 + MOVOU -16(CX)(DI*1), X5 + MOVOA X4, -32(AX)(DI*1) + MOVOA X5, -16(AX)(DI*1) + ADDQ $0x20, DI + CMPQ BX, DI + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm12B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000280, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), BP + MOVL BP, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm12B: + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x06, BP + LEAL 1(CX)(BP*1), BP + CMPL BP, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm12B + MOVL BP, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BP + MOVQ SI, R9 + 
MOVQ SI, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + SHLQ $0x20, R10 + IMULQ BP, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R9*4), BP + MOVL 65560(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 65560(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 + MOVQ SI, R9 + SHRQ $0x08, R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm12B + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP + JZ repeat_extend_back_end_encodeBetterBlockAsm12B + +repeat_extend_back_loop_encodeBetterBlockAsm12B: + CMPL SI, DI + JLE repeat_extend_back_end_encodeBetterBlockAsm12B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 + JNE repeat_extend_back_end_encodeBetterBlockAsm12B + LEAL -1(SI), SI + DECL BP + JNZ repeat_extend_back_loop_encodeBetterBlockAsm12B + +repeat_extend_back_end_encodeBetterBlockAsm12B: + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm12B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R9 + SUBL BP, R8 + LEAL -1(R8), BP + CMPL BP, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm12B + CMPL BP, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm12B + +two_bytes_repeat_emit_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm12B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm12B + +one_byte_repeat_emit_encodeBetterBlockAsm12B: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R8*1), BP + + // genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE 
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_1or2: + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_3: + MOVW (R9), R10 + MOVB 2(R9), R9 + MOVW R10, (AX) + MOVB R9, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B: + MOVQ BP, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm12B + +memmove_long_repeat_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R8*1), BP + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ 
$0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BP, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm12B: + ADDL $0x05, CX + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm12B + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm12B: + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm12B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm12B + +matchlen_loop_repeat_extend_encodeBetterBlockAsm12B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm12B + +matchlen_single_repeat_extend_encodeBetterBlockAsm12B: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeBetterBlockAsm12B + +matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm12B: + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm12B + LEAL 1(R11), R11 + 
DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm12B + +repeat_extend_forward_end_encodeBetterBlockAsm12B: + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI + JZ repeat_as_copy_encodeBetterBlockAsm12B + + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm12B + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm12B + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm12B + +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm12B: + CMPL BP, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm12B + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_three_match_repeat_encodeBetterBlockAsm12B: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_two_match_repeat_encodeBetterBlockAsm12B: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_two_offset_match_repeat_encodeBetterBlockAsm12B: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_as_copy_encodeBetterBlockAsm12B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm12B: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm12B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BP), BP + ADDQ $0x03, AX + + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short + 
+cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_three_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_two_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm12B + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm12B: + CMPL BP, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm12B + CMPL SI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBetterBlockAsm12B: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm12B + +no_repeat_found_encodeBetterBlockAsm12B: + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm12B + CMPL (DX)(DI*1), SI + JEQ candidateS_match_encodeBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm12B + +candidateS_match_encodeBetterBlockAsm12B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ 
R8, R9 + SHRQ $0x32, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm12B + DECL CX + MOVL DI, BP + +candidate_match_encodeBetterBlockAsm12B: + MOVL 12(SP), SI + TESTL BP, BP + JZ match_extend_back_end_encodeBetterBlockAsm12B + +match_extend_back_loop_encodeBetterBlockAsm12B: + CMPL CX, SI + JLE match_extend_back_end_encodeBetterBlockAsm12B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(CX*1), DI + CMPB BL, DI + JNE match_extend_back_end_encodeBetterBlockAsm12B + LEAL -1(CX), CX + DECL BP + JZ match_extend_back_end_encodeBetterBlockAsm12B + JMP match_extend_back_loop_encodeBetterBlockAsm12B + +match_extend_back_end_encodeBetterBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm12B: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), R9 + + // matchLen + XORL R11, R11 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm12B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm12B + +matchlen_loop_match_nolit_encodeBetterBlockAsm12B: + LEAL -8(DI), DI + LEAL 8(R11), R11 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B + +matchlen_single_match_nolit_encodeBetterBlockAsm12B: + TESTL DI, DI + JZ match_nolit_end_encodeBetterBlockAsm12B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm12B + LEAL 1(R11), R11 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B + 
+match_nolit_end_encodeBetterBlockAsm12B: + MOVL CX, DI + SUBL BP, DI + MOVL DI, 16(SP) + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm12B + CMPL BP, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm12B + +two_bytes_match_emit_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm12B + JMP memmove_long_match_emit_encodeBetterBlockAsm12B + +one_byte_match_emit_encodeBetterBlockAsm12B: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(DI*1), BP + + // genMemMoveShort + CMPQ DI, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_3 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (AX) + MOVB R8, 2(AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL 
R9, (AX) + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm12B: + MOVQ BP, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B + +memmove_long_match_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(DI*1), BP + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + 
MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BP, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm12B: + ADDL R11, CX + MOVL 16(SP), BP + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBetterBlockAsm12B: + CMPL R11, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B + MOVB $0xee, (AX) + MOVW BP, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + + // emitRepeat + MOVL R11, DI + LEAL -4(R11), R11 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + CMPL R11, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: + CMPL R11, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B + CMPL BP, $0x00000800 + JGE 
emit_copy_three_match_nolit_encodeBetterBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(R11*4), R11 + MOVB BP, 1(AX) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +emit_copy_three_match_nolit_encodeBetterBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm12B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm12B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm12B: + MOVQ $0x0000cf1bbcdcbf9b, BP + MOVQ $0x9e3779b1, DI + INCL SI + MOVQ (DX)(SI*1), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL 1(SI), R11 + MOVQ -2(DX)(CX*1), R8 + SHLQ $0x10, R9 + IMULQ BP, R9 + SHRQ $0x32, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x34, R10 + MOVL SI, 24(SP)(R9*4) + MOVL R11, 65560(SP)(R10*4) + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL -2(CX), R8 + LEAL -1(CX), SI + SHLQ $0x10, R9 + IMULQ BP, R9 + SHRQ $0x32, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x34, R10 + MOVL R8, 24(SP)(R9*4) + MOVL SI, 65560(SP)(R10*4) + JMP search_loop_encodeBetterBlockAsm12B + +emit_remainder_encodeBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + MOVL CX, BP + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, BP + LEAL -1(BP), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm12B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + 
+two_bytes_emit_remainder_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm12B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + +one_byte_emit_remainder_encodeBetterBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm12B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: + MOVB (CX), BP + MOVB -1(CX)(BX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), CL + MOVW BP, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (CX), BP + MOVL -4(CX)(BX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (CX), BP + MOVQ -8(CX)(BX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU 
-16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + +memmove_long_emit_remainder_encodeBetterBlockAsm12B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, SI + SHRQ $0x05, SI + MOVQ AX, BP + ANDL $0x0000001f, BP + MOVQ $0x00000040, DI + SUBQ BP, DI + DECQ SI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(DI*1), BP + LEAQ -32(AX)(DI*1), R8 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (BP), X4 + MOVOU 16(BP), X5 + MOVOA X4, (R8) + MOVOA X5, 16(R8) + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI + DECQ SI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(DI*1), X4 + MOVOU -16(CX)(DI*1), X5 + MOVOA X4, -32(AX)(DI*1) + MOVOA X5, -16(AX)(DI*1) + ADDQ $0x20, DI + CMPQ BX, DI + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm10B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 + MOVQ dst_base+0(FP), AX + MOVQ 
$0x000000a0, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), BP + MOVL BP, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm10B: + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x05, BP + LEAL 1(CX)(BP*1), BP + CMPL BP, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm10B + MOVL BP, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BP + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + SHLQ $0x20, R10 + IMULQ BP, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R9*4), BP + MOVL 16408(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 16408(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 + MOVQ SI, R9 + SHRQ $0x08, R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm10B + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP + JZ repeat_extend_back_end_encodeBetterBlockAsm10B + +repeat_extend_back_loop_encodeBetterBlockAsm10B: + CMPL SI, DI + JLE repeat_extend_back_end_encodeBetterBlockAsm10B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 + JNE repeat_extend_back_end_encodeBetterBlockAsm10B + LEAL -1(SI), SI + DECL BP + JNZ repeat_extend_back_loop_encodeBetterBlockAsm10B + +repeat_extend_back_end_encodeBetterBlockAsm10B: + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm10B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R9 + SUBL BP, R8 + LEAL -1(R8), BP + CMPL BP, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm10B + CMPL BP, $0x00000100 + JLT 
two_bytes_repeat_emit_encodeBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm10B + +two_bytes_repeat_emit_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm10B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm10B + +one_byte_repeat_emit_encodeBetterBlockAsm10B: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R8*1), BP + + // genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_1or2: + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_3: + MOVW (R9), R10 + MOVB 2(R9), R9 + MOVW R10, (AX) + MOVB R9, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + 
+emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B: + MOVQ BP, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm10B + +memmove_long_repeat_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R8*1), BP + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BP, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm10B: + ADDL $0x05, CX + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + + // matchLen + 
XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm10B + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm10B: + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm10B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm10B + +matchlen_loop_repeat_extend_encodeBetterBlockAsm10B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm10B + +matchlen_single_repeat_extend_encodeBetterBlockAsm10B: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeBetterBlockAsm10B + +matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm10B: + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm10B + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm10B + +repeat_extend_forward_end_encodeBetterBlockAsm10B: + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI + JZ repeat_as_copy_encodeBetterBlockAsm10B + + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm10B + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm10B + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm10B + +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm10B: + CMPL BP, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm10B + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_three_match_repeat_encodeBetterBlockAsm10B: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_two_match_repeat_encodeBetterBlockAsm10B: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP 
repeat_end_emit_encodeBetterBlockAsm10B + +repeat_two_offset_match_repeat_encodeBetterBlockAsm10B: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_as_copy_encodeBetterBlockAsm10B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm10B: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm10B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BP), BP + ADDQ $0x03, AX + + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_three_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_two_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm10B + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm10B: + CMPL BP, $0x0c + JGE 
emit_copy_three_repeat_as_copy_encodeBetterBlockAsm10B + CMPL SI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBetterBlockAsm10B: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm10B + +no_repeat_found_encodeBetterBlockAsm10B: + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm10B + CMPL (DX)(DI*1), SI + JEQ candidateS_match_encodeBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm10B + +candidateS_match_encodeBetterBlockAsm10B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm10B + DECL CX + MOVL DI, BP + +candidate_match_encodeBetterBlockAsm10B: + MOVL 12(SP), SI + TESTL BP, BP + JZ match_extend_back_end_encodeBetterBlockAsm10B + +match_extend_back_loop_encodeBetterBlockAsm10B: + CMPL CX, SI + JLE match_extend_back_end_encodeBetterBlockAsm10B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(CX*1), DI + CMPB BL, DI + JNE match_extend_back_end_encodeBetterBlockAsm10B + LEAL -1(CX), CX + DECL BP + JZ match_extend_back_end_encodeBetterBlockAsm10B + JMP match_extend_back_loop_encodeBetterBlockAsm10B + +match_extend_back_end_encodeBetterBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm10B: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), R9 + + // matchLen + XORL 
R11, R11 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm10B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm10B + +matchlen_loop_match_nolit_encodeBetterBlockAsm10B: + LEAL -8(DI), DI + LEAL 8(R11), R11 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B + +matchlen_single_match_nolit_encodeBetterBlockAsm10B: + TESTL DI, DI + JZ match_nolit_end_encodeBetterBlockAsm10B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm10B + LEAL 1(R11), R11 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B + +match_nolit_end_encodeBetterBlockAsm10B: + MOVL CX, DI + SUBL BP, DI + MOVL DI, 16(SP) + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm10B + CMPL BP, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm10B + +two_bytes_match_emit_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm10B + JMP memmove_long_match_emit_encodeBetterBlockAsm10B + +one_byte_match_emit_encodeBetterBlockAsm10B: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(DI*1), BP + + // genMemMoveShort + CMPQ DI, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_3 + 
CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (AX) + MOVB R8, 2(AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm10B: + MOVQ BP, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B + +memmove_long_match_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(DI*1), BP + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU 
-16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BP, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm10B: + ADDL R11, CX + MOVL 16(SP), BP + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBetterBlockAsm10B: + CMPL R11, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B + MOVB $0xee, (AX) + MOVW BP, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + + // emitRepeat + MOVL R11, DI + LEAL -4(R11), R11 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + CMPL R11, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + 
+repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: + CMPL R11, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(R11*4), R11 + MOVB BP, 1(AX) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +emit_copy_three_match_nolit_encodeBetterBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm10B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm10B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm10B: + MOVQ $0x0000cf1bbcdcbf9b, BP + MOVQ $0x9e3779b1, DI + INCL SI + MOVQ (DX)(SI*1), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL 1(SI), R11 + MOVQ -2(DX)(CX*1), R8 + SHLQ $0x10, R9 + IMULQ BP, R9 + SHRQ $0x34, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x36, R10 + MOVL SI, 24(SP)(R9*4) + MOVL R11, 16408(SP)(R10*4) + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL -2(CX), R8 + LEAL -1(CX), SI + SHLQ $0x10, R9 + IMULQ BP, R9 + SHRQ 
$0x34, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x36, R10 + MOVL R8, 24(SP)(R9*4) + MOVL SI, 16408(SP)(R10*4) + JMP search_loop_encodeBetterBlockAsm10B + +emit_remainder_encodeBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + MOVL CX, BP + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, BP + LEAL -1(BP), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm10B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + +two_bytes_emit_remainder_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm10B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + +one_byte_emit_remainder_encodeBetterBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm10B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: + MOVB (CX), BP + MOVB 
-1(CX)(BX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), CL + MOVW BP, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (CX), BP + MOVL -4(CX)(BX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (CX), BP + MOVQ -8(CX)(BX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + +memmove_long_emit_remainder_encodeBetterBlockAsm10B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, SI + SHRQ $0x05, SI + MOVQ AX, BP + ANDL $0x0000001f, BP + MOVQ $0x00000040, DI + SUBQ BP, DI + DECQ SI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(DI*1), BP + LEAQ -32(AX)(DI*1), R8 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (BP), X4 + MOVOU 16(BP), X5 + MOVOA X4, (R8) 
+ MOVOA X5, 16(R8) + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI + DECQ SI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(DI*1), X4 + MOVOU -16(CX)(DI*1), X5 + MOVOA X4, -32(AX)(DI*1) + MOVOA X5, -16(AX)(DI*1) + ADDQ $0x20, DI + CMPQ BX, DI + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm8B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000028, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), BP + MOVL BP, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm8B: + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x04, BP + LEAL 1(CX)(BP*1), BP + CMPL BP, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm8B + MOVL BP, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BP + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ BP, R10 + SHRQ $0x38, R10 + MOVL 24(SP)(R9*4), BP + MOVL 4120(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 4120(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 + MOVQ SI, R9 + SHRQ $0x08, 
R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm8B + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP + JZ repeat_extend_back_end_encodeBetterBlockAsm8B + +repeat_extend_back_loop_encodeBetterBlockAsm8B: + CMPL SI, DI + JLE repeat_extend_back_end_encodeBetterBlockAsm8B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 + JNE repeat_extend_back_end_encodeBetterBlockAsm8B + LEAL -1(SI), SI + DECL BP + JNZ repeat_extend_back_loop_encodeBetterBlockAsm8B + +repeat_extend_back_end_encodeBetterBlockAsm8B: + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm8B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R9 + SUBL BP, R8 + LEAL -1(R8), BP + CMPL BP, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm8B + CMPL BP, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm8B + +two_bytes_repeat_emit_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm8B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm8B + +one_byte_repeat_emit_encodeBetterBlockAsm8B: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), BP + + // genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_1or2: + 
MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_3: + MOVW (R9), R10 + MOVB 2(R9), R9 + MOVW R10, (AX) + MOVB R9, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B: + MOVQ BP, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm8B + +memmove_long_repeat_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), BP + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ 
$0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BP, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm8B: + ADDL $0x05, CX + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm8B + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm8B: + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm8B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm8B + +matchlen_loop_repeat_extend_encodeBetterBlockAsm8B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm8B + +matchlen_single_repeat_extend_encodeBetterBlockAsm8B: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeBetterBlockAsm8B + +matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm8B: + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm8B + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm8B + +repeat_extend_forward_end_encodeBetterBlockAsm8B: + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI + JZ repeat_as_copy_encodeBetterBlockAsm8B + + // emitRepeat + MOVL BP, SI + LEAL -4(BP), BP + CMPL SI, $0x08 + JLE 
repeat_two_match_repeat_encodeBetterBlockAsm8B + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm8B + +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm8B: + CMPL BP, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm8B + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + +repeat_three_match_repeat_encodeBetterBlockAsm8B: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + +repeat_two_match_repeat_encodeBetterBlockAsm8B: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + +repeat_as_copy_encodeBetterBlockAsm8B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm8B: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm8B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BP), BP + ADDQ $0x03, AX + + // emitRepeat + MOVL BP, SI + LEAL -4(BP), BP + CMPL SI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm8B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm8B_emit_copy_short: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm8B_emit_copy_short + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + +repeat_three_repeat_as_copy_encodeBetterBlockAsm8B_emit_copy_short: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + +repeat_two_repeat_as_copy_encodeBetterBlockAsm8B_emit_copy_short: + SHLL $0x02, BP + ORL $0x01, BP + 
MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm8B + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm8B: + CMPL BP, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBetterBlockAsm8B: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm8B + +no_repeat_found_encodeBetterBlockAsm8B: + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm8B + CMPL (DX)(DI*1), SI + JEQ candidateS_match_encodeBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm8B + +candidateS_match_encodeBetterBlockAsm8B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm8B + DECL CX + MOVL DI, BP + +candidate_match_encodeBetterBlockAsm8B: + MOVL 12(SP), SI + TESTL BP, BP + JZ match_extend_back_end_encodeBetterBlockAsm8B + +match_extend_back_loop_encodeBetterBlockAsm8B: + CMPL CX, SI + JLE match_extend_back_end_encodeBetterBlockAsm8B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(CX*1), DI + CMPB BL, DI + JNE match_extend_back_end_encodeBetterBlockAsm8B + LEAL -1(CX), CX + DECL BP + JZ match_extend_back_end_encodeBetterBlockAsm8B + JMP match_extend_back_loop_encodeBetterBlockAsm8B + +match_extend_back_end_encodeBetterBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, 
(SP) + JL match_dst_size_check_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm8B: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), R9 + + // matchLen + XORL R11, R11 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm8B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm8B + +matchlen_loop_match_nolit_encodeBetterBlockAsm8B: + LEAL -8(DI), DI + LEAL 8(R11), R11 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B + +matchlen_single_match_nolit_encodeBetterBlockAsm8B: + TESTL DI, DI + JZ match_nolit_end_encodeBetterBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm8B + LEAL 1(R11), R11 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B + +match_nolit_end_encodeBetterBlockAsm8B: + MOVL CX, DI + SUBL BP, DI + MOVL DI, 16(SP) + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm8B + CMPL BP, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm8B + +two_bytes_match_emit_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm8B + JMP memmove_long_match_emit_encodeBetterBlockAsm8B + +one_byte_match_emit_encodeBetterBlockAsm8B: + SHLB $0x02, BP + MOVB BP, (AX) + 
ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(DI*1), BP + + // genMemMoveShort + CMPQ DI, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_3 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (AX) + MOVB R8, 2(AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) 
-matchlen_loopback_match_nolit_encodeBlockAsm8B: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeBlockAsm8B - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeBlockAsm8B +memmove_end_copy_match_emit_encodeBetterBlockAsm8B: + MOVQ BP, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B -matchlen_loop_match_nolit_encodeBlockAsm8B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm8B +memmove_long_match_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(DI*1), BP -matchlen_single_match_nolit_encodeBlockAsm8B: - TESTL SI, SI - JZ match_nolit_end_encodeBlockAsm8B + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(AX)(R12*1), R13 -matchlen_single_loopback_match_nolit_encodeBlockAsm8B: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 - JNE match_nolit_end_encodeBlockAsm8B - LEAL 1(R9), R9 - DECL SI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back -match_nolit_end_encodeBlockAsm8B: - ADDL R9, CX +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 
-32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BP, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm8B: + ADDL R11, CX MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL $0x04, R11 MOVL CX, 12(SP) -two_byte_offset_match_nolit_encodeBlockAsm8B: - CMPL R9, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B + // emitCopy +two_byte_offset_match_nolit_encodeBetterBlockAsm8B: + CMPL R11, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(R9), R9 + LEAL -60(R11), R11 ADDQ $0x03, AX - MOVL R9, BP - LEAL -4(R9), R9 + + // emitRepeat + MOVL R11, BP + LEAL -4(R11), R11 CMPL BP, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short + JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short CMPL BP, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short -cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: - CMPL R9, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short - LEAL -256(R9), R9 +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + CMPL R11, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + LEAL -256(R11), R11 MOVW $0x0019, (AX) - MOVW R9, 2(AX) + MOVW R11, 2(AX) ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B -repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: - LEAL -4(R9), R9 +repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + LEAL -4(R11), R11 MOVW $0x0015, (AX) - MOVB R9, 2(AX) + MOVB R11, 2(AX) ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B -repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) +repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + SHLL 
$0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + ORL BP, R11 + MOVB R11, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - JMP two_byte_offset_match_nolit_encodeBlockAsm8B + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B -two_byte_offset_short_match_nolit_encodeBlockAsm8B: - CMPL R9, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm8B +two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: + CMPL R11, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 + LEAL -16(BX)(R11*4), R11 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + ORL BP, R11 + MOVB R11, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B -emit_copy_three_match_nolit_encodeBlockAsm8B: +emit_copy_three_match_nolit_encodeBetterBlockAsm8B: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeBlockAsm8B: +match_nolit_emitcopy_end_encodeBetterBlockAsm8B: CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm8B - MOVQ -2(DX)(CX*1), SI + JGE emit_remainder_encodeBetterBlockAsm8B CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm8B + JL match_nolit_dst_ok_encodeBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeBlockAsm8B: - MOVQ $0x9e3779b1, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x20, DI - IMULQ R8, DI - SHRQ $0x38, DI - SHLQ $0x20, BP - IMULQ R8, BP - SHRQ $0x38, BP +match_nolit_dst_ok_encodeBetterBlockAsm8B: + MOVQ $0x0000cf1bbcdcbf9b, BP + MOVQ $0x9e3779b1, DI + INCL 
SI + MOVQ (DX)(SI*1), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL 1(SI), R11 + MOVQ -2(DX)(CX*1), R8 + SHLQ $0x10, R9 + IMULQ BP, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x38, R10 + MOVL SI, 24(SP)(R9*4) + MOVL R11, 4120(SP)(R10*4) + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI - JEQ match_nolit_loop_encodeBlockAsm8B - INCL CX - JMP search_loop_encodeBlockAsm8B + LEAL -1(CX), SI + SHLQ $0x10, R9 + IMULQ BP, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x38, R10 + MOVL R8, 24(SP)(R9*4) + MOVL SI, 4120(SP)(R10*4) + JMP search_loop_encodeBetterBlockAsm8B -emit_remainder_encodeBlockAsm8B: +emit_remainder_encodeBetterBlockAsm8B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm8B + JL emit_remainder_ok_encodeBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET -emit_remainder_ok_encodeBlockAsm8B: +emit_remainder_ok_encodeBetterBlockAsm8B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm8B + JLT one_byte_emit_remainder_encodeBetterBlockAsm8B CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm8B + JLT two_bytes_emit_remainder_encodeBetterBlockAsm8B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm8B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B -two_bytes_emit_remainder_encodeBlockAsm8B: +two_bytes_emit_remainder_encodeBetterBlockAsm8B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm8B - JMP memmove_long_emit_remainder_encodeBlockAsm8B + JL 
memmove_emit_remainder_encodeBetterBlockAsm8B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B -one_byte_emit_remainder_encodeBlockAsm8B: +one_byte_emit_remainder_encodeBetterBlockAsm8B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeBlockAsm8B: +memmove_emit_remainder_encodeBetterBlockAsm8B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + JMP 
memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -3897,66 +10061,56 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeBlockAsm8B: +memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm8B + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B -memmove_long_emit_remainder_encodeBlockAsm8B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX +memmove_long_emit_remainder_encodeBetterBlockAsm8B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + 
SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeBlockAsm8B: +emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) @@ -4100,6 +10254,8 @@ one_byte_repeat_emit_encodeSnappyBlockAsm: memmove_repeat_emit_encodeSnappyBlockAsm: LEAQ (AX)(DI*1), BP + + // genMemMoveShort CMPQ DI, $0x03 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_1or2 
JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_3 @@ -4161,13 +10317,15 @@ memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm memmove_long_repeat_emit_encodeSnappyBlockAsm: - LEAQ (AX)(DI*1), BP + LEAQ (AX)(DI*1), BP + + // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -4180,23 +10338,11 @@ memmove_long_repeat_emit_encodeSnappyBlockAsm: emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back @@ -4222,6 +10368,8 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm: SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BP*1), BP + + // matchLen XORL R10, R10 CMPL DI, $0x08 JL matchlen_single_repeat_extend_encodeSnappyBlockAsm @@ -4259,6 +10407,8 @@ repeat_extend_forward_end_encodeSnappyBlockAsm: MOVL CX, BP SUBL SI, BP MOVL 16(SP), SI + + // emitCopy CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm @@ -4362,7 +10512,7 @@ match_extend_back_loop_encodeSnappyBlockAsm: match_extend_back_end_encodeSnappyBlockAsm: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 5(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm MOVQ $0x00000000, ret+48(FP) @@ -4421,6 +10571,8 @@ one_byte_match_emit_encodeSnappyBlockAsm: memmove_match_emit_encodeSnappyBlockAsm: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ 
R8, $0x03 JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_1or2 JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_3 @@ -4482,13 +10634,15 @@ memmove_end_copy_match_emit_encodeSnappyBlockAsm: JMP emit_literal_done_match_emit_encodeSnappyBlockAsm memmove_long_match_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -4501,23 +10655,11 @@ memmove_long_match_emit_encodeSnappyBlockAsm: emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back @@ -4546,6 +10688,8 @@ match_nolit_loop_encodeSnappyBlockAsm: SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BP*1), BP + + // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm @@ -4583,6 +10727,8 @@ match_nolit_end_encodeSnappyBlockAsm: MOVL 16(SP), BP ADDL $0x04, R9 MOVL CX, 12(SP) + + // emitCopy CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeSnappyBlockAsm @@ -4671,7 +10817,7 @@ match_nolit_dst_ok_encodeSnappyBlockAsm: emit_remainder_encodeSnappyBlockAsm: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 5(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBlockAsm MOVQ $0x00000000, ret+48(FP) @@ -4731,6 +10877,8 @@ one_byte_emit_remainder_encodeSnappyBlockAsm: 
memmove_emit_remainder_encodeSnappyBlockAsm: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 @@ -4792,14 +10940,16 @@ memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm memmove_long_emit_remainder_encodeSnappyBlockAsm: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -4812,23 +10962,11 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm: emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back @@ -4971,6 +11109,8 @@ one_byte_repeat_emit_encodeSnappyBlockAsm12B: memmove_repeat_emit_encodeSnappyBlockAsm12B: LEAQ (AX)(DI*1), BP + + // genMemMoveShort CMPQ DI, $0x03 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_1or2 JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_3 @@ -5032,13 +11172,15 @@ memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B memmove_long_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(DI*1), BP + LEAQ (AX)(DI*1), BP + + // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU 
-16(R8)(DI*1), X3 MOVQ DI, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -5051,23 +11193,11 @@ memmove_long_repeat_emit_encodeSnappyBlockAsm12B: emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back @@ -5093,6 +11223,8 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BP*1), BP + + // matchLen XORL R10, R10 CMPL DI, $0x08 JL matchlen_single_repeat_extend_encodeSnappyBlockAsm12B @@ -5131,6 +11263,7 @@ repeat_extend_forward_end_encodeSnappyBlockAsm12B: SUBL SI, BP MOVL 16(SP), SI + // emitCopy two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B @@ -5210,7 +11343,7 @@ match_extend_back_loop_encodeSnappyBlockAsm12B: match_extend_back_end_encodeSnappyBlockAsm12B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm12B MOVQ $0x00000000, ret+48(FP) @@ -5250,6 +11383,8 @@ one_byte_match_emit_encodeSnappyBlockAsm12B: memmove_match_emit_encodeSnappyBlockAsm12B: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_1or2 JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_3 @@ -5311,13 +11446,15 @@ memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B 
memmove_long_match_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -5330,23 +11467,11 @@ memmove_long_match_emit_encodeSnappyBlockAsm12B: emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back @@ -5375,6 +11500,8 @@ match_nolit_loop_encodeSnappyBlockAsm12B: SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BP*1), BP + + // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B @@ -5413,6 +11540,7 @@ match_nolit_end_encodeSnappyBlockAsm12B: ADDL $0x04, R9 MOVL CX, 12(SP) + // emitCopy two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B @@ -5477,7 +11605,7 @@ match_nolit_dst_ok_encodeSnappyBlockAsm12B: emit_remainder_encodeSnappyBlockAsm12B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBlockAsm12B MOVQ $0x00000000, ret+48(FP) @@ -5518,6 +11646,8 @@ one_byte_emit_remainder_encodeSnappyBlockAsm12B: memmove_emit_remainder_encodeSnappyBlockAsm12B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 JE 
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 @@ -5579,14 +11709,16 @@ memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B memmove_long_emit_remainder_encodeSnappyBlockAsm12B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -5599,23 +11731,11 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm12B: emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back @@ -5758,6 +11878,8 @@ one_byte_repeat_emit_encodeSnappyBlockAsm10B: memmove_repeat_emit_encodeSnappyBlockAsm10B: LEAQ (AX)(DI*1), BP + + // genMemMoveShort CMPQ DI, $0x03 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_1or2 JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_3 @@ -5819,13 +11941,15 @@ memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B memmove_long_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(DI*1), BP + LEAQ (AX)(DI*1), BP + + // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -5838,23 +11962,11 @@ 
memmove_long_repeat_emit_encodeSnappyBlockAsm10B: emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back @@ -5880,6 +11992,8 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BP*1), BP + + // matchLen XORL R10, R10 CMPL DI, $0x08 JL matchlen_single_repeat_extend_encodeSnappyBlockAsm10B @@ -5918,6 +12032,7 @@ repeat_extend_forward_end_encodeSnappyBlockAsm10B: SUBL SI, BP MOVL 16(SP), SI + // emitCopy two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B @@ -5997,7 +12112,7 @@ match_extend_back_loop_encodeSnappyBlockAsm10B: match_extend_back_end_encodeSnappyBlockAsm10B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm10B MOVQ $0x00000000, ret+48(FP) @@ -6037,6 +12152,8 @@ one_byte_match_emit_encodeSnappyBlockAsm10B: memmove_match_emit_encodeSnappyBlockAsm10B: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_1or2 JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_3 @@ -6098,13 +12215,15 @@ memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B memmove_long_match_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU 
-32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -6117,23 +12236,11 @@ memmove_long_match_emit_encodeSnappyBlockAsm10B: emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back @@ -6162,6 +12269,8 @@ match_nolit_loop_encodeSnappyBlockAsm10B: SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BP*1), BP + + // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B @@ -6200,6 +12309,7 @@ match_nolit_end_encodeSnappyBlockAsm10B: ADDL $0x04, R9 MOVL CX, 12(SP) + // emitCopy two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B @@ -6264,7 +12374,7 @@ match_nolit_dst_ok_encodeSnappyBlockAsm10B: emit_remainder_encodeSnappyBlockAsm10B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBlockAsm10B MOVQ $0x00000000, ret+48(FP) @@ -6305,6 +12415,8 @@ one_byte_emit_remainder_encodeSnappyBlockAsm10B: memmove_emit_remainder_encodeSnappyBlockAsm10B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 @@ -6366,14 +12478,16 @@ memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: JMP 
emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B memmove_long_emit_remainder_encodeSnappyBlockAsm10B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -6386,23 +12500,11 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm10B: emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back @@ -6545,6 +12647,8 @@ one_byte_repeat_emit_encodeSnappyBlockAsm8B: memmove_repeat_emit_encodeSnappyBlockAsm8B: LEAQ (AX)(DI*1), BP + + // genMemMoveShort CMPQ DI, $0x03 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_1or2 JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_3 @@ -6606,13 +12710,15 @@ memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B memmove_long_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(DI*1), BP + LEAQ (AX)(DI*1), BP + + // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -6625,23 +12731,11 @@ memmove_long_repeat_emit_encodeSnappyBlockAsm8B: emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), 
X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back @@ -6667,6 +12761,8 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BP*1), BP + + // matchLen XORL R10, R10 CMPL DI, $0x08 JL matchlen_single_repeat_extend_encodeSnappyBlockAsm8B @@ -6705,6 +12801,7 @@ repeat_extend_forward_end_encodeSnappyBlockAsm8B: SUBL SI, BP MOVL 16(SP), SI + // emitCopy two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B @@ -6782,7 +12879,7 @@ match_extend_back_loop_encodeSnappyBlockAsm8B: match_extend_back_end_encodeSnappyBlockAsm8B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm8B MOVQ $0x00000000, ret+48(FP) @@ -6822,6 +12919,8 @@ one_byte_match_emit_encodeSnappyBlockAsm8B: memmove_match_emit_encodeSnappyBlockAsm8B: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_1or2 JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_3 @@ -6883,13 +12982,15 @@ memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B memmove_long_match_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -6902,23 +13003,11 @@ 
memmove_long_match_emit_encodeSnappyBlockAsm8B: emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back @@ -6947,6 +13036,8 @@ match_nolit_loop_encodeSnappyBlockAsm8B: SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BP*1), BP + + // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B @@ -6985,6 +13076,7 @@ match_nolit_end_encodeSnappyBlockAsm8B: ADDL $0x04, R9 MOVL CX, 12(SP) + // emitCopy two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B @@ -7047,7 +13139,7 @@ match_nolit_dst_ok_encodeSnappyBlockAsm8B: emit_remainder_encodeSnappyBlockAsm8B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBlockAsm8B MOVQ $0x00000000, ret+48(FP) @@ -7088,6 +13180,8 @@ one_byte_emit_remainder_encodeSnappyBlockAsm8B: memmove_emit_remainder_encodeSnappyBlockAsm8B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3 @@ -7149,14 +13243,16 @@ memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B memmove_long_emit_remainder_encodeSnappyBlockAsm8B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 
16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -7169,23 +13265,11 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm8B: emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back @@ -7266,6 +13350,7 @@ one_byte_standalone: ADDQ $0x01, AX memmove_standalone: + // genMemMoveShort CMPQ DX, $0x03 JB emit_lit_memmove_standalone_memmove_move_1or2 JE emit_lit_memmove_standalone_memmove_move_3 @@ -7325,12 +13410,13 @@ emit_lit_memmove_standalone_memmove_move_33through64: JMP emit_literal_end_standalone memmove_long_standalone: + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(DX*1), X2 MOVOU -16(CX)(DX*1), X3 MOVQ DX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -7343,23 +13429,11 @@ memmove_long_standalone: emit_lit_memmove_long_standalonelarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_standalonelarge_big_loop_back @@ -7392,6 +13466,7 @@ TEXT ·emitRepeat(SB), NOSPLIT, $0-48 MOVQ offset+24(FP), 
CX MOVQ length+32(FP), DX + // emitRepeat emit_repeat_again_standalone: MOVL DX, BP LEAL -4(DX), DX @@ -7473,6 +13548,8 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48 MOVQ dst_base+0(FP), AX MOVQ offset+24(FP), CX MOVQ length+32(FP), DX + + // emitCopy CMPL CX, $0x00010000 JL two_byte_offset_standalone @@ -7487,6 +13564,7 @@ four_bytes_loop_back_standalone: CMPL DX, $0x04 JL four_bytes_remain_standalone + // emitRepeat emit_repeat_again_standalone_emit_copy: MOVL DX, BP LEAL -4(DX), DX @@ -7580,6 +13658,7 @@ two_byte_offset_standalone: ADDQ $0x03, AX ADDQ $0x03, BX + // emitRepeat emit_repeat_again_standalone_emit_copy_short: MOVL DX, BP LEAL -4(DX), DX @@ -7687,6 +13766,8 @@ TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 MOVQ dst_base+0(FP), AX MOVQ offset+24(FP), CX MOVQ length+32(FP), DX + + // emitCopy CMPL CX, $0x00010000 JL two_byte_offset_standalone_snappy @@ -7756,6 +13837,8 @@ TEXT ·matchLen(SB), NOSPLIT, $0-56 MOVQ a_base+0(FP), AX MOVQ b_base+24(FP), CX MOVQ a_len+8(FP), DX + + // matchLen XORL BP, BP CMPL DX, $0x08 JL matchlen_single_standalone diff --git a/s2/s2_test.go b/s2/s2_test.go index c56d5bc921..b726c9832c 100644 --- a/s2/s2_test.go +++ b/s2/s2_test.go @@ -1330,12 +1330,26 @@ func benchDecode(b *testing.B, src []byte) { func benchEncode(b *testing.B, src []byte) { // Bandwidth is in amount of uncompressed data. - b.SetBytes(int64(len(src))) dst := make([]byte, MaxEncodedLen(len(src))) b.ResetTimer() - for i := 0; i < b.N; i++ { - Encode(dst, src) - } + b.Run("default", func(b *testing.B) { + b.SetBytes(int64(len(src))) + for i := 0; i < b.N; i++ { + Encode(dst, src) + } + }) + b.Run("better", func(b *testing.B) { + b.SetBytes(int64(len(src))) + for i := 0; i < b.N; i++ { + EncodeBetter(dst, src) + } + }) + b.Run("best", func(b *testing.B) { + b.SetBytes(int64(len(src))) + for i := 0; i < b.N; i++ { + EncodeBest(dst, src) + } + }) } func benchEncodeBetter(b *testing.B, src []byte) {