From ad8d514ec4432e23dee90a708d9529e10afb6e5f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 5 Sep 2024 17:38:38 +0000 Subject: [PATCH] build(deps): bump golang.org/x/net from 0.28.0 to 0.29.0 Bumps [golang.org/x/net](https://github.com/golang/net) from 0.28.0 to 0.29.0. - [Commits](https://github.com/golang/net/compare/v0.28.0...v0.29.0) --- updated-dependencies: - dependency-name: golang.org/x/net dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 4 +- go.sum | 8 +- .../golang.org/x/crypto/argon2/blamka_amd64.s | 2972 +++++++++- .../x/crypto/blake2b/blake2bAVX2_amd64.s | 5167 ++++++++++++++--- .../x/crypto/blake2b/blake2b_amd64.s | 1681 +++++- .../x/crypto/internal/poly1305/sum_amd64.s | 133 +- .../x/crypto/salsa20/salsa/salsa20_amd64.s | 1742 +++--- vendor/modules.txt | 4 +- 8 files changed, 9611 insertions(+), 2100 deletions(-) diff --git a/go.mod b/go.mod index f2f1a1cd7f..025b53fb55 100644 --- a/go.mod +++ b/go.mod @@ -52,8 +52,8 @@ require ( github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 github.com/yusufpapurcu/wmi v1.2.4 github.com/zalando/go-keyring v0.2.5 - golang.org/x/crypto v0.26.0 - golang.org/x/net v0.28.0 + golang.org/x/crypto v0.27.0 + golang.org/x/net v0.29.0 golang.org/x/sync v0.8.0 golang.org/x/sys v0.25.0 golang.org/x/term v0.24.0 diff --git a/go.sum b/go.sum index c9fdd51024..471bd96fb0 100644 --- a/go.sum +++ b/go.sum @@ -446,8 +446,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= -golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A= +golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= @@ -464,8 +464,8 @@ golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= -golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= +golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo= +golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= golang.org/x/oauth2 v0.22.0 h1:BzDx2FehcG7jJwgWLELCdmLuxk2i+x9UDpSiss2u0ZA= golang.org/x/oauth2 v0.22.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= diff --git a/vendor/golang.org/x/crypto/argon2/blamka_amd64.s b/vendor/golang.org/x/crypto/argon2/blamka_amd64.s index 6713accac0..c3895478ed 100644 --- a/vendor/golang.org/x/crypto/argon2/blamka_amd64.s +++ b/vendor/golang.org/x/crypto/argon2/blamka_amd64.s @@ -1,243 +1,2791 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blamka_amd64.go -out ../blamka_amd64.s -pkg argon2. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·c48<>(SB), (NOPTR+RODATA), $16 - -#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v6, t1; \ - PUNPCKLQDQ v6, t2; \ - PUNPCKHQDQ v7, v6; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ v7, t2; \ - MOVO t1, v7; \ - MOVO v2, t1; \ - PUNPCKHQDQ t2, v7; \ - PUNPCKLQDQ v3, t2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v3 - -#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v2, t1; \ - PUNPCKLQDQ v2, t2; \ - PUNPCKHQDQ v3, v2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ v3, t2; \ - MOVO t1, v3; \ - MOVO v6, t1; \ - PUNPCKHQDQ t2, v3; \ - PUNPCKLQDQ v7, t2; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v7 - -#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \ - MOVO v0, t0; \ - PMULULQ v2, t0; \ - PADDQ v2, v0; \ - PADDQ t0, v0; \ - PADDQ t0, v0; \ - PXOR v0, v6; \ - PSHUFD $0xB1, v6, v6; \ - MOVO v4, t0; \ - PMULULQ v6, t0; \ - PADDQ v6, v4; \ - PADDQ t0, v4; \ - PADDQ t0, v4; \ - PXOR v4, v2; \ - PSHUFB c40, v2; \ - MOVO v0, t0; \ - PMULULQ v2, t0; \ - PADDQ v2, v0; \ - PADDQ t0, v0; \ - PADDQ t0, v0; \ - PXOR v0, v6; \ - PSHUFB c48, v6; \ - MOVO v4, t0; \ - PMULULQ v6, t0; \ - PADDQ v6, v4; \ - PADDQ t0, v4; \ - PADDQ t0, v4; \ - PXOR v4, v2; \ - MOVO v2, t0; \ - PADDQ v2, t0; \ - PSRLQ $63, v2; \ - PXOR t0, v2; \ - MOVO v1, t0; \ - PMULULQ v3, t0; \ - PADDQ v3, v1; \ - PADDQ t0, v1; \ - PADDQ t0, v1; \ - PXOR v1, v7; \ - PSHUFD $0xB1, v7, v7; \ - MOVO v5, t0; \ - PMULULQ v7, t0; \ - PADDQ v7, v5; \ - PADDQ t0, v5; \ - PADDQ t0, v5; \ - PXOR v5, v3; \ - PSHUFB c40, v3; \ - MOVO v1, t0; \ - PMULULQ v3, t0; \ - PADDQ v3, v1; \ - PADDQ t0, v1; \ - PADDQ t0, v1; \ - PXOR v1, v7; \ - PSHUFB c48, v7; \ - MOVO v5, t0; \ - PMULULQ v7, t0; \ - PADDQ v7, v5; \ - PADDQ t0, v5; \ - PADDQ t0, v5; \ - PXOR v5, v3; \ - MOVO v3, t0; \ - PADDQ v3, t0; \ - PSRLQ $63, v3; \ - PXOR t0, v3 - -#define LOAD_MSG_0(block, off) \ - MOVOU 8*(off+0)(block), X0; \ - MOVOU 8*(off+2)(block), X1; \ - MOVOU 8*(off+4)(block), X2; \ - MOVOU 8*(off+6)(block), X3; \ - MOVOU 8*(off+8)(block), X4; \ - MOVOU 8*(off+10)(block), X5; \ - MOVOU 8*(off+12)(block), X6; \ - MOVOU 8*(off+14)(block), X7 - -#define STORE_MSG_0(block, off) \ - MOVOU X0, 8*(off+0)(block); \ - MOVOU X1, 8*(off+2)(block); \ - MOVOU X2, 8*(off+4)(block); \ - MOVOU X3, 8*(off+6)(block); \ - MOVOU X4, 8*(off+8)(block); \ - MOVOU X5, 8*(off+10)(block); \ - MOVOU X6, 8*(off+12)(block); \ - MOVOU X7, 8*(off+14)(block) - -#define LOAD_MSG_1(block, off) \ - MOVOU 8*off+0*8(block), X0; \ - MOVOU 8*off+16*8(block), X1; \ - MOVOU 8*off+32*8(block), X2; \ - MOVOU 8*off+48*8(block), X3; \ - MOVOU 8*off+64*8(block), X4; \ - MOVOU 8*off+80*8(block), X5; \ - MOVOU 8*off+96*8(block), X6; \ - MOVOU 8*off+112*8(block), X7 - -#define STORE_MSG_1(block, off) \ - MOVOU X0, 8*off+0*8(block); \ - MOVOU X1, 8*off+16*8(block); \ - MOVOU X2, 8*off+32*8(block); \ - MOVOU X3, 8*off+48*8(block); \ - MOVOU X4, 8*off+64*8(block); \ - MOVOU X5, 8*off+80*8(block); \ - MOVOU X6, 8*off+96*8(block); \ - MOVOU X7, 8*off+112*8(block) - -#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \ - LOAD_MSG_0(block, off); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \ - STORE_MSG_0(block, off) - -#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \ - LOAD_MSG_1(block, off); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \ - STORE_MSG_1(block, off) - // func blamkaSSE4(b *block) -TEXT ·blamkaSSE4(SB), 4, $0-8 - MOVQ b+0(FP), AX - - MOVOU ·c40<>(SB), X10 - MOVOU ·c48<>(SB), X11 +// Requires: SSE2, SSSE3 +TEXT ·blamkaSSE4(SB), NOSPLIT, $0-8 + MOVQ b+0(FP), AX + MOVOU ·c40<>+0(SB), X10 + MOVOU ·c48<>+0(SB), X11 + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 32(AX) + MOVOU X3, 48(AX) + MOVOU X4, 64(AX) + MOVOU X5, 80(AX) + MOVOU X6, 96(AX) + MOVOU X7, 112(AX) + MOVOU 128(AX), X0 + MOVOU 144(AX), X1 + MOVOU 160(AX), X2 + MOVOU 176(AX), X3 + MOVOU 192(AX), X4 + MOVOU 208(AX), X5 + MOVOU 224(AX), X6 + MOVOU 240(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 128(AX) + MOVOU X1, 144(AX) + MOVOU X2, 160(AX) + MOVOU X3, 176(AX) + MOVOU X4, 192(AX) + MOVOU X5, 208(AX) + MOVOU X6, 224(AX) + MOVOU X7, 240(AX) + MOVOU 256(AX), X0 + MOVOU 272(AX), X1 + MOVOU 288(AX), X2 + MOVOU 304(AX), X3 + MOVOU 320(AX), X4 + MOVOU 336(AX), X5 + MOVOU 352(AX), X6 + MOVOU 368(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 256(AX) + MOVOU X1, 272(AX) + MOVOU X2, 288(AX) + MOVOU X3, 304(AX) + MOVOU X4, 320(AX) + MOVOU X5, 336(AX) + MOVOU X6, 352(AX) + MOVOU X7, 368(AX) + MOVOU 384(AX), X0 + MOVOU 400(AX), X1 + MOVOU 416(AX), X2 + MOVOU 432(AX), X3 + MOVOU 448(AX), X4 + MOVOU 464(AX), X5 + MOVOU 480(AX), X6 + MOVOU 496(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 384(AX) + MOVOU X1, 400(AX) + MOVOU X2, 416(AX) + MOVOU X3, 432(AX) + MOVOU X4, 448(AX) + MOVOU X5, 464(AX) + MOVOU X6, 480(AX) + MOVOU X7, 496(AX) + MOVOU 512(AX), X0 + MOVOU 528(AX), X1 + MOVOU 544(AX), X2 + MOVOU 560(AX), X3 + MOVOU 576(AX), X4 + MOVOU 592(AX), X5 + MOVOU 608(AX), X6 + MOVOU 624(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 512(AX) + MOVOU X1, 528(AX) + MOVOU X2, 544(AX) + MOVOU X3, 560(AX) + MOVOU X4, 576(AX) + MOVOU X5, 592(AX) + MOVOU X6, 608(AX) + MOVOU X7, 624(AX) + MOVOU 640(AX), X0 + MOVOU 656(AX), X1 + MOVOU 672(AX), X2 + MOVOU 688(AX), X3 + MOVOU 704(AX), X4 + MOVOU 720(AX), X5 + MOVOU 736(AX), X6 + MOVOU 752(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 640(AX) + MOVOU X1, 656(AX) + MOVOU X2, 672(AX) + MOVOU X3, 688(AX) + MOVOU X4, 704(AX) + MOVOU X5, 720(AX) + MOVOU X6, 736(AX) + MOVOU X7, 752(AX) + MOVOU 768(AX), X0 + MOVOU 784(AX), X1 + MOVOU 800(AX), X2 + MOVOU 816(AX), X3 + MOVOU 832(AX), X4 + MOVOU 848(AX), X5 + MOVOU 864(AX), X6 + MOVOU 880(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 768(AX) + MOVOU X1, 784(AX) + MOVOU X2, 800(AX) + MOVOU X3, 816(AX) + MOVOU X4, 832(AX) + MOVOU X5, 848(AX) + MOVOU X6, 864(AX) + MOVOU X7, 880(AX) + MOVOU 896(AX), X0 + MOVOU 912(AX), X1 + MOVOU 928(AX), X2 + MOVOU 944(AX), X3 + MOVOU 960(AX), X4 + MOVOU 976(AX), X5 + MOVOU 992(AX), X6 + MOVOU 1008(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 896(AX) + MOVOU X1, 912(AX) + MOVOU X2, 928(AX) + MOVOU X3, 944(AX) + MOVOU X4, 960(AX) + MOVOU X5, 976(AX) + MOVOU X6, 992(AX) + MOVOU X7, 1008(AX) + MOVOU (AX), X0 + MOVOU 128(AX), X1 + MOVOU 256(AX), X2 + MOVOU 384(AX), X3 + MOVOU 512(AX), X4 + MOVOU 640(AX), X5 + MOVOU 768(AX), X6 + MOVOU 896(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, (AX) + MOVOU X1, 128(AX) + MOVOU X2, 256(AX) + MOVOU X3, 384(AX) + MOVOU X4, 512(AX) + MOVOU X5, 640(AX) + MOVOU X6, 768(AX) + MOVOU X7, 896(AX) + MOVOU 16(AX), X0 + MOVOU 144(AX), X1 + MOVOU 272(AX), X2 + MOVOU 400(AX), X3 + MOVOU 528(AX), X4 + MOVOU 656(AX), X5 + MOVOU 784(AX), X6 + MOVOU 912(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 16(AX) + MOVOU X1, 144(AX) + MOVOU X2, 272(AX) + MOVOU X3, 400(AX) + MOVOU X4, 528(AX) + MOVOU X5, 656(AX) + MOVOU X6, 784(AX) + MOVOU X7, 912(AX) + MOVOU 32(AX), X0 + MOVOU 160(AX), X1 + MOVOU 288(AX), X2 + MOVOU 416(AX), X3 + MOVOU 544(AX), X4 + MOVOU 672(AX), X5 + MOVOU 800(AX), X6 + MOVOU 928(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 32(AX) + MOVOU X1, 160(AX) + MOVOU X2, 288(AX) + MOVOU X3, 416(AX) + MOVOU X4, 544(AX) + MOVOU X5, 672(AX) + MOVOU X6, 800(AX) + MOVOU X7, 928(AX) + MOVOU 48(AX), X0 + MOVOU 176(AX), X1 + MOVOU 304(AX), X2 + MOVOU 432(AX), X3 + MOVOU 560(AX), X4 + MOVOU 688(AX), X5 + MOVOU 816(AX), X6 + MOVOU 944(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 48(AX) + MOVOU X1, 176(AX) + MOVOU X2, 304(AX) + MOVOU X3, 432(AX) + MOVOU X4, 560(AX) + MOVOU X5, 688(AX) + MOVOU X6, 816(AX) + MOVOU X7, 944(AX) + MOVOU 64(AX), X0 + MOVOU 192(AX), X1 + MOVOU 320(AX), X2 + MOVOU 448(AX), X3 + MOVOU 576(AX), X4 + MOVOU 704(AX), X5 + MOVOU 832(AX), X6 + MOVOU 960(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 64(AX) + MOVOU X1, 192(AX) + MOVOU X2, 320(AX) + MOVOU X3, 448(AX) + MOVOU X4, 576(AX) + MOVOU X5, 704(AX) + MOVOU X6, 832(AX) + MOVOU X7, 960(AX) + MOVOU 80(AX), X0 + MOVOU 208(AX), X1 + MOVOU 336(AX), X2 + MOVOU 464(AX), X3 + MOVOU 592(AX), X4 + MOVOU 720(AX), X5 + MOVOU 848(AX), X6 + MOVOU 976(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 80(AX) + MOVOU X1, 208(AX) + MOVOU X2, 336(AX) + MOVOU X3, 464(AX) + MOVOU X4, 592(AX) + MOVOU X5, 720(AX) + MOVOU X6, 848(AX) + MOVOU X7, 976(AX) + MOVOU 96(AX), X0 + MOVOU 224(AX), X1 + MOVOU 352(AX), X2 + MOVOU 480(AX), X3 + MOVOU 608(AX), X4 + MOVOU 736(AX), X5 + MOVOU 864(AX), X6 + MOVOU 992(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 96(AX) + MOVOU X1, 224(AX) + MOVOU X2, 352(AX) + MOVOU X3, 480(AX) + MOVOU X4, 608(AX) + MOVOU X5, 736(AX) + MOVOU X6, 864(AX) + MOVOU X7, 992(AX) + MOVOU 112(AX), X0 + MOVOU 240(AX), X1 + MOVOU 368(AX), X2 + MOVOU 496(AX), X3 + MOVOU 624(AX), X4 + MOVOU 752(AX), X5 + MOVOU 880(AX), X6 + MOVOU 1008(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 112(AX) + MOVOU X1, 240(AX) + MOVOU X2, 368(AX) + MOVOU X3, 496(AX) + MOVOU X4, 624(AX) + MOVOU X5, 752(AX) + MOVOU X6, 880(AX) + MOVOU X7, 1008(AX) + RET - BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11) +DATA ·c40<>+0(SB)/8, $0x0201000706050403 +DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·c40<>(SB), RODATA|NOPTR, $16 - BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11) - RET +DATA ·c48<>+0(SB)/8, $0x0100070605040302 +DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·c48<>(SB), RODATA|NOPTR, $16 -// func mixBlocksSSE2(out, a, b, c *block) -TEXT ·mixBlocksSSE2(SB), 4, $0-32 +// func mixBlocksSSE2(out *block, a *block, b *block, c *block) +// Requires: SSE2 +TEXT ·mixBlocksSSE2(SB), NOSPLIT, $0-32 MOVQ out+0(FP), DX MOVQ a+8(FP), AX MOVQ b+16(FP), BX MOVQ c+24(FP), CX - MOVQ $128, DI + MOVQ $0x00000080, DI loop: - MOVOU 0(AX), X0 - MOVOU 0(BX), X1 - MOVOU 0(CX), X2 + MOVOU (AX), X0 + MOVOU (BX), X1 + MOVOU (CX), X2 PXOR X1, X0 PXOR X2, X0 - MOVOU X0, 0(DX) - ADDQ $16, AX - ADDQ $16, BX - ADDQ $16, CX - ADDQ $16, DX - SUBQ $2, DI + MOVOU X0, (DX) + ADDQ $0x10, AX + ADDQ $0x10, BX + ADDQ $0x10, CX + ADDQ $0x10, DX + SUBQ $0x02, DI JA loop RET -// func xorBlocksSSE2(out, a, b, c *block) -TEXT ·xorBlocksSSE2(SB), 4, $0-32 +// func xorBlocksSSE2(out *block, a *block, b *block, c *block) +// Requires: SSE2 +TEXT ·xorBlocksSSE2(SB), NOSPLIT, $0-32 MOVQ out+0(FP), DX MOVQ a+8(FP), AX MOVQ b+16(FP), BX MOVQ c+24(FP), CX - MOVQ $128, DI + MOVQ $0x00000080, DI loop: - MOVOU 0(AX), X0 - MOVOU 0(BX), X1 - MOVOU 0(CX), X2 - MOVOU 0(DX), X3 + MOVOU (AX), X0 + MOVOU (BX), X1 + MOVOU (CX), X2 + MOVOU (DX), X3 PXOR X1, X0 PXOR X2, X0 PXOR X3, X0 - MOVOU X0, 0(DX) - ADDQ $16, AX - ADDQ $16, BX - ADDQ $16, CX - ADDQ $16, DX - SUBQ $2, DI + MOVOU X0, (DX) + ADDQ $0x10, AX + ADDQ $0x10, BX + ADDQ $0x10, CX + ADDQ $0x10, DX + SUBQ $0x02, DI JA loop RET diff --git a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s index 9ae8206c20..f75162e039 100644 --- a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s +++ b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s @@ -1,722 +1,4517 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2bAVX2_amd64_asm.go -out ../../blake2bAVX2_amd64.s -pkg blake2b. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b -DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b -DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179 -GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403 -DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302 -DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b -DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b -DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 -GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16 - -#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39 -#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93 -#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e -#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93 -#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39 - -#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \ - VPADDQ m0, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFD $-79, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPSHUFB c40, Y1, Y1; \ - VPADDQ m1, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFB c48, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPADDQ Y1, Y1, t; \ - VPSRLQ $63, Y1, Y1; \ - VPXOR t, Y1, Y1; \ - VPERMQ_0x39_Y1_Y1; \ - VPERMQ_0x4E_Y2_Y2; \ - VPERMQ_0x93_Y3_Y3; \ - VPADDQ m2, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFD $-79, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPSHUFB c40, Y1, Y1; \ - VPADDQ m3, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFB c48, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPADDQ Y1, Y1, t; \ - VPSRLQ $63, Y1, Y1; \ - VPXOR t, Y1, Y1; \ - VPERMQ_0x39_Y3_Y3; \ - VPERMQ_0x4E_Y2_Y2; \ - VPERMQ_0x93_Y1_Y1 - -#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E -#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26 -#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E -#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36 -#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E - -#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n -#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n -#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n -#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n -#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n - -#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01 -#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01 -#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01 -#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01 -#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01 - -#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01 - -#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8 -#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01 - -// load msg: Y12 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \ - VMOVQ_SI_X12(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X12(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y12, Y12 - -// load msg: Y13 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \ - VMOVQ_SI_X13(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X13(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y13, Y13 - -// load msg: Y14 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \ - VMOVQ_SI_X14(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X14(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y14, Y14 - -// load msg: Y15 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \ - VMOVQ_SI_X15(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X15(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \ - VMOVQ_SI_X12_0; \ - VMOVQ_SI_X11(4*8); \ - VPINSRQ_1_SI_X12(2*8); \ - VPINSRQ_1_SI_X11(6*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \ - LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \ - LOAD_MSG_AVX2_Y15(9, 11, 13, 15) - -#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \ - LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \ - LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \ - VMOVQ_SI_X11(11*8); \ - VPSHUFD $0x4E, 0*8(SI), X14; \ - VPINSRQ_1_SI_X11(5*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - LOAD_MSG_AVX2_Y15(12, 2, 7, 3) - -#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \ - VMOVQ_SI_X11(5*8); \ - VMOVDQU 11*8(SI), X12; \ - VPINSRQ_1_SI_X11(15*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - VMOVQ_SI_X13(8*8); \ - VMOVQ_SI_X11(2*8); \ - VPINSRQ_1_SI_X13_0; \ - VPINSRQ_1_SI_X11(13*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \ - LOAD_MSG_AVX2_Y15(14, 6, 1, 4) - -#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \ - LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \ - LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \ - LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \ - VMOVQ_SI_X15(6*8); \ - VMOVQ_SI_X11_0; \ - VPINSRQ_1_SI_X15(10*8); \ - VPINSRQ_1_SI_X11(8*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \ - LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \ - VMOVQ_SI_X13_0; \ - VMOVQ_SI_X11(4*8); \ - VPINSRQ_1_SI_X13(7*8); \ - VPINSRQ_1_SI_X11(15*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \ - LOAD_MSG_AVX2_Y15(1, 12, 8, 13) - -#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X11_0; \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X11(8*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \ - LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \ - LOAD_MSG_AVX2_Y15(13, 5, 14, 9) - -#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \ - LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \ - LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \ - VMOVQ_SI_X14_0; \ - VPSHUFD $0x4E, 8*8(SI), X11; \ - VPINSRQ_1_SI_X14(6*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - LOAD_MSG_AVX2_Y15(7, 3, 2, 11) - -#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \ - LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \ - LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \ - LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \ - VMOVQ_SI_X15_0; \ - VMOVQ_SI_X11(6*8); \ - VPINSRQ_1_SI_X15(4*8); \ - VPINSRQ_1_SI_X11(10*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \ - VMOVQ_SI_X12(6*8); \ - VMOVQ_SI_X11(11*8); \ - VPINSRQ_1_SI_X12(14*8); \ - VPINSRQ_1_SI_X11_0; \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \ - VMOVQ_SI_X11(1*8); \ - VMOVDQU 12*8(SI), X14; \ - VPINSRQ_1_SI_X11(10*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - VMOVQ_SI_X15(2*8); \ - VMOVDQU 4*8(SI), X11; \ - VPINSRQ_1_SI_X15(7*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \ - LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \ - VMOVQ_SI_X13(2*8); \ - VPSHUFD $0x4E, 5*8(SI), X11; \ - VPINSRQ_1_SI_X13(4*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \ - VMOVQ_SI_X15(11*8); \ - VMOVQ_SI_X11(12*8); \ - VPINSRQ_1_SI_X15(14*8); \ - VPINSRQ_1_SI_X11_0; \ - VINSERTI128 $1, X11, Y15, Y15 - // func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, DX - ADDQ $31, DX - ANDQ $~31, DX - - MOVQ CX, 16(DX) - XORQ CX, CX - MOVQ CX, 24(DX) - - VMOVDQU ·AVX2_c40<>(SB), Y4 - VMOVDQU ·AVX2_c48<>(SB), Y5 - - VMOVDQU 0(AX), Y8 +// Requires: AVX, AVX2 +TEXT ·hashBlocksAVX2(SB), NOSPLIT, $320-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, DX + ADDQ $+31, DX + ANDQ $-32, DX + MOVQ CX, 16(DX) + XORQ CX, CX + MOVQ CX, 24(DX) + VMOVDQU ·AVX2_c40<>+0(SB), Y4 + VMOVDQU ·AVX2_c48<>+0(SB), Y5 + VMOVDQU (AX), Y8 VMOVDQU 32(AX), Y9 - VMOVDQU ·AVX2_iv0<>(SB), Y6 - VMOVDQU ·AVX2_iv1<>(SB), Y7 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 - MOVQ R9, 8(DX) + VMOVDQU ·AVX2_iv0<>+0(SB), Y6 + VMOVDQU ·AVX2_iv1<>+0(SB), Y7 + MOVQ (BX), R8 + MOVQ 8(BX), R9 + MOVQ R9, 8(DX) loop: - ADDQ $128, R8 - MOVQ R8, 0(DX) - CMPQ R8, $128 + ADDQ $0x80, R8 + MOVQ R8, (DX) + CMPQ R8, $0x80 JGE noinc INCQ R9 MOVQ R9, 8(DX) noinc: - VMOVDQA Y8, Y0 - VMOVDQA Y9, Y1 - VMOVDQA Y6, Y2 - VPXOR 0(DX), Y7, Y3 - - LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() - VMOVDQA Y12, 32(DX) - VMOVDQA Y13, 64(DX) - VMOVDQA Y14, 96(DX) - VMOVDQA Y15, 128(DX) - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() - VMOVDQA Y12, 160(DX) - VMOVDQA Y13, 192(DX) - VMOVDQA Y14, 224(DX) - VMOVDQA Y15, 256(DX) - - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - - ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5) - ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5) - - VPXOR Y0, Y8, Y8 - VPXOR Y1, Y9, Y9 - VPXOR Y2, Y8, Y8 - VPXOR Y3, Y9, Y9 - - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop - - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) - - VMOVDQU Y8, 0(AX) - VMOVDQU Y9, 32(AX) + VMOVDQA Y8, Y0 + VMOVDQA Y9, Y1 + VMOVDQA Y6, Y2 + VPXOR (DX), Y7, Y3 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x26 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x28 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x38 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x70 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VMOVDQA Y12, 32(DX) + VMOVDQA Y13, 64(DX) + VMOVDQA Y14, 96(DX) + VMOVDQA Y15, 128(DX) + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x48 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + VPSHUFD $0x4e, (SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x28 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VMOVDQA Y12, 160(DX) + VMOVDQA Y13, 192(DX) + VMOVDQA Y14, 224(DX) + VMOVDQA Y15, 256(DX) + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x28 + VMOVDQU 88(SI), X12 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x2e + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x58 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x70 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x1e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x2e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x60 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x1e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x08 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x36 + VPSHUFD $0x4e, 64(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x58 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x10 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x3e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x1e + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x18 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + VMOVDQU 96(SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x10 + VMOVDQU 32(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x38 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x08 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x10 + VPSHUFD $0x4e, 40(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x18 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x1e + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPADDQ 32(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 64(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ 96(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 128(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPADDQ 160(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 192(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ 224(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 256(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPXOR Y0, Y8, Y8 + VPXOR Y1, Y9, Y9 + VPXOR Y2, Y8, Y8 + VPXOR Y3, Y9, Y9 + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + MOVQ R8, (BX) + MOVQ R9, 8(BX) + VMOVDQU Y8, (AX) + VMOVDQU Y9, 32(AX) VZEROUPPER - RET -#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA -#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB -#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF -#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD -#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE - -#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7 -#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF -#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7 -#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF -#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7 -#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7 -#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF -#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF - -#define SHUFFLE_AVX() \ - VMOVDQA X6, X13; \ - VMOVDQA X2, X14; \ - VMOVDQA X4, X6; \ - VPUNPCKLQDQ_X13_X13_X15; \ - VMOVDQA X5, X4; \ - VMOVDQA X6, X5; \ - VPUNPCKHQDQ_X15_X7_X6; \ - VPUNPCKLQDQ_X7_X7_X15; \ - VPUNPCKHQDQ_X15_X13_X7; \ - VPUNPCKLQDQ_X3_X3_X15; \ - VPUNPCKHQDQ_X15_X2_X2; \ - VPUNPCKLQDQ_X14_X14_X15; \ - VPUNPCKHQDQ_X15_X3_X3; \ - -#define SHUFFLE_AVX_INV() \ - VMOVDQA X2, X13; \ - VMOVDQA X4, X14; \ - VPUNPCKLQDQ_X2_X2_X15; \ - VMOVDQA X5, X4; \ - VPUNPCKHQDQ_X15_X3_X2; \ - VMOVDQA X14, X5; \ - VPUNPCKLQDQ_X3_X3_X15; \ - VMOVDQA X6, X14; \ - VPUNPCKHQDQ_X15_X13_X3; \ - VPUNPCKLQDQ_X7_X7_X15; \ - VPUNPCKHQDQ_X15_X6_X6; \ - VPUNPCKLQDQ_X14_X14_X15; \ - VPUNPCKHQDQ_X15_X7_X7; \ - -#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \ - VPADDQ m0, v0, v0; \ - VPADDQ v2, v0, v0; \ - VPADDQ m1, v1, v1; \ - VPADDQ v3, v1, v1; \ - VPXOR v0, v6, v6; \ - VPXOR v1, v7, v7; \ - VPSHUFD $-79, v6, v6; \ - VPSHUFD $-79, v7, v7; \ - VPADDQ v6, v4, v4; \ - VPADDQ v7, v5, v5; \ - VPXOR v4, v2, v2; \ - VPXOR v5, v3, v3; \ - VPSHUFB c40, v2, v2; \ - VPSHUFB c40, v3, v3; \ - VPADDQ m2, v0, v0; \ - VPADDQ v2, v0, v0; \ - VPADDQ m3, v1, v1; \ - VPADDQ v3, v1, v1; \ - VPXOR v0, v6, v6; \ - VPXOR v1, v7, v7; \ - VPSHUFB c48, v6, v6; \ - VPSHUFB c48, v7, v7; \ - VPADDQ v6, v4, v4; \ - VPADDQ v7, v5, v5; \ - VPXOR v4, v2, v2; \ - VPXOR v5, v3, v3; \ - VPADDQ v2, v2, t0; \ - VPSRLQ $63, v2, v2; \ - VPXOR t0, v2, v2; \ - VPADDQ v3, v3, t0; \ - VPSRLQ $63, v3, v3; \ - VPXOR t0, v3, v3 - -// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7) -// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0 -#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \ - VMOVQ_SI_X12(i0*8); \ - VMOVQ_SI_X13(i2*8); \ - VMOVQ_SI_X14(i4*8); \ - VMOVQ_SI_X15(i6*8); \ - VPINSRQ_1_SI_X12(i1*8); \ - VPINSRQ_1_SI_X13(i3*8); \ - VPINSRQ_1_SI_X14(i5*8); \ - VPINSRQ_1_SI_X15(i7*8) - -// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7) -#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \ - VMOVQ_SI_X12_0; \ - VMOVQ_SI_X13(4*8); \ - VMOVQ_SI_X14(1*8); \ - VMOVQ_SI_X15(5*8); \ - VPINSRQ_1_SI_X12(2*8); \ - VPINSRQ_1_SI_X13(6*8); \ - VPINSRQ_1_SI_X14(3*8); \ - VPINSRQ_1_SI_X15(7*8) - -// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3) -#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \ - VPSHUFD $0x4E, 0*8(SI), X12; \ - VMOVQ_SI_X13(11*8); \ - VMOVQ_SI_X14(12*8); \ - VMOVQ_SI_X15(7*8); \ - VPINSRQ_1_SI_X13(5*8); \ - VPINSRQ_1_SI_X14(2*8); \ - VPINSRQ_1_SI_X15(3*8) - -// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13) -#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \ - VMOVDQU 11*8(SI), X12; \ - VMOVQ_SI_X13(5*8); \ - VMOVQ_SI_X14(8*8); \ - VMOVQ_SI_X15(2*8); \ - VPINSRQ_1_SI_X13(15*8); \ - VPINSRQ_1_SI_X14_0; \ - VPINSRQ_1_SI_X15(13*8) - -// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8) -#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X13(4*8); \ - VMOVQ_SI_X14(6*8); \ - VMOVQ_SI_X15_0; \ - VPINSRQ_1_SI_X12(5*8); \ - VPINSRQ_1_SI_X13(15*8); \ - VPINSRQ_1_SI_X14(10*8); \ - VPINSRQ_1_SI_X15(8*8) +DATA ·AVX2_c40<>+0(SB)/8, $0x0201000706050403 +DATA ·AVX2_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +DATA ·AVX2_c40<>+16(SB)/8, $0x0201000706050403 +DATA ·AVX2_c40<>+24(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·AVX2_c40<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15) -#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \ - VMOVQ_SI_X12(9*8); \ - VMOVQ_SI_X13(2*8); \ - VMOVQ_SI_X14_0; \ - VMOVQ_SI_X15(4*8); \ - VPINSRQ_1_SI_X12(5*8); \ - VPINSRQ_1_SI_X13(10*8); \ - VPINSRQ_1_SI_X14(7*8); \ - VPINSRQ_1_SI_X15(15*8) +DATA ·AVX2_c48<>+0(SB)/8, $0x0100070605040302 +DATA ·AVX2_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +DATA ·AVX2_c48<>+16(SB)/8, $0x0100070605040302 +DATA ·AVX2_c48<>+24(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·AVX2_c48<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3) -#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X13_0; \ - VMOVQ_SI_X14(12*8); \ - VMOVQ_SI_X15(11*8); \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X13(8*8); \ - VPINSRQ_1_SI_X14(10*8); \ - VPINSRQ_1_SI_X15(3*8) +DATA ·AVX2_iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·AVX2_iv0<>+8(SB)/8, $0xbb67ae8584caa73b +DATA ·AVX2_iv0<>+16(SB)/8, $0x3c6ef372fe94f82b +DATA ·AVX2_iv0<>+24(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·AVX2_iv0<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11) -#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \ - MOVQ 0*8(SI), X12; \ - VPSHUFD $0x4E, 8*8(SI), X13; \ - MOVQ 7*8(SI), X14; \ - MOVQ 2*8(SI), X15; \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X14(3*8); \ - VPINSRQ_1_SI_X15(11*8) - -// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8) -#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \ - MOVQ 6*8(SI), X12; \ - MOVQ 11*8(SI), X13; \ - MOVQ 15*8(SI), X14; \ - MOVQ 3*8(SI), X15; \ - VPINSRQ_1_SI_X12(14*8); \ - VPINSRQ_1_SI_X13_0; \ - VPINSRQ_1_SI_X14(9*8); \ - VPINSRQ_1_SI_X15(8*8) - -// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10) -#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \ - MOVQ 5*8(SI), X12; \ - MOVQ 8*8(SI), X13; \ - MOVQ 0*8(SI), X14; \ - MOVQ 6*8(SI), X15; \ - VPINSRQ_1_SI_X12(15*8); \ - VPINSRQ_1_SI_X13(2*8); \ - VPINSRQ_1_SI_X14(4*8); \ - VPINSRQ_1_SI_X15(10*8) - -// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5) -#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \ - VMOVDQU 12*8(SI), X12; \ - MOVQ 1*8(SI), X13; \ - MOVQ 2*8(SI), X14; \ - VPINSRQ_1_SI_X13(10*8); \ - VPINSRQ_1_SI_X14(7*8); \ - VMOVDQU 4*8(SI), X15 - -// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0) -#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \ - MOVQ 15*8(SI), X12; \ - MOVQ 3*8(SI), X13; \ - MOVQ 11*8(SI), X14; \ - MOVQ 12*8(SI), X15; \ - VPINSRQ_1_SI_X12(9*8); \ - VPINSRQ_1_SI_X13(13*8); \ - VPINSRQ_1_SI_X14(14*8); \ - VPINSRQ_1_SI_X15_0 +DATA ·AVX2_iv1<>+0(SB)/8, $0x510e527fade682d1 +DATA ·AVX2_iv1<>+8(SB)/8, $0x9b05688c2b3e6c1f +DATA ·AVX2_iv1<>+16(SB)/8, $0x1f83d9abfb41bd6b +DATA ·AVX2_iv1<>+24(SB)/8, $0x5be0cd19137e2179 +GLOBL ·AVX2_iv1<>(SB), RODATA|NOPTR, $32 // func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, R10 - ADDQ $15, R10 - ANDQ $~15, R10 - - VMOVDQU ·AVX_c40<>(SB), X0 - VMOVDQU ·AVX_c48<>(SB), X1 +// Requires: AVX, SSE2 +TEXT ·hashBlocksAVX(SB), NOSPLIT, $288-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, R10 + ADDQ $0x0f, R10 + ANDQ $-16, R10 + VMOVDQU ·AVX_c40<>+0(SB), X0 + VMOVDQU ·AVX_c48<>+0(SB), X1 VMOVDQA X0, X8 VMOVDQA X1, X9 - - VMOVDQU ·AVX_iv3<>(SB), X0 - VMOVDQA X0, 0(R10) - XORQ CX, 0(R10) // 0(R10) = ·AVX_iv3 ^ (CX || 0) - - VMOVDQU 0(AX), X10 + VMOVDQU ·AVX_iv3<>+0(SB), X0 + VMOVDQA X0, (R10) + XORQ CX, (R10) + VMOVDQU (AX), X10 VMOVDQU 16(AX), X11 VMOVDQU 32(AX), X2 VMOVDQU 48(AX), X3 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 + MOVQ (BX), R8 + MOVQ 8(BX), R9 loop: - ADDQ $128, R8 - CMPQ R8, $128 + ADDQ $0x80, R8 + CMPQ R8, $0x80 JGE noinc INCQ R9 noinc: - VMOVQ_R8_X15 - VPINSRQ_1_R9_X15 - + BYTE $0xc4 + BYTE $0x41 + BYTE $0xf9 + BYTE $0x6e + BYTE $0xf8 + BYTE $0xc4 + BYTE $0x43 + BYTE $0x81 + BYTE $0x22 + BYTE $0xf9 + BYTE $0x01 VMOVDQA X10, X0 VMOVDQA X11, X1 - VMOVDQU ·AVX_iv0<>(SB), X4 - VMOVDQU ·AVX_iv1<>(SB), X5 - VMOVDQU ·AVX_iv2<>(SB), X6 - + VMOVDQU ·AVX_iv0<>+0(SB), X4 + VMOVDQU ·AVX_iv1<>+0(SB), X5 + VMOVDQU ·AVX_iv2<>+0(SB), X6 VPXOR X15, X6, X6 - VMOVDQA 0(R10), X7 - - LOAD_MSG_AVX_0_2_4_6_1_3_5_7() + VMOVDQA (R10), X7 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x26 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x28 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x38 + BYTE $0x01 VMOVDQA X12, 16(R10) VMOVDQA X13, 32(R10) VMOVDQA X14, 48(R10) VMOVDQA X15, 64(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15) + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x78 + BYTE $0x01 VMOVDQA X12, 80(R10) VMOVDQA X13, 96(R10) VMOVDQA X14, 112(R10) VMOVDQA X15, 128(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6) + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x68 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x30 + BYTE $0x01 VMOVDQA X12, 144(R10) VMOVDQA X13, 160(R10) VMOVDQA X14, 176(R10) VMOVDQA X15, 192(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_1_0_11_5_12_2_7_3() + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPSHUFD $0x4e, (SI), X12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 VMOVDQA X12, 208(R10) VMOVDQA X13, 224(R10) VMOVDQA X14, 240(R10) VMOVDQA X15, 256(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_11_12_5_15_8_0_2_13() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_2_5_4_15_6_10_0_8() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_9_5_2_10_0_7_4_15() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_2_6_0_8_12_10_11_3() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_0_6_9_8_7_3_2_11() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_5_15_8_2_0_4_6_10() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_6_14_11_0_15_9_3_8() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_12_13_1_10_2_7_4_5() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_15_9_3_13_11_14_12_0() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9) - SHUFFLE_AVX() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9) - SHUFFLE_AVX_INV() - - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9) - SHUFFLE_AVX() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9) - SHUFFLE_AVX_INV() - + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VMOVDQU 88(SI), X12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x36 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x68 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x20 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x70 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x3e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x40 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x36 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x78 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x60 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x68 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x2e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x48 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ (SI), X12 + VPSHUFD $0x4e, 64(SI), X13 + MOVQ 56(SI), X14 + MOVQ 16(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x58 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x48 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ 40(SI), X12 + MOVQ 64(SI), X13 + MOVQ (SI), X14 + MOVQ 48(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + MOVQ 48(SI), X12 + MOVQ 88(SI), X13 + MOVQ 120(SI), X14 + MOVQ 24(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x2e + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x40 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VMOVDQU 96(SI), X12 + MOVQ 8(SI), X13 + MOVQ 16(SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + VMOVDQU 32(SI), X15 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x28 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ 120(SI), X12 + MOVQ 24(SI), X13 + MOVQ 88(SI), X14 + MOVQ 96(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x68 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x3e + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VPADDQ 16(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 32(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 48(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 64(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPADDQ 80(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 96(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 112(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 128(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VPADDQ 144(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 160(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 176(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 192(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPADDQ 208(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 224(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 240(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 256(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff VMOVDQU 32(AX), X14 VMOVDQU 48(AX), X15 VPXOR X0, X10, X10 @@ -729,16 +4524,36 @@ noinc: VPXOR X7, X15, X3 VMOVDQU X2, 32(AX) VMOVDQU X3, 48(AX) + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + VMOVDQU X10, (AX) + VMOVDQU X11, 16(AX) + MOVQ R8, (BX) + MOVQ R9, 8(BX) + VZEROUPPER + RET - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop +DATA ·AVX_c40<>+0(SB)/8, $0x0201000706050403 +DATA ·AVX_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·AVX_c40<>(SB), RODATA|NOPTR, $16 - VMOVDQU X10, 0(AX) - VMOVDQU X11, 16(AX) +DATA ·AVX_c48<>+0(SB)/8, $0x0100070605040302 +DATA ·AVX_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·AVX_c48<>(SB), RODATA|NOPTR, $16 - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) - VZEROUPPER +DATA ·AVX_iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b +DATA ·AVX_iv3<>+8(SB)/8, $0x5be0cd19137e2179 +GLOBL ·AVX_iv3<>(SB), RODATA|NOPTR, $16 - RET +DATA ·AVX_iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·AVX_iv0<>+8(SB)/8, $0xbb67ae8584caa73b +GLOBL ·AVX_iv0<>(SB), RODATA|NOPTR, $16 + +DATA ·AVX_iv1<>+0(SB)/8, $0x3c6ef372fe94f82b +DATA ·AVX_iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·AVX_iv1<>(SB), RODATA|NOPTR, $16 + +DATA ·AVX_iv2<>+0(SB)/8, $0x510e527fade682d1 +DATA ·AVX_iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f +GLOBL ·AVX_iv2<>(SB), RODATA|NOPTR, $16 diff --git a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s index adfac00c15..9a0ce21244 100644 --- a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s +++ b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s @@ -1,278 +1,1441 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2b_amd64_asm.go -out ../../blake2b_amd64.s -pkg blake2b. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b -DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b -DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 -GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16 - -DATA ·c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·c48<>(SB), (NOPTR+RODATA), $16 - -#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v6, t1; \ - PUNPCKLQDQ v6, t2; \ - PUNPCKHQDQ v7, v6; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ v7, t2; \ - MOVO t1, v7; \ - MOVO v2, t1; \ - PUNPCKHQDQ t2, v7; \ - PUNPCKLQDQ v3, t2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v3 - -#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v2, t1; \ - PUNPCKLQDQ v2, t2; \ - PUNPCKHQDQ v3, v2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ v3, t2; \ - MOVO t1, v3; \ - MOVO v6, t1; \ - PUNPCKHQDQ t2, v3; \ - PUNPCKLQDQ v7, t2; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v7 - -#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \ - PADDQ m0, v0; \ - PADDQ m1, v1; \ - PADDQ v2, v0; \ - PADDQ v3, v1; \ - PXOR v0, v6; \ - PXOR v1, v7; \ - PSHUFD $0xB1, v6, v6; \ - PSHUFD $0xB1, v7, v7; \ - PADDQ v6, v4; \ - PADDQ v7, v5; \ - PXOR v4, v2; \ - PXOR v5, v3; \ - PSHUFB c40, v2; \ - PSHUFB c40, v3; \ - PADDQ m2, v0; \ - PADDQ m3, v1; \ - PADDQ v2, v0; \ - PADDQ v3, v1; \ - PXOR v0, v6; \ - PXOR v1, v7; \ - PSHUFB c48, v6; \ - PSHUFB c48, v7; \ - PADDQ v6, v4; \ - PADDQ v7, v5; \ - PXOR v4, v2; \ - PXOR v5, v3; \ - MOVOU v2, t0; \ - PADDQ v2, t0; \ - PSRLQ $63, v2; \ - PXOR t0, v2; \ - MOVOU v3, t0; \ - PADDQ v3, t0; \ - PSRLQ $63, v3; \ - PXOR t0, v3 - -#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \ - MOVQ i0*8(src), m0; \ - PINSRQ $1, i1*8(src), m0; \ - MOVQ i2*8(src), m1; \ - PINSRQ $1, i3*8(src), m1; \ - MOVQ i4*8(src), m2; \ - PINSRQ $1, i5*8(src), m2; \ - MOVQ i6*8(src), m3; \ - PINSRQ $1, i7*8(src), m3 - // func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, R10 - ADDQ $15, R10 - ANDQ $~15, R10 - - MOVOU ·iv3<>(SB), X0 - MOVO X0, 0(R10) - XORQ CX, 0(R10) // 0(R10) = ·iv3 ^ (CX || 0) - - MOVOU ·c40<>(SB), X13 - MOVOU ·c48<>(SB), X14 - - MOVOU 0(AX), X12 +// Requires: SSE2, SSE4.1, SSSE3 +TEXT ·hashBlocksSSE4(SB), NOSPLIT, $288-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, R10 + ADDQ $0x0f, R10 + ANDQ $-16, R10 + MOVOU ·iv3<>+0(SB), X0 + MOVO X0, (R10) + XORQ CX, (R10) + MOVOU ·c40<>+0(SB), X13 + MOVOU ·c48<>+0(SB), X14 + MOVOU (AX), X12 MOVOU 16(AX), X15 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 + MOVQ (BX), R8 + MOVQ 8(BX), R9 loop: - ADDQ $128, R8 - CMPQ R8, $128 + ADDQ $0x80, R8 + CMPQ R8, $0x80 JGE noinc INCQ R9 noinc: - MOVQ R8, X8 - PINSRQ $1, R9, X8 - - MOVO X12, X0 - MOVO X15, X1 - MOVOU 32(AX), X2 - MOVOU 48(AX), X3 - MOVOU ·iv0<>(SB), X4 - MOVOU ·iv1<>(SB), X5 - MOVOU ·iv2<>(SB), X6 - - PXOR X8, X6 - MOVO 0(R10), X7 - - LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7) - MOVO X8, 16(R10) - MOVO X9, 32(R10) - MOVO X10, 48(R10) - MOVO X11, 64(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15) - MOVO X8, 80(R10) - MOVO X9, 96(R10) - MOVO X10, 112(R10) - MOVO X11, 128(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6) - MOVO X8, 144(R10) - MOVO X9, 160(R10) - MOVO X10, 176(R10) - MOVO X11, 192(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3) - MOVO X8, 208(R10) - MOVO X9, 224(R10) - MOVO X10, 240(R10) - MOVO X11, 256(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + MOVQ R8, X8 + PINSRQ $0x01, R9, X8 + MOVO X12, X0 + MOVO X15, X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU ·iv0<>+0(SB), X4 + MOVOU ·iv1<>+0(SB), X5 + MOVOU ·iv2<>+0(SB), X6 + PXOR X8, X6 + MOVO (R10), X7 + MOVQ (SI), X8 + PINSRQ $0x01, 16(SI), X8 + MOVQ 32(SI), X9 + PINSRQ $0x01, 48(SI), X9 + MOVQ 8(SI), X10 + PINSRQ $0x01, 24(SI), X10 + MOVQ 40(SI), X11 + PINSRQ $0x01, 56(SI), X11 + MOVO X8, 16(R10) + MOVO X9, 32(R10) + MOVO X10, 48(R10) + MOVO X11, 64(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 64(SI), X8 + PINSRQ $0x01, 80(SI), X8 + MOVQ 96(SI), X9 + PINSRQ $0x01, 112(SI), X9 + MOVQ 72(SI), X10 + PINSRQ $0x01, 88(SI), X10 + MOVQ 104(SI), X11 + PINSRQ $0x01, 120(SI), X11 + MOVO X8, 80(R10) + MOVO X9, 96(R10) + MOVO X10, 112(R10) + MOVO X11, 128(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 112(SI), X8 + PINSRQ $0x01, 32(SI), X8 + MOVQ 72(SI), X9 + PINSRQ $0x01, 104(SI), X9 + MOVQ 80(SI), X10 + PINSRQ $0x01, 64(SI), X10 + MOVQ 120(SI), X11 + PINSRQ $0x01, 48(SI), X11 + MOVO X8, 144(R10) + MOVO X9, 160(R10) + MOVO X10, 176(R10) + MOVO X11, 192(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 8(SI), X8 + PINSRQ $0x01, (SI), X8 + MOVQ 88(SI), X9 + PINSRQ $0x01, 40(SI), X9 + MOVQ 96(SI), X10 + PINSRQ $0x01, 16(SI), X10 + MOVQ 56(SI), X11 + PINSRQ $0x01, 24(SI), X11 + MOVO X8, 208(R10) + MOVO X9, 224(R10) + MOVO X10, 240(R10) + MOVO X11, 256(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 88(SI), X8 + PINSRQ $0x01, 96(SI), X8 + MOVQ 40(SI), X9 + PINSRQ $0x01, 120(SI), X9 + MOVQ 64(SI), X10 + PINSRQ $0x01, (SI), X10 + MOVQ 16(SI), X11 + PINSRQ $0x01, 104(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 80(SI), X8 + PINSRQ $0x01, 24(SI), X8 + MOVQ 56(SI), X9 + PINSRQ $0x01, 72(SI), X9 + MOVQ 112(SI), X10 + PINSRQ $0x01, 48(SI), X10 + MOVQ 8(SI), X11 + PINSRQ $0x01, 32(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 56(SI), X8 + PINSRQ $0x01, 24(SI), X8 + MOVQ 104(SI), X9 + PINSRQ $0x01, 88(SI), X9 + MOVQ 72(SI), X10 + PINSRQ $0x01, 8(SI), X10 + MOVQ 96(SI), X11 + PINSRQ $0x01, 112(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 16(SI), X8 + PINSRQ $0x01, 40(SI), X8 + MOVQ 32(SI), X9 + PINSRQ $0x01, 120(SI), X9 + MOVQ 48(SI), X10 + PINSRQ $0x01, 80(SI), X10 + MOVQ (SI), X11 + PINSRQ $0x01, 64(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 72(SI), X8 + PINSRQ $0x01, 40(SI), X8 + MOVQ 16(SI), X9 + PINSRQ $0x01, 80(SI), X9 + MOVQ (SI), X10 + PINSRQ $0x01, 56(SI), X10 + MOVQ 32(SI), X11 + PINSRQ $0x01, 120(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 112(SI), X8 + PINSRQ $0x01, 88(SI), X8 + MOVQ 48(SI), X9 + PINSRQ $0x01, 24(SI), X9 + MOVQ 8(SI), X10 + PINSRQ $0x01, 96(SI), X10 + MOVQ 64(SI), X11 + PINSRQ $0x01, 104(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 16(SI), X8 + PINSRQ $0x01, 48(SI), X8 + MOVQ (SI), X9 + PINSRQ $0x01, 64(SI), X9 + MOVQ 96(SI), X10 + PINSRQ $0x01, 80(SI), X10 + MOVQ 88(SI), X11 + PINSRQ $0x01, 24(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 32(SI), X8 + PINSRQ $0x01, 56(SI), X8 + MOVQ 120(SI), X9 + PINSRQ $0x01, 8(SI), X9 + MOVQ 104(SI), X10 + PINSRQ $0x01, 40(SI), X10 + MOVQ 112(SI), X11 + PINSRQ $0x01, 72(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 96(SI), X8 + PINSRQ $0x01, 8(SI), X8 + MOVQ 112(SI), X9 + PINSRQ $0x01, 32(SI), X9 + MOVQ 40(SI), X10 + PINSRQ $0x01, 120(SI), X10 + MOVQ 104(SI), X11 + PINSRQ $0x01, 80(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ (SI), X8 + PINSRQ $0x01, 48(SI), X8 + MOVQ 72(SI), X9 + PINSRQ $0x01, 64(SI), X9 + MOVQ 56(SI), X10 + PINSRQ $0x01, 24(SI), X10 + MOVQ 16(SI), X11 + PINSRQ $0x01, 88(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 104(SI), X8 + PINSRQ $0x01, 56(SI), X8 + MOVQ 96(SI), X9 + PINSRQ $0x01, 24(SI), X9 + MOVQ 88(SI), X10 + PINSRQ $0x01, 112(SI), X10 + MOVQ 8(SI), X11 + PINSRQ $0x01, 72(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 40(SI), X8 + PINSRQ $0x01, 120(SI), X8 + MOVQ 64(SI), X9 + PINSRQ $0x01, 16(SI), X9 + MOVQ (SI), X10 + PINSRQ $0x01, 32(SI), X10 + MOVQ 48(SI), X11 + PINSRQ $0x01, 80(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 48(SI), X8 + PINSRQ $0x01, 112(SI), X8 + MOVQ 88(SI), X9 + PINSRQ $0x01, (SI), X9 + MOVQ 120(SI), X10 + PINSRQ $0x01, 72(SI), X10 + MOVQ 24(SI), X11 + PINSRQ $0x01, 64(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 96(SI), X8 + PINSRQ $0x01, 104(SI), X8 + MOVQ 8(SI), X9 + PINSRQ $0x01, 80(SI), X9 + MOVQ 16(SI), X10 + PINSRQ $0x01, 56(SI), X10 + MOVQ 32(SI), X11 + PINSRQ $0x01, 40(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 80(SI), X8 + PINSRQ $0x01, 64(SI), X8 + MOVQ 56(SI), X9 + PINSRQ $0x01, 8(SI), X9 + MOVQ 16(SI), X10 + PINSRQ $0x01, 32(SI), X10 + MOVQ 48(SI), X11 + PINSRQ $0x01, 40(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 120(SI), X8 + PINSRQ $0x01, 72(SI), X8 + MOVQ 24(SI), X9 + PINSRQ $0x01, 104(SI), X9 + MOVQ 88(SI), X10 + PINSRQ $0x01, 112(SI), X10 + MOVQ 96(SI), X11 + PINSRQ $0x01, (SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + PADDQ 16(R10), X0 + PADDQ 32(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 48(R10), X0 + PADDQ 64(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + PADDQ 80(R10), X0 + PADDQ 96(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 112(R10), X0 + PADDQ 128(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + PADDQ 144(R10), X0 + PADDQ 160(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 176(R10), X0 + PADDQ 192(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + PADDQ 208(R10), X0 + PADDQ 224(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 240(R10), X0 + PADDQ 256(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU 32(AX), X10 + MOVOU 48(AX), X11 + PXOR X0, X12 + PXOR X1, X15 + PXOR X2, X10 + PXOR X3, X11 + PXOR X4, X12 + PXOR X5, X15 + PXOR X6, X10 + PXOR X7, X11 + MOVOU X10, 32(AX) + MOVOU X11, 48(AX) + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + MOVOU X12, (AX) + MOVOU X15, 16(AX) + MOVQ R8, (BX) + MOVQ R9, 8(BX) + RET - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) +DATA ·iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b +DATA ·iv3<>+8(SB)/8, $0x5be0cd19137e2179 +GLOBL ·iv3<>(SB), RODATA|NOPTR, $16 - MOVOU 32(AX), X10 - MOVOU 48(AX), X11 - PXOR X0, X12 - PXOR X1, X15 - PXOR X2, X10 - PXOR X3, X11 - PXOR X4, X12 - PXOR X5, X15 - PXOR X6, X10 - PXOR X7, X11 - MOVOU X10, 32(AX) - MOVOU X11, 48(AX) +DATA ·c40<>+0(SB)/8, $0x0201000706050403 +DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·c40<>(SB), RODATA|NOPTR, $16 - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop +DATA ·c48<>+0(SB)/8, $0x0100070605040302 +DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·c48<>(SB), RODATA|NOPTR, $16 - MOVOU X12, 0(AX) - MOVOU X15, 16(AX) +DATA ·iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·iv0<>+8(SB)/8, $0xbb67ae8584caa73b +GLOBL ·iv0<>(SB), RODATA|NOPTR, $16 - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) +DATA ·iv1<>+0(SB)/8, $0x3c6ef372fe94f82b +DATA ·iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·iv1<>(SB), RODATA|NOPTR, $16 - RET +DATA ·iv2<>+0(SB)/8, $0x510e527fade682d1 +DATA ·iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f +GLOBL ·iv2<>(SB), RODATA|NOPTR, $16 diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s index e0d3c64756..133757384b 100644 --- a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s +++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s @@ -1,108 +1,93 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT. //go:build gc && !purego -#include "textflag.h" - -#define POLY1305_ADD(msg, h0, h1, h2) \ - ADDQ 0(msg), h0; \ - ADCQ 8(msg), h1; \ - ADCQ $1, h2; \ - LEAQ 16(msg), msg - -#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \ - MOVQ r0, AX; \ - MULQ h0; \ - MOVQ AX, t0; \ - MOVQ DX, t1; \ - MOVQ r0, AX; \ - MULQ h1; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ r0, t2; \ - IMULQ h2, t2; \ - ADDQ DX, t2; \ - \ - MOVQ r1, AX; \ - MULQ h0; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ DX, h0; \ - MOVQ r1, t3; \ - IMULQ h2, t3; \ - MOVQ r1, AX; \ - MULQ h1; \ - ADDQ AX, t2; \ - ADCQ DX, t3; \ - ADDQ h0, t2; \ - ADCQ $0, t3; \ - \ - MOVQ t0, h0; \ - MOVQ t1, h1; \ - MOVQ t2, h2; \ - ANDQ $3, h2; \ - MOVQ t2, t0; \ - ANDQ $0xFFFFFFFFFFFFFFFC, t0; \ - ADDQ t0, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2; \ - SHRQ $2, t3, t2; \ - SHRQ $2, t3; \ - ADDQ t2, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2 - -// func update(state *[7]uint64, msg []byte) +// func update(state *macState, msg []byte) TEXT ·update(SB), $0-32 MOVQ state+0(FP), DI MOVQ msg_base+8(FP), SI MOVQ msg_len+16(FP), R15 - - MOVQ 0(DI), R8 // h0 - MOVQ 8(DI), R9 // h1 - MOVQ 16(DI), R10 // h2 - MOVQ 24(DI), R11 // r0 - MOVQ 32(DI), R12 // r1 - - CMPQ R15, $16 + MOVQ (DI), R8 + MOVQ 8(DI), R9 + MOVQ 16(DI), R10 + MOVQ 24(DI), R11 + MOVQ 32(DI), R12 + CMPQ R15, $0x10 JB bytes_between_0_and_15 loop: - POLY1305_ADD(SI, R8, R9, R10) + ADDQ (SI), R8 + ADCQ 8(SI), R9 + ADCQ $0x01, R10 + LEAQ 16(SI), SI multiply: - POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14) - SUBQ $16, R15 - CMPQ R15, $16 - JAE loop + MOVQ R11, AX + MULQ R8 + MOVQ AX, BX + MOVQ DX, CX + MOVQ R11, AX + MULQ R9 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ R11, R13 + IMULQ R10, R13 + ADDQ DX, R13 + MOVQ R12, AX + MULQ R8 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ DX, R8 + MOVQ R12, R14 + IMULQ R10, R14 + MOVQ R12, AX + MULQ R9 + ADDQ AX, R13 + ADCQ DX, R14 + ADDQ R8, R13 + ADCQ $0x00, R14 + MOVQ BX, R8 + MOVQ CX, R9 + MOVQ R13, R10 + ANDQ $0x03, R10 + MOVQ R13, BX + ANDQ $-4, BX + ADDQ BX, R8 + ADCQ R14, R9 + ADCQ $0x00, R10 + SHRQ $0x02, R14, R13 + SHRQ $0x02, R14 + ADDQ R13, R8 + ADCQ R14, R9 + ADCQ $0x00, R10 + SUBQ $0x10, R15 + CMPQ R15, $0x10 + JAE loop bytes_between_0_and_15: TESTQ R15, R15 JZ done - MOVQ $1, BX + MOVQ $0x00000001, BX XORQ CX, CX XORQ R13, R13 ADDQ R15, SI flush_buffer: - SHLQ $8, BX, CX - SHLQ $8, BX + SHLQ $0x08, BX, CX + SHLQ $0x08, BX MOVB -1(SI), R13 XORQ R13, BX DECQ SI DECQ R15 JNZ flush_buffer - ADDQ BX, R8 ADCQ CX, R9 - ADCQ $0, R10 - MOVQ $16, R15 + ADCQ $0x00, R10 + MOVQ $0x00000010, R15 JMP multiply done: - MOVQ R8, 0(DI) + MOVQ R8, (DI) MOVQ R9, 8(DI) MOVQ R10, 16(DI) RET diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s index fcce0234b6..3883e0ec22 100644 --- a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s +++ b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s @@ -1,880 +1,880 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run salsa20_amd64_asm.go -out ../salsa20_amd64.s -pkg salsa. DO NOT EDIT. //go:build amd64 && !purego && gc -// This code was translated into a form compatible with 6a from the public -// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html +// func salsa2020XORKeyStream(out *byte, in *byte, n uint64, nonce *byte, key *byte) +// Requires: SSE2 +TEXT ·salsa2020XORKeyStream(SB), $456-40 + // This needs up to 64 bytes at 360(R12); hence the non-obvious frame size. + MOVQ out+0(FP), DI + MOVQ in+8(FP), SI + MOVQ n+16(FP), DX + MOVQ nonce+24(FP), CX + MOVQ key+32(FP), R8 + MOVQ SP, R12 + ADDQ $0x1f, R12 + ANDQ $-32, R12 + MOVQ DX, R9 + MOVQ CX, DX + MOVQ R8, R10 + CMPQ R9, $0x00 + JBE DONE + MOVL 20(R10), CX + MOVL (R10), R8 + MOVL (DX), AX + MOVL 16(R10), R11 + MOVL CX, (R12) + MOVL R8, 4(R12) + MOVL AX, 8(R12) + MOVL R11, 12(R12) + MOVL 8(DX), CX + MOVL 24(R10), R8 + MOVL 4(R10), AX + MOVL 4(DX), R11 + MOVL CX, 16(R12) + MOVL R8, 20(R12) + MOVL AX, 24(R12) + MOVL R11, 28(R12) + MOVL 12(DX), CX + MOVL 12(R10), DX + MOVL 28(R10), R8 + MOVL 8(R10), AX + MOVL DX, 32(R12) + MOVL CX, 36(R12) + MOVL R8, 40(R12) + MOVL AX, 44(R12) + MOVQ $0x61707865, DX + MOVQ $0x3320646e, CX + MOVQ $0x79622d32, R8 + MOVQ $0x6b206574, AX + MOVL DX, 48(R12) + MOVL CX, 52(R12) + MOVL R8, 56(R12) + MOVL AX, 60(R12) + CMPQ R9, $0x00000100 + JB BYTESBETWEEN1AND255 + MOVOA 48(R12), X0 + PSHUFL $0x55, X0, X1 + PSHUFL $0xaa, X0, X2 + PSHUFL $0xff, X0, X3 + PSHUFL $0x00, X0, X0 + MOVOA X1, 64(R12) + MOVOA X2, 80(R12) + MOVOA X3, 96(R12) + MOVOA X0, 112(R12) + MOVOA (R12), X0 + PSHUFL $0xaa, X0, X1 + PSHUFL $0xff, X0, X2 + PSHUFL $0x00, X0, X3 + PSHUFL $0x55, X0, X0 + MOVOA X1, 128(R12) + MOVOA X2, 144(R12) + MOVOA X3, 160(R12) + MOVOA X0, 176(R12) + MOVOA 16(R12), X0 + PSHUFL $0xff, X0, X1 + PSHUFL $0x55, X0, X2 + PSHUFL $0xaa, X0, X0 + MOVOA X1, 192(R12) + MOVOA X2, 208(R12) + MOVOA X0, 224(R12) + MOVOA 32(R12), X0 + PSHUFL $0x00, X0, X1 + PSHUFL $0xaa, X0, X2 + PSHUFL $0xff, X0, X0 + MOVOA X1, 240(R12) + MOVOA X2, 256(R12) + MOVOA X0, 272(R12) -// func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte) -// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size. -TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment - MOVQ out+0(FP),DI - MOVQ in+8(FP),SI - MOVQ n+16(FP),DX - MOVQ nonce+24(FP),CX - MOVQ key+32(FP),R8 +BYTESATLEAST256: + MOVL 16(R12), DX + MOVL 36(R12), CX + MOVL DX, 288(R12) + MOVL CX, 304(R12) + SHLQ $0x20, CX + ADDQ CX, DX + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 292(R12) + MOVL CX, 308(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 296(R12) + MOVL CX, 312(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 300(R12) + MOVL CX, 316(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 16(R12) + MOVL CX, 36(R12) + MOVQ R9, 352(R12) + MOVQ $0x00000014, DX + MOVOA 64(R12), X0 + MOVOA 80(R12), X1 + MOVOA 96(R12), X2 + MOVOA 256(R12), X3 + MOVOA 272(R12), X4 + MOVOA 128(R12), X5 + MOVOA 144(R12), X6 + MOVOA 176(R12), X7 + MOVOA 192(R12), X8 + MOVOA 208(R12), X9 + MOVOA 224(R12), X10 + MOVOA 304(R12), X11 + MOVOA 112(R12), X12 + MOVOA 160(R12), X13 + MOVOA 240(R12), X14 + MOVOA 288(R12), X15 - MOVQ SP,R12 - ADDQ $31, R12 - ANDQ $~31, R12 +MAINLOOP1: + MOVOA X1, 320(R12) + MOVOA X2, 336(R12) + MOVOA X13, X1 + PADDL X12, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X14 + PSRLL $0x19, X2 + PXOR X2, X14 + MOVOA X7, X1 + PADDL X0, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X11 + PSRLL $0x19, X2 + PXOR X2, X11 + MOVOA X12, X1 + PADDL X14, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X15 + PSRLL $0x17, X2 + PXOR X2, X15 + MOVOA X0, X1 + PADDL X11, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X9 + PSRLL $0x17, X2 + PXOR X2, X9 + MOVOA X14, X1 + PADDL X15, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X13 + PSRLL $0x13, X2 + PXOR X2, X13 + MOVOA X11, X1 + PADDL X9, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X7 + PSRLL $0x13, X2 + PXOR X2, X7 + MOVOA X15, X1 + PADDL X13, X1 + MOVOA X1, X2 + PSLLL $0x12, X1 + PXOR X1, X12 + PSRLL $0x0e, X2 + PXOR X2, X12 + MOVOA 320(R12), X1 + MOVOA X12, 320(R12) + MOVOA X9, X2 + PADDL X7, X2 + MOVOA X2, X12 + PSLLL $0x12, X2 + PXOR X2, X0 + PSRLL $0x0e, X12 + PXOR X12, X0 + MOVOA X5, X2 + PADDL X1, X2 + MOVOA X2, X12 + PSLLL $0x07, X2 + PXOR X2, X3 + PSRLL $0x19, X12 + PXOR X12, X3 + MOVOA 336(R12), X2 + MOVOA X0, 336(R12) + MOVOA X6, X0 + PADDL X2, X0 + MOVOA X0, X12 + PSLLL $0x07, X0 + PXOR X0, X4 + PSRLL $0x19, X12 + PXOR X12, X4 + MOVOA X1, X0 + PADDL X3, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X10 + PSRLL $0x17, X12 + PXOR X12, X10 + MOVOA X2, X0 + PADDL X4, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X8 + PSRLL $0x17, X12 + PXOR X12, X8 + MOVOA X3, X0 + PADDL X10, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X5 + PSRLL $0x13, X12 + PXOR X12, X5 + MOVOA X4, X0 + PADDL X8, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X6 + PSRLL $0x13, X12 + PXOR X12, X6 + MOVOA X10, X0 + PADDL X5, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X1 + PSRLL $0x0e, X12 + PXOR X12, X1 + MOVOA 320(R12), X0 + MOVOA X1, 320(R12) + MOVOA X4, X1 + PADDL X0, X1 + MOVOA X1, X12 + PSLLL $0x07, X1 + PXOR X1, X7 + PSRLL $0x19, X12 + PXOR X12, X7 + MOVOA X8, X1 + PADDL X6, X1 + MOVOA X1, X12 + PSLLL $0x12, X1 + PXOR X1, X2 + PSRLL $0x0e, X12 + PXOR X12, X2 + MOVOA 336(R12), X12 + MOVOA X2, 336(R12) + MOVOA X14, X1 + PADDL X12, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X5 + PSRLL $0x19, X2 + PXOR X2, X5 + MOVOA X0, X1 + PADDL X7, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X10 + PSRLL $0x17, X2 + PXOR X2, X10 + MOVOA X12, X1 + PADDL X5, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X8 + PSRLL $0x17, X2 + PXOR X2, X8 + MOVOA X7, X1 + PADDL X10, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X4 + PSRLL $0x13, X2 + PXOR X2, X4 + MOVOA X5, X1 + PADDL X8, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X14 + PSRLL $0x13, X2 + PXOR X2, X14 + MOVOA X10, X1 + PADDL X4, X1 + MOVOA X1, X2 + PSLLL $0x12, X1 + PXOR X1, X0 + PSRLL $0x0e, X2 + PXOR X2, X0 + MOVOA 320(R12), X1 + MOVOA X0, 320(R12) + MOVOA X8, X0 + PADDL X14, X0 + MOVOA X0, X2 + PSLLL $0x12, X0 + PXOR X0, X12 + PSRLL $0x0e, X2 + PXOR X2, X12 + MOVOA X11, X0 + PADDL X1, X0 + MOVOA X0, X2 + PSLLL $0x07, X0 + PXOR X0, X6 + PSRLL $0x19, X2 + PXOR X2, X6 + MOVOA 336(R12), X2 + MOVOA X12, 336(R12) + MOVOA X3, X0 + PADDL X2, X0 + MOVOA X0, X12 + PSLLL $0x07, X0 + PXOR X0, X13 + PSRLL $0x19, X12 + PXOR X12, X13 + MOVOA X1, X0 + PADDL X6, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X15 + PSRLL $0x17, X12 + PXOR X12, X15 + MOVOA X2, X0 + PADDL X13, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X9 + PSRLL $0x17, X12 + PXOR X12, X9 + MOVOA X6, X0 + PADDL X15, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X11 + PSRLL $0x13, X12 + PXOR X12, X11 + MOVOA X13, X0 + PADDL X9, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X3 + PSRLL $0x13, X12 + PXOR X12, X3 + MOVOA X15, X0 + PADDL X11, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X1 + PSRLL $0x0e, X12 + PXOR X12, X1 + MOVOA X9, X0 + PADDL X3, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X2 + PSRLL $0x0e, X12 + PXOR X12, X2 + MOVOA 320(R12), X12 + MOVOA 336(R12), X0 + SUBQ $0x02, DX + JA MAINLOOP1 + PADDL 112(R12), X12 + PADDL 176(R12), X7 + PADDL 224(R12), X10 + PADDL 272(R12), X4 + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL (SI), DX + XORL 4(SI), CX + XORL 8(SI), R8 + XORL 12(SI), R9 + MOVL DX, (DI) + MOVL CX, 4(DI) + MOVL R8, 8(DI) + MOVL R9, 12(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL 64(SI), DX + XORL 68(SI), CX + XORL 72(SI), R8 + XORL 76(SI), R9 + MOVL DX, 64(DI) + MOVL CX, 68(DI) + MOVL R8, 72(DI) + MOVL R9, 76(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL 128(SI), DX + XORL 132(SI), CX + XORL 136(SI), R8 + XORL 140(SI), R9 + MOVL DX, 128(DI) + MOVL CX, 132(DI) + MOVL R8, 136(DI) + MOVL R9, 140(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + XORL 192(SI), DX + XORL 196(SI), CX + XORL 200(SI), R8 + XORL 204(SI), R9 + MOVL DX, 192(DI) + MOVL CX, 196(DI) + MOVL R8, 200(DI) + MOVL R9, 204(DI) + PADDL 240(R12), X14 + PADDL 64(R12), X0 + PADDL 128(R12), X5 + PADDL 192(R12), X8 + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 16(SI), DX + XORL 20(SI), CX + XORL 24(SI), R8 + XORL 28(SI), R9 + MOVL DX, 16(DI) + MOVL CX, 20(DI) + MOVL R8, 24(DI) + MOVL R9, 28(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 80(SI), DX + XORL 84(SI), CX + XORL 88(SI), R8 + XORL 92(SI), R9 + MOVL DX, 80(DI) + MOVL CX, 84(DI) + MOVL R8, 88(DI) + MOVL R9, 92(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 144(SI), DX + XORL 148(SI), CX + XORL 152(SI), R8 + XORL 156(SI), R9 + MOVL DX, 144(DI) + MOVL CX, 148(DI) + MOVL R8, 152(DI) + MOVL R9, 156(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + XORL 208(SI), DX + XORL 212(SI), CX + XORL 216(SI), R8 + XORL 220(SI), R9 + MOVL DX, 208(DI) + MOVL CX, 212(DI) + MOVL R8, 216(DI) + MOVL R9, 220(DI) + PADDL 288(R12), X15 + PADDL 304(R12), X11 + PADDL 80(R12), X1 + PADDL 144(R12), X6 + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 32(SI), DX + XORL 36(SI), CX + XORL 40(SI), R8 + XORL 44(SI), R9 + MOVL DX, 32(DI) + MOVL CX, 36(DI) + MOVL R8, 40(DI) + MOVL R9, 44(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 96(SI), DX + XORL 100(SI), CX + XORL 104(SI), R8 + XORL 108(SI), R9 + MOVL DX, 96(DI) + MOVL CX, 100(DI) + MOVL R8, 104(DI) + MOVL R9, 108(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 160(SI), DX + XORL 164(SI), CX + XORL 168(SI), R8 + XORL 172(SI), R9 + MOVL DX, 160(DI) + MOVL CX, 164(DI) + MOVL R8, 168(DI) + MOVL R9, 172(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + XORL 224(SI), DX + XORL 228(SI), CX + XORL 232(SI), R8 + XORL 236(SI), R9 + MOVL DX, 224(DI) + MOVL CX, 228(DI) + MOVL R8, 232(DI) + MOVL R9, 236(DI) + PADDL 160(R12), X13 + PADDL 208(R12), X9 + PADDL 256(R12), X3 + PADDL 96(R12), X2 + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 48(SI), DX + XORL 52(SI), CX + XORL 56(SI), R8 + XORL 60(SI), R9 + MOVL DX, 48(DI) + MOVL CX, 52(DI) + MOVL R8, 56(DI) + MOVL R9, 60(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 112(SI), DX + XORL 116(SI), CX + XORL 120(SI), R8 + XORL 124(SI), R9 + MOVL DX, 112(DI) + MOVL CX, 116(DI) + MOVL R8, 120(DI) + MOVL R9, 124(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 176(SI), DX + XORL 180(SI), CX + XORL 184(SI), R8 + XORL 188(SI), R9 + MOVL DX, 176(DI) + MOVL CX, 180(DI) + MOVL R8, 184(DI) + MOVL R9, 188(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + XORL 240(SI), DX + XORL 244(SI), CX + XORL 248(SI), R8 + XORL 252(SI), R9 + MOVL DX, 240(DI) + MOVL CX, 244(DI) + MOVL R8, 248(DI) + MOVL R9, 252(DI) + MOVQ 352(R12), R9 + SUBQ $0x00000100, R9 + ADDQ $0x00000100, SI + ADDQ $0x00000100, DI + CMPQ R9, $0x00000100 + JAE BYTESATLEAST256 + CMPQ R9, $0x00 + JBE DONE - MOVQ DX,R9 - MOVQ CX,DX - MOVQ R8,R10 - CMPQ R9,$0 - JBE DONE - START: - MOVL 20(R10),CX - MOVL 0(R10),R8 - MOVL 0(DX),AX - MOVL 16(R10),R11 - MOVL CX,0(R12) - MOVL R8, 4 (R12) - MOVL AX, 8 (R12) - MOVL R11, 12 (R12) - MOVL 8(DX),CX - MOVL 24(R10),R8 - MOVL 4(R10),AX - MOVL 4(DX),R11 - MOVL CX,16(R12) - MOVL R8, 20 (R12) - MOVL AX, 24 (R12) - MOVL R11, 28 (R12) - MOVL 12(DX),CX - MOVL 12(R10),DX - MOVL 28(R10),R8 - MOVL 8(R10),AX - MOVL DX,32(R12) - MOVL CX, 36 (R12) - MOVL R8, 40 (R12) - MOVL AX, 44 (R12) - MOVQ $1634760805,DX - MOVQ $857760878,CX - MOVQ $2036477234,R8 - MOVQ $1797285236,AX - MOVL DX,48(R12) - MOVL CX, 52 (R12) - MOVL R8, 56 (R12) - MOVL AX, 60 (R12) - CMPQ R9,$256 - JB BYTESBETWEEN1AND255 - MOVOA 48(R12),X0 - PSHUFL $0X55,X0,X1 - PSHUFL $0XAA,X0,X2 - PSHUFL $0XFF,X0,X3 - PSHUFL $0X00,X0,X0 - MOVOA X1,64(R12) - MOVOA X2,80(R12) - MOVOA X3,96(R12) - MOVOA X0,112(R12) - MOVOA 0(R12),X0 - PSHUFL $0XAA,X0,X1 - PSHUFL $0XFF,X0,X2 - PSHUFL $0X00,X0,X3 - PSHUFL $0X55,X0,X0 - MOVOA X1,128(R12) - MOVOA X2,144(R12) - MOVOA X3,160(R12) - MOVOA X0,176(R12) - MOVOA 16(R12),X0 - PSHUFL $0XFF,X0,X1 - PSHUFL $0X55,X0,X2 - PSHUFL $0XAA,X0,X0 - MOVOA X1,192(R12) - MOVOA X2,208(R12) - MOVOA X0,224(R12) - MOVOA 32(R12),X0 - PSHUFL $0X00,X0,X1 - PSHUFL $0XAA,X0,X2 - PSHUFL $0XFF,X0,X0 - MOVOA X1,240(R12) - MOVOA X2,256(R12) - MOVOA X0,272(R12) - BYTESATLEAST256: - MOVL 16(R12),DX - MOVL 36 (R12),CX - MOVL DX,288(R12) - MOVL CX,304(R12) - SHLQ $32,CX - ADDQ CX,DX - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 292 (R12) - MOVL CX, 308 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 296 (R12) - MOVL CX, 312 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 300 (R12) - MOVL CX, 316 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX,16(R12) - MOVL CX, 36 (R12) - MOVQ R9,352(R12) - MOVQ $20,DX - MOVOA 64(R12),X0 - MOVOA 80(R12),X1 - MOVOA 96(R12),X2 - MOVOA 256(R12),X3 - MOVOA 272(R12),X4 - MOVOA 128(R12),X5 - MOVOA 144(R12),X6 - MOVOA 176(R12),X7 - MOVOA 192(R12),X8 - MOVOA 208(R12),X9 - MOVOA 224(R12),X10 - MOVOA 304(R12),X11 - MOVOA 112(R12),X12 - MOVOA 160(R12),X13 - MOVOA 240(R12),X14 - MOVOA 288(R12),X15 - MAINLOOP1: - MOVOA X1,320(R12) - MOVOA X2,336(R12) - MOVOA X13,X1 - PADDL X12,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X14 - PSRLL $25,X2 - PXOR X2,X14 - MOVOA X7,X1 - PADDL X0,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X11 - PSRLL $25,X2 - PXOR X2,X11 - MOVOA X12,X1 - PADDL X14,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X15 - PSRLL $23,X2 - PXOR X2,X15 - MOVOA X0,X1 - PADDL X11,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X9 - PSRLL $23,X2 - PXOR X2,X9 - MOVOA X14,X1 - PADDL X15,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X13 - PSRLL $19,X2 - PXOR X2,X13 - MOVOA X11,X1 - PADDL X9,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X7 - PSRLL $19,X2 - PXOR X2,X7 - MOVOA X15,X1 - PADDL X13,X1 - MOVOA X1,X2 - PSLLL $18,X1 - PXOR X1,X12 - PSRLL $14,X2 - PXOR X2,X12 - MOVOA 320(R12),X1 - MOVOA X12,320(R12) - MOVOA X9,X2 - PADDL X7,X2 - MOVOA X2,X12 - PSLLL $18,X2 - PXOR X2,X0 - PSRLL $14,X12 - PXOR X12,X0 - MOVOA X5,X2 - PADDL X1,X2 - MOVOA X2,X12 - PSLLL $7,X2 - PXOR X2,X3 - PSRLL $25,X12 - PXOR X12,X3 - MOVOA 336(R12),X2 - MOVOA X0,336(R12) - MOVOA X6,X0 - PADDL X2,X0 - MOVOA X0,X12 - PSLLL $7,X0 - PXOR X0,X4 - PSRLL $25,X12 - PXOR X12,X4 - MOVOA X1,X0 - PADDL X3,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X10 - PSRLL $23,X12 - PXOR X12,X10 - MOVOA X2,X0 - PADDL X4,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X8 - PSRLL $23,X12 - PXOR X12,X8 - MOVOA X3,X0 - PADDL X10,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X5 - PSRLL $19,X12 - PXOR X12,X5 - MOVOA X4,X0 - PADDL X8,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X6 - PSRLL $19,X12 - PXOR X12,X6 - MOVOA X10,X0 - PADDL X5,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X1 - PSRLL $14,X12 - PXOR X12,X1 - MOVOA 320(R12),X0 - MOVOA X1,320(R12) - MOVOA X4,X1 - PADDL X0,X1 - MOVOA X1,X12 - PSLLL $7,X1 - PXOR X1,X7 - PSRLL $25,X12 - PXOR X12,X7 - MOVOA X8,X1 - PADDL X6,X1 - MOVOA X1,X12 - PSLLL $18,X1 - PXOR X1,X2 - PSRLL $14,X12 - PXOR X12,X2 - MOVOA 336(R12),X12 - MOVOA X2,336(R12) - MOVOA X14,X1 - PADDL X12,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X5 - PSRLL $25,X2 - PXOR X2,X5 - MOVOA X0,X1 - PADDL X7,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X10 - PSRLL $23,X2 - PXOR X2,X10 - MOVOA X12,X1 - PADDL X5,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X8 - PSRLL $23,X2 - PXOR X2,X8 - MOVOA X7,X1 - PADDL X10,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X4 - PSRLL $19,X2 - PXOR X2,X4 - MOVOA X5,X1 - PADDL X8,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X14 - PSRLL $19,X2 - PXOR X2,X14 - MOVOA X10,X1 - PADDL X4,X1 - MOVOA X1,X2 - PSLLL $18,X1 - PXOR X1,X0 - PSRLL $14,X2 - PXOR X2,X0 - MOVOA 320(R12),X1 - MOVOA X0,320(R12) - MOVOA X8,X0 - PADDL X14,X0 - MOVOA X0,X2 - PSLLL $18,X0 - PXOR X0,X12 - PSRLL $14,X2 - PXOR X2,X12 - MOVOA X11,X0 - PADDL X1,X0 - MOVOA X0,X2 - PSLLL $7,X0 - PXOR X0,X6 - PSRLL $25,X2 - PXOR X2,X6 - MOVOA 336(R12),X2 - MOVOA X12,336(R12) - MOVOA X3,X0 - PADDL X2,X0 - MOVOA X0,X12 - PSLLL $7,X0 - PXOR X0,X13 - PSRLL $25,X12 - PXOR X12,X13 - MOVOA X1,X0 - PADDL X6,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X15 - PSRLL $23,X12 - PXOR X12,X15 - MOVOA X2,X0 - PADDL X13,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X9 - PSRLL $23,X12 - PXOR X12,X9 - MOVOA X6,X0 - PADDL X15,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X11 - PSRLL $19,X12 - PXOR X12,X11 - MOVOA X13,X0 - PADDL X9,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X3 - PSRLL $19,X12 - PXOR X12,X3 - MOVOA X15,X0 - PADDL X11,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X1 - PSRLL $14,X12 - PXOR X12,X1 - MOVOA X9,X0 - PADDL X3,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X2 - PSRLL $14,X12 - PXOR X12,X2 - MOVOA 320(R12),X12 - MOVOA 336(R12),X0 - SUBQ $2,DX - JA MAINLOOP1 - PADDL 112(R12),X12 - PADDL 176(R12),X7 - PADDL 224(R12),X10 - PADDL 272(R12),X4 - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 0(SI),DX - XORL 4(SI),CX - XORL 8(SI),R8 - XORL 12(SI),R9 - MOVL DX,0(DI) - MOVL CX,4(DI) - MOVL R8,8(DI) - MOVL R9,12(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 64(SI),DX - XORL 68(SI),CX - XORL 72(SI),R8 - XORL 76(SI),R9 - MOVL DX,64(DI) - MOVL CX,68(DI) - MOVL R8,72(DI) - MOVL R9,76(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 128(SI),DX - XORL 132(SI),CX - XORL 136(SI),R8 - XORL 140(SI),R9 - MOVL DX,128(DI) - MOVL CX,132(DI) - MOVL R8,136(DI) - MOVL R9,140(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - XORL 192(SI),DX - XORL 196(SI),CX - XORL 200(SI),R8 - XORL 204(SI),R9 - MOVL DX,192(DI) - MOVL CX,196(DI) - MOVL R8,200(DI) - MOVL R9,204(DI) - PADDL 240(R12),X14 - PADDL 64(R12),X0 - PADDL 128(R12),X5 - PADDL 192(R12),X8 - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 16(SI),DX - XORL 20(SI),CX - XORL 24(SI),R8 - XORL 28(SI),R9 - MOVL DX,16(DI) - MOVL CX,20(DI) - MOVL R8,24(DI) - MOVL R9,28(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 80(SI),DX - XORL 84(SI),CX - XORL 88(SI),R8 - XORL 92(SI),R9 - MOVL DX,80(DI) - MOVL CX,84(DI) - MOVL R8,88(DI) - MOVL R9,92(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 144(SI),DX - XORL 148(SI),CX - XORL 152(SI),R8 - XORL 156(SI),R9 - MOVL DX,144(DI) - MOVL CX,148(DI) - MOVL R8,152(DI) - MOVL R9,156(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - XORL 208(SI),DX - XORL 212(SI),CX - XORL 216(SI),R8 - XORL 220(SI),R9 - MOVL DX,208(DI) - MOVL CX,212(DI) - MOVL R8,216(DI) - MOVL R9,220(DI) - PADDL 288(R12),X15 - PADDL 304(R12),X11 - PADDL 80(R12),X1 - PADDL 144(R12),X6 - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 32(SI),DX - XORL 36(SI),CX - XORL 40(SI),R8 - XORL 44(SI),R9 - MOVL DX,32(DI) - MOVL CX,36(DI) - MOVL R8,40(DI) - MOVL R9,44(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 96(SI),DX - XORL 100(SI),CX - XORL 104(SI),R8 - XORL 108(SI),R9 - MOVL DX,96(DI) - MOVL CX,100(DI) - MOVL R8,104(DI) - MOVL R9,108(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 160(SI),DX - XORL 164(SI),CX - XORL 168(SI),R8 - XORL 172(SI),R9 - MOVL DX,160(DI) - MOVL CX,164(DI) - MOVL R8,168(DI) - MOVL R9,172(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - XORL 224(SI),DX - XORL 228(SI),CX - XORL 232(SI),R8 - XORL 236(SI),R9 - MOVL DX,224(DI) - MOVL CX,228(DI) - MOVL R8,232(DI) - MOVL R9,236(DI) - PADDL 160(R12),X13 - PADDL 208(R12),X9 - PADDL 256(R12),X3 - PADDL 96(R12),X2 - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 48(SI),DX - XORL 52(SI),CX - XORL 56(SI),R8 - XORL 60(SI),R9 - MOVL DX,48(DI) - MOVL CX,52(DI) - MOVL R8,56(DI) - MOVL R9,60(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 112(SI),DX - XORL 116(SI),CX - XORL 120(SI),R8 - XORL 124(SI),R9 - MOVL DX,112(DI) - MOVL CX,116(DI) - MOVL R8,120(DI) - MOVL R9,124(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 176(SI),DX - XORL 180(SI),CX - XORL 184(SI),R8 - XORL 188(SI),R9 - MOVL DX,176(DI) - MOVL CX,180(DI) - MOVL R8,184(DI) - MOVL R9,188(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - XORL 240(SI),DX - XORL 244(SI),CX - XORL 248(SI),R8 - XORL 252(SI),R9 - MOVL DX,240(DI) - MOVL CX,244(DI) - MOVL R8,248(DI) - MOVL R9,252(DI) - MOVQ 352(R12),R9 - SUBQ $256,R9 - ADDQ $256,SI - ADDQ $256,DI - CMPQ R9,$256 - JAE BYTESATLEAST256 - CMPQ R9,$0 - JBE DONE - BYTESBETWEEN1AND255: - CMPQ R9,$64 - JAE NOCOPY - MOVQ DI,DX - LEAQ 360(R12),DI - MOVQ R9,CX +BYTESBETWEEN1AND255: + CMPQ R9, $0x40 + JAE NOCOPY + MOVQ DI, DX + LEAQ 360(R12), DI + MOVQ R9, CX REP; MOVSB - LEAQ 360(R12),DI - LEAQ 360(R12),SI - NOCOPY: - MOVQ R9,352(R12) - MOVOA 48(R12),X0 - MOVOA 0(R12),X1 - MOVOA 16(R12),X2 - MOVOA 32(R12),X3 - MOVOA X1,X4 - MOVQ $20,CX - MAINLOOP2: - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X3 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X3,X3 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X1 - PSHUFL $0X4E,X2,X2 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X1,X1 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X1 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X1,X1 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X3 - PSHUFL $0X4E,X2,X2 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X3,X3 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X3 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X3,X3 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X1 - PSHUFL $0X4E,X2,X2 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X1,X1 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X1 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X1,X1 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X3 - PSHUFL $0X4E,X2,X2 - PXOR X6,X3 - SUBQ $4,CX - PADDL X3,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PXOR X7,X7 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X3,X3 - PXOR X6,X0 - JA MAINLOOP2 - PADDL 48(R12),X0 - PADDL 0(R12),X1 - PADDL 16(R12),X2 - PADDL 32(R12),X3 - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 0(SI),CX - XORL 48(SI),R8 - XORL 32(SI),R9 - XORL 16(SI),AX - MOVL CX,0(DI) - MOVL R8,48(DI) - MOVL R9,32(DI) - MOVL AX,16(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 20(SI),CX - XORL 4(SI),R8 - XORL 52(SI),R9 - XORL 36(SI),AX - MOVL CX,20(DI) - MOVL R8,4(DI) - MOVL R9,52(DI) - MOVL AX,36(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 40(SI),CX - XORL 24(SI),R8 - XORL 8(SI),R9 - XORL 56(SI),AX - MOVL CX,40(DI) - MOVL R8,24(DI) - MOVL R9,8(DI) - MOVL AX,56(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - XORL 60(SI),CX - XORL 44(SI),R8 - XORL 28(SI),R9 - XORL 12(SI),AX - MOVL CX,60(DI) - MOVL R8,44(DI) - MOVL R9,28(DI) - MOVL AX,12(DI) - MOVQ 352(R12),R9 - MOVL 16(R12),CX - MOVL 36 (R12),R8 - ADDQ $1,CX - SHLQ $32,R8 - ADDQ R8,CX - MOVQ CX,R8 - SHRQ $32,R8 - MOVL CX,16(R12) - MOVL R8, 36 (R12) - CMPQ R9,$64 - JA BYTESATLEAST65 - JAE BYTESATLEAST64 - MOVQ DI,SI - MOVQ DX,DI - MOVQ R9,CX + LEAQ 360(R12), DI + LEAQ 360(R12), SI + +NOCOPY: + MOVQ R9, 352(R12) + MOVOA 48(R12), X0 + MOVOA (R12), X1 + MOVOA 16(R12), X2 + MOVOA 32(R12), X3 + MOVOA X1, X4 + MOVQ $0x00000014, CX + +MAINLOOP2: + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X3 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X3, X3 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X1 + PSHUFL $0x4e, X2, X2 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X1, X1 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X1 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X1, X1 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X3 + PSHUFL $0x4e, X2, X2 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X3, X3 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X3 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X3, X3 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X1 + PSHUFL $0x4e, X2, X2 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X1, X1 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X1 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X1, X1 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X3 + PSHUFL $0x4e, X2, X2 + PXOR X6, X3 + SUBQ $0x04, CX + PADDL X3, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PXOR X7, X7 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X3, X3 + PXOR X6, X0 + JA MAINLOOP2 + PADDL 48(R12), X0 + PADDL (R12), X1 + PADDL 16(R12), X2 + PADDL 32(R12), X3 + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL (SI), CX + XORL 48(SI), R8 + XORL 32(SI), R9 + XORL 16(SI), AX + MOVL CX, (DI) + MOVL R8, 48(DI) + MOVL R9, 32(DI) + MOVL AX, 16(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL 20(SI), CX + XORL 4(SI), R8 + XORL 52(SI), R9 + XORL 36(SI), AX + MOVL CX, 20(DI) + MOVL R8, 4(DI) + MOVL R9, 52(DI) + MOVL AX, 36(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL 40(SI), CX + XORL 24(SI), R8 + XORL 8(SI), R9 + XORL 56(SI), AX + MOVL CX, 40(DI) + MOVL R8, 24(DI) + MOVL R9, 8(DI) + MOVL AX, 56(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + XORL 60(SI), CX + XORL 44(SI), R8 + XORL 28(SI), R9 + XORL 12(SI), AX + MOVL CX, 60(DI) + MOVL R8, 44(DI) + MOVL R9, 28(DI) + MOVL AX, 12(DI) + MOVQ 352(R12), R9 + MOVL 16(R12), CX + MOVL 36(R12), R8 + ADDQ $0x01, CX + SHLQ $0x20, R8 + ADDQ R8, CX + MOVQ CX, R8 + SHRQ $0x20, R8 + MOVL CX, 16(R12) + MOVL R8, 36(R12) + CMPQ R9, $0x40 + JA BYTESATLEAST65 + JAE BYTESATLEAST64 + MOVQ DI, SI + MOVQ DX, DI + MOVQ R9, CX REP; MOVSB - BYTESATLEAST64: - DONE: + +BYTESATLEAST64: +DONE: RET - BYTESATLEAST65: - SUBQ $64,R9 - ADDQ $64,DI - ADDQ $64,SI - JMP BYTESBETWEEN1AND255 + +BYTESATLEAST65: + SUBQ $0x40, R9 + ADDQ $0x40, DI + ADDQ $0x40, SI + JMP BYTESBETWEEN1AND255 diff --git a/vendor/modules.txt b/vendor/modules.txt index e9d29a7b5b..1c3c89f832 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -793,7 +793,7 @@ go.mozilla.org/pkcs7 # go.uber.org/multierr v1.11.0 ## explicit; go 1.19 go.uber.org/multierr -# golang.org/x/crypto v0.26.0 +# golang.org/x/crypto v0.27.0 ## explicit; go 1.20 golang.org/x/crypto/argon2 golang.org/x/crypto/bcrypt @@ -832,7 +832,7 @@ golang.org/x/exp/slog/internal/buffer # golang.org/x/mod v0.20.0 ## explicit; go 1.18 golang.org/x/mod/semver -# golang.org/x/net v0.28.0 +# golang.org/x/net v0.29.0 ## explicit; go 1.18 golang.org/x/net/bpf golang.org/x/net/context