From 7daa20bf74337a939c54f892a2eca9d9b578eb7f Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Wed, 20 May 2020 03:48:34 -0700
Subject: [PATCH] Generate AVX2 code (#141)

Replaces the generic AVX2 code with specific generated functions for
all configurations up to 10 inputs and 8 outputs. If code size is a
concern, `-tags=nogen` can be used to disable the generated code.
The speedup is biggest when not memory constrained.

```
benchmark                       old MB/s     new MB/s     speedup
BenchmarkEncode_8x5x8M          5895.75      9648.18      1.64x
BenchmarkEncode_8x5x8M-4        16773.41     17220.67     1.03x
BenchmarkEncode_8x5x8M-16       18263.12     17176.28     0.94x
BenchmarkEncode_8x6x8M          5075.89      8548.39      1.68x
BenchmarkEncode_8x6x8M-4        14559.83     15370.95     1.06x
BenchmarkEncode_8x6x8M-16       16183.37     15291.98     0.94x
BenchmarkEncode_8x7x8M          4481.18      7015.60      1.57x
BenchmarkEncode_8x7x8M-4        12835.35     13695.90     1.07x
BenchmarkEncode_8x7x8M-16       14246.94     13737.36     0.96x
BenchmarkEncode_8x8x05M         5569.95      7947.70      1.43x
BenchmarkEncode_8x8x05M-4       17334.91     25271.37     1.46x
BenchmarkEncode_8x8x05M-16      29349.42     35043.36     1.19x
BenchmarkEncode_8x8x1M          4830.58      7891.32      1.63x
BenchmarkEncode_8x8x1M-4        17531.36     27371.42     1.56x
BenchmarkEncode_8x8x1M-16       29593.98     39241.09     1.33x
BenchmarkEncode_8x8x8M          3953.66      6584.26      1.67x
BenchmarkEncode_8x8x8M-4        11527.34     12331.23     1.07x
BenchmarkEncode_8x8x8M-16       12718.89     12173.08     0.96x
BenchmarkEncode_8x8x32M         3927.51      6195.91      1.58x
BenchmarkEncode_8x8x32M-4       11490.85     11424.39     0.99x
BenchmarkEncode_8x8x32M-16      12506.09     11888.55     0.95x

benchmark                       old MB/s     new MB/s     speedup
BenchmarkParallel_8x8x64K       5490.24      6959.57      1.27x
BenchmarkParallel_8x8x64K-4     21078.94     29557.51     1.40x
BenchmarkParallel_8x8x64K-16    57508.45     73672.54     1.28x
BenchmarkParallel_8x8x1M        4755.49      7667.84      1.61x
BenchmarkParallel_8x8x1M-4      11818.66     12013.49     1.02x
BenchmarkParallel_8x8x1M-16     12923.12     12109.42     0.94x
BenchmarkParallel_8x8x8M        3973.94      6525.85      1.64x
BenchmarkParallel_8x8x8M-4      11725.68     11312.46     0.96x
BenchmarkParallel_8x8x8M-16     12608.20     11484.98     0.91x
BenchmarkParallel_8x3x1M        14139.71     17993.04     1.27x
BenchmarkParallel_8x3x1M-4      21805.97     23053.92     1.06x
BenchmarkParallel_8x3x1M-16     24673.05     23596.71     0.96x
BenchmarkParallel_8x4x1M        10617.88     14474.54     1.36x
BenchmarkParallel_8x4x1M-4      18635.82     18965.65     1.02x
BenchmarkParallel_8x4x1M-16     21518.12     20171.47     0.94x
BenchmarkParallel_8x5x1M        8669.88      11833.96     1.36x
BenchmarkParallel_8x5x1M-4      16321.00     17500.30     1.07x
BenchmarkParallel_8x5x1M-16     17267.16     17191.04     1.00x
```
---
 galois.go                  |    27 +
 galoisAvx512_amd64.go      |     8 +-
 galois_gen_amd64.go        |   408 +
 galois_gen_amd64.s         | 18526 +++++++++++++++++++++++++++++++++++
 galois_gen_none.go         |    11 +
 galois_gen_switch_amd64.go |   293 +
 galois_notamd64.go         |     4 +-
 gen.go                     |   249 +
 go.mod                     |     4 +-
 go.sum                     |     2 -
 reedsolomon.go             |    53 +-
 reedsolomon_test.go        |     3 +-
 12 files changed, 19565 insertions(+), 23 deletions(-)
 create mode 100644 galois_gen_amd64.go
 create mode 100644 galois_gen_amd64.s
 create mode 100644 galois_gen_none.go
 create mode 100644 galois_gen_switch_amd64.go
 create mode 100644 gen.go

diff --git a/galois.go b/galois.go
index 1fd60a0b..76049f9d 100644
--- a/galois.go
+++ b/galois.go
@@ -900,3 +900,30 @@ func galExp(a byte, n int) byte {
 	}
 	return expTable[logResult]
 }
+
+func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte {
+	if !avx2CodeGen {
+		panic("codegen not enabled")
+	}
+	total := inputs * outputs
+
+	// Duplicated in+out: each (input, output) pair gets 64 bytes, the
+	// 16-byte low and high nibble tables each stored twice so they fill
+	// both lanes of a YMM register for VPSHUFB.
+	wantBytes := total * 32 * 2
+	if cap(dst) < wantBytes {
+		dst = make([]byte, wantBytes)
+	} else {
+		dst = dst[:wantBytes]
+	}
+	for i, row := range matrixRows[:outputs] {
+		for j, idx := range row[:inputs] {
+			dstIdx := (j*outputs + i) * 64
+			lo := 
mulTableLow[idx][:] + hi := mulTableHigh[idx][:] + copy(dst[dstIdx:], lo) + copy(dst[dstIdx+16:], lo) + copy(dst[dstIdx+32:], hi) + copy(dst[dstIdx+48:], hi) + } + } + return dst +} diff --git a/galoisAvx512_amd64.go b/galoisAvx512_amd64.go index d7d87b1d..720196fa 100644 --- a/galoisAvx512_amd64.go +++ b/galoisAvx512_amd64.go @@ -7,7 +7,9 @@ package reedsolomon -import "sync" +import ( + "sync" +) //go:noescape func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool) @@ -224,7 +226,7 @@ func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outp // Perform the same as codeSomeShards, but taking advantage of // AVX512 parallelism for up to 4x faster execution as compared to AVX2 -func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { // Process using no goroutines start, end := 0, r.o.perRound if end > byteCount { @@ -271,7 +273,7 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, // Perform the same as codeSomeShards, but taking advantage of // AVX512 parallelism for up to 4x faster execution as compared to AVX2 -func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { var wg sync.WaitGroup do := byteCount / r.o.maxGoroutines if do < r.o.minSplitSize { diff --git a/galois_gen_amd64.go b/galois_gen_amd64.go new file mode 100644 index 00000000..edd6376a --- /dev/null +++ b/galois_gen_amd64.go @@ -0,0 +1,408 @@ +// Code generated by command: go run gen.go -out galois_gen_amd64.s -stubs galois_gen_amd64.go. DO NOT EDIT. + +// +build !appengine +// +build !noasm +// +build !nogen +// +build gc + +package reedsolomon + +// mulAvxTwo_1x1 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x2 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x3 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x4 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x5 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x6 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x7 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x8 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x1 takes 2 inputs and produces 1 outputs. 
+// The output is initialized to 0. +//go:noescape +func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x2 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x3 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x4 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x5 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x6 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x7 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x8 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x1 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x2 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x3 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x4 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x5 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x6 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x7 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x8 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x1 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x2 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x3 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. 
+//go:noescape +func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x4 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x5 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x6 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x7 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x8 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x1 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x2 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x3 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x4 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x5 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x6 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x7 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x8 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x1 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x2 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x3 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x4 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x5 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. 
+//go:noescape +func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x6 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x7 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x8 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x1 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x2 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x3 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x4 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x5 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x6 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x7 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x8 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x1 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x2 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x3 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x4 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x5 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x6 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x7 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. 
+//go:noescape +func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x8 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x1 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x2 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x3 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x4 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x5 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x6 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x7 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x8 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x1 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x2 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x3 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x4 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x5 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x6 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x7 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +//go:noescape +func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x8 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. 
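+// Like every mulAvxTwo_* kernel above, it expects matrix to hold the
+// interleaved 64-byte table chunks built by genAvx2Matrix, and it
+// processes n/32 blocks of 32 bytes beginning at byte offset start.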
+//go:noescape +func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s new file mode 100644 index 00000000..c76db3c8 --- /dev/null +++ b/galois_gen_amd64.s @@ -0,0 +1,18526 @@ +// Code generated by command: go run gen.go -out galois_gen_amd64.s -stubs galois_gen_amd64.go. DO NOT EDIT. + +// +build !appengine +// +build !noasm +// +build !nogen +// +build gc + +// func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x1(SB), $0-88 + // Loading all tables to registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x1_end + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ $0x0000000f, BX + MOVQ BX, X3 + VPBROADCASTB X3, Y3 + MOVQ start+72(FP), BX + +mulAvxTwo_1x1_loop: + // Clear 1 outputs + VPXOR Y0, Y0, Y0 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX)(BX*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y3, Y4, Y4 + VPAND Y3, Y5, Y5 + VPSHUFB Y4, Y1, Y4 + VPSHUFB Y5, Y2, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (DX)(BX*1) + + // Prepare for next loop + ADDQ $0x20, BX + DECQ AX + JNZ mulAvxTwo_1x1_loop + VZEROUPPER + +mulAvxTwo_1x1_end: + RET + +// func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x2(SB), $0-88 + // Loading all tables to registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x2_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + MOVQ start+72(FP), BP + +mulAvxTwo_1x2_loop: + // Clear 2 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX)(BP*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VPSHUFB Y9, Y2, Y7 + VPSHUFB Y10, Y3, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VPSHUFB Y9, Y4, Y7 + VPSHUFB Y10, Y5, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (BX)(BP*1) + VMOVDQU Y1, (DX)(BP*1) + + // Prepare for next loop + ADDQ $0x20, BP + DECQ AX + JNZ mulAvxTwo_1x2_loop + VZEROUPPER + +mulAvxTwo_1x2_end: + RET + +// func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x3(SB), $0-88 + // Loading all tables to registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x3_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), DX + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ $0x0000000f, SI + MOVQ SI, X9 + VPBROADCASTB X9, Y9 + MOVQ start+72(FP), SI + +mulAvxTwo_1x3_loop: + // Clear 3 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX)(SI*1), Y12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VPSHUFB Y12, Y3, Y10 
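+	// The VPSHUFB pair looks up the low-nibble (above) and high-nibble
+	// (below) partial products; their XOR is the GF(2^8) product, which
+	// is then XORed into the output accumulator.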
+ VPSHUFB Y13, Y4, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VPSHUFB Y12, Y5, Y10 + VPSHUFB Y13, Y6, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VPSHUFB Y12, Y7, Y10 + VPSHUFB Y13, Y8, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (BX)(SI*1) + VMOVDQU Y1, (BP)(SI*1) + VMOVDQU Y2, (DX)(SI*1) + + // Prepare for next loop + ADDQ $0x20, SI + DECQ AX + JNZ mulAvxTwo_1x3_loop + VZEROUPPER + +mulAvxTwo_1x3_end: + RET + +// func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x4(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x4_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DX + MOVQ in_base+24(FP), DI + MOVQ (DI), DI + MOVQ $0x0000000f, R8 + MOVQ R8, X4 + VPBROADCASTB X4, Y4 + MOVQ start+72(FP), R8 + +mulAvxTwo_1x4_loop: + // Clear 4 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DI)(R8*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (BX)(R8*1) + VMOVDQU Y1, (BP)(R8*1) + VMOVDQU Y2, (SI)(R8*1) + VMOVDQU Y3, (DX)(R8*1) + + // Prepare for next loop + ADDQ $0x20, R8 + DECQ AX + JNZ mulAvxTwo_1x4_loop + VZEROUPPER + +mulAvxTwo_1x4_end: + RET + +// func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x5(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x5_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), DX + MOVQ in_base+24(FP), R8 + MOVQ (R8), R8 + MOVQ $0x0000000f, R9 + MOVQ R9, X5 + VPBROADCASTB X5, Y5 + MOVQ start+72(FP), R9 + +mulAvxTwo_1x5_loop: + // Clear 5 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (R8)(R9*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (BX)(R9*1) + VMOVDQU Y1, (BP)(R9*1) + 
VMOVDQU Y2, (SI)(R9*1) + VMOVDQU Y3, (DI)(R9*1) + VMOVDQU Y4, (DX)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxTwo_1x5_loop + VZEROUPPER + +mulAvxTwo_1x5_end: + RET + +// func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x6(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x6_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), DX + MOVQ in_base+24(FP), R9 + MOVQ (R9), R9 + MOVQ $0x0000000f, R10 + MOVQ R10, X6 + VPBROADCASTB X6, Y6 + MOVQ start+72(FP), R10 + +mulAvxTwo_1x6_loop: + // Clear 6 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (R9)(R10*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (BX)(R10*1) + VMOVDQU Y1, (BP)(R10*1) + VMOVDQU Y2, (SI)(R10*1) + VMOVDQU Y3, (DI)(R10*1) + VMOVDQU Y4, (R8)(R10*1) + VMOVDQU Y5, (DX)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_1x6_loop + VZEROUPPER + +mulAvxTwo_1x6_end: + RET + +// func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x7(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x7_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), R9 + MOVQ 144(DX), DX + MOVQ in_base+24(FP), R10 + MOVQ (R10), R10 + MOVQ $0x0000000f, R11 + MOVQ R11, X7 + VPBROADCASTB X7, Y7 + MOVQ start+72(FP), R11 + +mulAvxTwo_1x7_loop: + // Clear 7 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (R10)(R11*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, 
Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (BX)(R11*1) + VMOVDQU Y1, (BP)(R11*1) + VMOVDQU Y2, (SI)(R11*1) + VMOVDQU Y3, (DI)(R11*1) + VMOVDQU Y4, (R8)(R11*1) + VMOVDQU Y5, (R9)(R11*1) + VMOVDQU Y6, (DX)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_1x7_loop + VZEROUPPER + +mulAvxTwo_1x7_end: + RET + +// func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x8(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x8_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), R9 + MOVQ 144(DX), R10 + MOVQ 168(DX), DX + MOVQ in_base+24(FP), R11 + MOVQ (R11), R11 + MOVQ $0x0000000f, R12 + MOVQ R12, X8 + VPBROADCASTB X8, Y8 + MOVQ start+72(FP), R12 + +mulAvxTwo_1x8_loop: + // Clear 8 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + VPXOR Y7, Y7, Y7 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (R11)(R12*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + VMOVDQU Y0, (BX)(R12*1) + VMOVDQU Y1, (BP)(R12*1) + VMOVDQU Y2, (SI)(R12*1) + VMOVDQU Y3, (DI)(R12*1) + VMOVDQU Y4, (R8)(R12*1) + VMOVDQU Y5, (R9)(R12*1) + VMOVDQU Y6, (R10)(R12*1) + VMOVDQU Y7, (DX)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_1x8_loop + VZEROUPPER + +mulAvxTwo_1x8_end: + RET + +// func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x1(SB), $0-88 + // Loading all tables to registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x1_end + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), 
Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), BX + MOVQ 24(CX), CX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + MOVQ start+72(FP), BP + +mulAvxTwo_2x1_loop: + // Clear 1 outputs + VPXOR Y0, Y0, Y0 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX)(BP*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y5, Y6, Y6 + VPAND Y5, Y7, Y7 + VPSHUFB Y6, Y1, Y6 + VPSHUFB Y7, Y2, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX)(BP*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y5, Y6, Y6 + VPAND Y5, Y7, Y7 + VPSHUFB Y6, Y3, Y6 + VPSHUFB Y7, Y4, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (DX)(BP*1) + + // Prepare for next loop + ADDQ $0x20, BP + DECQ AX + JNZ mulAvxTwo_2x1_loop + VZEROUPPER + +mulAvxTwo_2x1_end: + RET + +// func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x2(SB), $0-88 + // Loading all tables to registers + // Full registers estimated 15 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x2_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), BP + MOVQ 24(CX), CX + MOVQ $0x0000000f, SI + MOVQ SI, X10 + VPBROADCASTB X10, Y10 + MOVQ start+72(FP), SI + +mulAvxTwo_2x2_loop: + // Clear 2 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BP)(SI*1), Y13 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VPSHUFB Y13, Y2, Y11 + VPSHUFB Y14, Y3, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VPSHUFB Y13, Y4, Y11 + VPSHUFB Y14, Y5, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX)(SI*1), Y13 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VPSHUFB Y13, Y6, Y11 + VPSHUFB Y14, Y7, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VPSHUFB Y13, Y8, Y11 + VPSHUFB Y14, Y9, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (BX)(SI*1) + VMOVDQU Y1, (DX)(SI*1) + + // Prepare for next loop + ADDQ $0x20, SI + DECQ AX + JNZ mulAvxTwo_2x2_loop + VZEROUPPER + +mulAvxTwo_2x2_end: + RET + +// func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x3(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x3_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), DX + MOVQ in_base+24(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ $0x0000000f, R8 + MOVQ R8, X3 + VPBROADCASTB X3, Y3 + MOVQ start+72(FP), R8 + +mulAvxTwo_2x3_loop: + // Clear 3 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DI)(R8*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), 
Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI)(R8*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (BX)(R8*1) + VMOVDQU Y1, (BP)(R8*1) + VMOVDQU Y2, (DX)(R8*1) + + // Prepare for next loop + ADDQ $0x20, R8 + DECQ AX + JNZ mulAvxTwo_2x3_loop + VZEROUPPER + +mulAvxTwo_2x3_end: + RET + +// func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x4(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 25 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x4_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DX + MOVQ in_base+24(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ $0x0000000f, R9 + MOVQ R9, X4 + VPBROADCASTB X4, Y4 + MOVQ start+72(FP), R9 + +mulAvxTwo_2x4_loop: + // Clear 4 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (R8)(R9*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (DI)(R9*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (BX)(R9*1) + VMOVDQU Y1, (BP)(R9*1) + VMOVDQU Y2, (SI)(R9*1) + VMOVDQU Y3, (DX)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxTwo_2x4_loop + VZEROUPPER + +mulAvxTwo_2x4_end: + RET + +// func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x5(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x5_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), DX + MOVQ in_base+24(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ $0x0000000f, 
R10 + MOVQ R10, X5 + VPBROADCASTB X5, Y5 + MOVQ start+72(FP), R10 + +mulAvxTwo_2x5_loop: + // Clear 5 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (R9)(R10*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (R8)(R10*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (BX)(R10*1) + VMOVDQU Y1, (BP)(R10*1) + VMOVDQU Y2, (SI)(R10*1) + VMOVDQU Y3, (DI)(R10*1) + VMOVDQU Y4, (DX)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_2x5_loop + VZEROUPPER + +mulAvxTwo_2x5_end: + RET + +// func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x6(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x6_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), DX + MOVQ in_base+24(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ $0x0000000f, R11 + MOVQ R11, X6 + VPBROADCASTB X6, Y6 + MOVQ start+72(FP), R11 + +mulAvxTwo_2x6_loop: + // Clear 6 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (R10)(R11*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 
320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (R9)(R11*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (BX)(R11*1) + VMOVDQU Y1, (BP)(R11*1) + VMOVDQU Y2, (SI)(R11*1) + VMOVDQU Y3, (DI)(R11*1) + VMOVDQU Y4, (R8)(R11*1) + VMOVDQU Y5, (DX)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_2x6_loop + VZEROUPPER + +mulAvxTwo_2x6_end: + RET + +// func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x7(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 40 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x7_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), R9 + MOVQ 144(DX), DX + MOVQ in_base+24(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ $0x0000000f, R12 + MOVQ R12, X7 + VPBROADCASTB X7, Y7 + MOVQ start+72(FP), R12 + +mulAvxTwo_2x7_loop: + // Clear 7 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (R11)(R12*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (R10)(R12*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, 
Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (BX)(R12*1) + VMOVDQU Y1, (BP)(R12*1) + VMOVDQU Y2, (SI)(R12*1) + VMOVDQU Y3, (DI)(R12*1) + VMOVDQU Y4, (R8)(R12*1) + VMOVDQU Y5, (R9)(R12*1) + VMOVDQU Y6, (DX)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_2x7_loop + VZEROUPPER + +mulAvxTwo_2x7_end: + RET + +// func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x8(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 45 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x8_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), R9 + MOVQ 144(DX), R10 + MOVQ 168(DX), DX + MOVQ in_base+24(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ $0x0000000f, R13 + MOVQ R13, X8 + VPBROADCASTB X8, Y8 + MOVQ start+72(FP), R13 + +mulAvxTwo_2x8_loop: + // Clear 8 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + VPXOR Y7, Y7, Y7 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (R12)(R13*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (R11)(R13*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + 
VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + VMOVDQU Y0, (BX)(R13*1) + VMOVDQU Y1, (BP)(R13*1) + VMOVDQU Y2, (SI)(R13*1) + VMOVDQU Y3, (DI)(R13*1) + VMOVDQU Y4, (R8)(R13*1) + VMOVDQU Y5, (R9)(R13*1) + VMOVDQU Y6, (R10)(R13*1) + VMOVDQU Y7, (DX)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_2x8_loop + VZEROUPPER + +mulAvxTwo_2x8_end: + RET + +// func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x1(SB), $0-88 + // Loading all tables to registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x1_end + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), BX + MOVQ 24(CX), BP + MOVQ 48(CX), CX + MOVQ $0x0000000f, SI + MOVQ SI, X7 + VPBROADCASTB X7, Y7 + MOVQ start+72(FP), SI + +mulAvxTwo_3x1_loop: + // Clear 1 outputs + VPXOR Y0, Y0, Y0 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX)(SI*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y7, Y8, Y8 + VPAND Y7, Y9, Y9 + VPSHUFB Y8, Y1, Y8 + VPSHUFB Y9, Y2, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BP)(SI*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y7, Y8, Y8 + VPAND Y7, Y9, Y9 + VPSHUFB Y8, Y3, Y8 + VPSHUFB Y9, Y4, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX)(SI*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y7, Y8, Y8 + VPAND Y7, Y9, Y9 + VPSHUFB Y8, Y5, Y8 + VPSHUFB Y9, Y6, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (DX)(SI*1) + + // Prepare for next loop + ADDQ $0x20, SI + DECQ AX + JNZ mulAvxTwo_3x1_loop + VZEROUPPER + +mulAvxTwo_3x1_end: + RET + +// func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x2(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 19 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x2_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ in_base+24(FP), BP + MOVQ (BP), SI + MOVQ 24(BP), DI + MOVQ 48(BP), BP + MOVQ $0x0000000f, R8 + MOVQ R8, X2 + VPBROADCASTB X2, Y2 + MOVQ start+72(FP), R8 + +mulAvxTwo_3x2_loop: + // Clear 2 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (SI)(R8*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + 
VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (DI)(R8*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (BP)(R8*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (BX)(R8*1) + VMOVDQU Y1, (DX)(R8*1) + + // Prepare for next loop + ADDQ $0x20, R8 + DECQ AX + JNZ mulAvxTwo_3x2_loop + VZEROUPPER + +mulAvxTwo_3x2_end: + RET + +// func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x3(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x3_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), DX + MOVQ in_base+24(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ $0x0000000f, R9 + MOVQ R9, X3 + VPBROADCASTB X3, Y3 + MOVQ start+72(FP), R9 + +mulAvxTwo_3x3_loop: + // Clear 3 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DI)(R9*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (R8)(R9*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI)(R9*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (BX)(R9*1) + VMOVDQU Y1, (BP)(R9*1) + VMOVDQU Y2, (DX)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxTwo_3x3_loop + VZEROUPPER + +mulAvxTwo_3x3_end: + RET + +// func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: 
AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x4(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 33 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x4_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DX + MOVQ in_base+24(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ $0x0000000f, R10 + MOVQ R10, X4 + VPBROADCASTB X4, Y4 + MOVQ start+72(FP), R10 + +mulAvxTwo_3x4_loop: + // Clear 4 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (R8)(R10*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (R9)(R10*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI)(R10*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (BX)(R10*1) + VMOVDQU Y1, (BP)(R10*1) + VMOVDQU Y2, (SI)(R10*1) + VMOVDQU Y3, (DX)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_3x4_loop + VZEROUPPER + +mulAvxTwo_3x4_end: + RET + +// func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x5(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 40 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x5_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), DX + MOVQ in_base+24(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ $0x0000000f, R11 + MOVQ R11, X5 + VPBROADCASTB X5, Y5 + MOVQ start+72(FP), R11 + +mulAvxTwo_3x5_loop: + // Clear 5 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + + // Load and process 
32 bytes from input 0 to 5 outputs + VMOVDQU (R9)(R11*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (R10)(R11*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (R8)(R11*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (BX)(R11*1) + VMOVDQU Y1, (BP)(R11*1) + VMOVDQU Y2, (SI)(R11*1) + VMOVDQU Y3, (DI)(R11*1) + VMOVDQU Y4, (DX)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_3x5_loop + VZEROUPPER + +mulAvxTwo_3x5_end: + RET + +// func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x6(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x6_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), DX + MOVQ in_base+24(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ $0x0000000f, R12 + MOVQ R12, X6 + VPBROADCASTB X6, Y6 + MOVQ start+72(FP), R12 + +mulAvxTwo_3x6_loop: + // Clear 6 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (R10)(R12*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB 
Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (R11)(R12*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (R9)(R12*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (BX)(R12*1) + VMOVDQU Y1, (BP)(R12*1) + VMOVDQU Y2, (SI)(R12*1) + VMOVDQU Y3, (DI)(R12*1) + VMOVDQU Y4, (R8)(R12*1) + VMOVDQU Y5, (DX)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_3x6_loop + VZEROUPPER + +mulAvxTwo_3x6_end: + RET + +// func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x7(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 54 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x7_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), R9 + MOVQ 144(DX), DX + MOVQ in_base+24(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ $0x0000000f, R13 + MOVQ R13, X7 + VPBROADCASTB X7, Y7 + MOVQ start+72(FP), R13 + +mulAvxTwo_3x7_loop: + // Clear 7 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + 
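+ // The output accumulators Y0-Y6 are zeroed at the top of every
+ // 32-byte block: these kernels compute out = matrix * in from
+ // scratch rather than XOR-ing into existing output data.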
VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (R11)(R13*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (R12)(R13*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (R10)(R13*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (BX)(R13*1) + VMOVDQU Y1, (BP)(R13*1) + VMOVDQU Y2, (SI)(R13*1) + VMOVDQU Y3, (DI)(R13*1) + VMOVDQU Y4, (R8)(R13*1) + VMOVDQU Y5, (R9)(R13*1) + VMOVDQU Y6, (DX)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_3x7_loop + VZEROUPPER + +mulAvxTwo_3x7_end: + RET + +// func mulAvxTwo_3x8(matrix []byte, in [][]byte, out 
[][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x8(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 61 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x8_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), R9 + MOVQ 144(DX), R10 + MOVQ 168(DX), DX + MOVQ in_base+24(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ $0x0000000f, R14 + MOVQ R14, X8 + VPBROADCASTB X8, Y8 + MOVQ start+72(FP), R14 + +mulAvxTwo_3x8_loop: + // Clear 8 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + VPXOR Y7, Y7, Y7 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (R12)(R14*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (R13)(R14*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (R11)(R14*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + 
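+ // Table offsets resume at 1024(CX): with 8 outputs each input
+ // consumes 8 * 64 = 512 bytes of the expanded matrix, so input 2
+ // starts at offset 1024.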
VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + VMOVDQU Y0, (BX)(R14*1) + VMOVDQU Y1, (BP)(R14*1) + VMOVDQU Y2, (SI)(R14*1) + VMOVDQU Y3, (DI)(R14*1) + VMOVDQU Y4, (R8)(R14*1) + VMOVDQU Y5, (R9)(R14*1) + VMOVDQU Y6, (R10)(R14*1) + VMOVDQU Y7, (DX)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_3x8_loop + VZEROUPPER + +mulAvxTwo_3x8_end: + RET + +// func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x1(SB), $0-88 + // Loading all tables to registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x1_end + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), BX + MOVQ 24(CX), BP + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ $0x0000000f, DI + MOVQ DI, X9 + VPBROADCASTB X9, Y9 + MOVQ start+72(FP), DI + +mulAvxTwo_4x1_loop: + // Clear 1 outputs + VPXOR Y0, Y0, Y0 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX)(DI*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y1, Y10 + VPSHUFB Y11, Y2, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BP)(DI*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y3, Y10 + VPSHUFB Y11, Y4, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI)(DI*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y5, Y10 + VPSHUFB Y11, Y6, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX)(DI*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y7, Y10 + VPSHUFB Y11, Y8, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (DX)(DI*1) + + // Prepare for next loop + ADDQ $0x20, DI + DECQ AX + JNZ mulAvxTwo_4x1_loop + VZEROUPPER + +mulAvxTwo_4x1_end: + RET + +// func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x2(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x2_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ in_base+24(FP), BP + MOVQ (BP), SI 
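+ // in is a [][]byte; a slice header is 24 bytes on amd64, so the base
+ // pointers of the four input shards sit at offsets 0, 24, 48 and 72
+ // from the backing array of headers.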
+ MOVQ 24(BP), DI + MOVQ 48(BP), R8 + MOVQ 72(BP), BP + MOVQ $0x0000000f, R9 + MOVQ R9, X2 + VPBROADCASTB X2, Y2 + MOVQ start+72(FP), R9 + +mulAvxTwo_4x2_loop: + // Clear 2 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (SI)(R9*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (DI)(R9*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (R8)(R9*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (BP)(R9*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (BX)(R9*1) + VMOVDQU Y1, (DX)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxTwo_4x2_loop + VZEROUPPER + +mulAvxTwo_4x2_end: + RET + +// func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x3(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x3_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), DX + MOVQ in_base+24(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ $0x0000000f, R10 + MOVQ R10, X3 + VPBROADCASTB X3, Y3 + MOVQ start+72(FP), R10 + +mulAvxTwo_4x3_loop: + // Clear 3 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DI)(R10*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (R8)(R10*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, 
Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (R9)(R10*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (SI)(R10*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (BX)(R10*1) + VMOVDQU Y1, (BP)(R10*1) + VMOVDQU Y2, (DX)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_4x3_loop + VZEROUPPER + +mulAvxTwo_4x3_end: + RET + +// func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x4(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 41 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x4_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DX + MOVQ in_base+24(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ $0x0000000f, R11 + MOVQ R11, X4 + VPBROADCASTB X4, Y4 + MOVQ start+72(FP), R11 + +mulAvxTwo_4x4_loop: + // Clear 4 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (R8)(R11*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (R9)(R11*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (R10)(R11*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, 
Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI)(R11*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (BX)(R11*1) + VMOVDQU Y1, (BP)(R11*1) + VMOVDQU Y2, (SI)(R11*1) + VMOVDQU Y3, (DX)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_4x4_loop + VZEROUPPER + +mulAvxTwo_4x4_end: + RET + +// func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x5(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x5_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), DX + MOVQ in_base+24(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ $0x0000000f, R12 + MOVQ R12, X5 + VPBROADCASTB X5, Y5 + MOVQ start+72(FP), R12 + +mulAvxTwo_4x5_loop: + // Clear 5 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (R9)(R12*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (R10)(R12*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR 
Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (R11)(R12*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8)(R12*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (BX)(R12*1) + VMOVDQU Y1, (BP)(R12*1) + VMOVDQU Y2, (SI)(R12*1) + VMOVDQU Y3, (DI)(R12*1) + VMOVDQU Y4, (DX)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_4x5_loop + VZEROUPPER + +mulAvxTwo_4x5_end: + RET + +// func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x6(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 59 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x6_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), DX + MOVQ in_base+24(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ $0x0000000f, R13 + MOVQ R13, X6 + VPBROADCASTB X6, Y6 + MOVQ start+72(FP), R13 + +mulAvxTwo_4x6_loop: + // Clear 6 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (R10)(R13*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 320(CX), Y7 + 
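+ // 320(CX)/352(CX) is the low/high table pair for the sixth output of
+ // input 0: every matrix coefficient occupies 64 bytes, a 16-byte
+ // low-nibble table followed by a 16-byte high-nibble table, each
+ // duplicated across both 128-bit lanes for in-lane VPSHUFB.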
VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (R11)(R13*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (R12)(R13*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R9)(R13*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (BX)(R13*1) + VMOVDQU Y1, (BP)(R13*1) + VMOVDQU Y2, (SI)(R13*1) + VMOVDQU Y3, (DI)(R13*1) + VMOVDQU Y4, (R8)(R13*1) + VMOVDQU Y5, (DX)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_4x6_loop + VZEROUPPER + +mulAvxTwo_4x6_end: + RET + +// func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x7(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x7_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 
96(DX), R8 + MOVQ 120(DX), R9 + MOVQ 144(DX), DX + MOVQ in_base+24(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ $0x0000000f, R14 + MOVQ R14, X7 + VPBROADCASTB X7, Y7 + MOVQ start+72(FP), R14 + +mulAvxTwo_4x7_loop: + // Clear 7 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (R11)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (R12)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (R13)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 
32 bytes from input 3 to 7 outputs + VMOVDQU (R10)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (BX)(R14*1) + VMOVDQU Y1, (BP)(R14*1) + VMOVDQU Y2, (SI)(R14*1) + VMOVDQU Y3, (DI)(R14*1) + VMOVDQU Y4, (R8)(R14*1) + VMOVDQU Y5, (R9)(R14*1) + VMOVDQU Y6, (DX)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_4x7_loop + VZEROUPPER + +mulAvxTwo_4x7_end: + RET + +// func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x8(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 77 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x8_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), R9 + MOVQ 144(DX), R10 + MOVQ 168(DX), DX + MOVQ in_base+24(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ $0x0000000f, R15 + MOVQ R15, X8 + VPBROADCASTB X8, Y8 + MOVQ start+72(FP), R15 + +mulAvxTwo_4x8_loop: + // Clear 8 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + VPXOR Y7, Y7, Y7 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (R12)(R15*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (R13)(R15*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, 
Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (R14)(R15*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R11)(R15*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + VMOVDQU Y0, (BX)(R15*1) + VMOVDQU Y1, 
(BP)(R15*1) + VMOVDQU Y2, (SI)(R15*1) + VMOVDQU Y3, (DI)(R15*1) + VMOVDQU Y4, (R8)(R15*1) + VMOVDQU Y5, (R9)(R15*1) + VMOVDQU Y6, (R10)(R15*1) + VMOVDQU Y7, (DX)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_4x8_loop + VZEROUPPER + +mulAvxTwo_4x8_end: + RET + +// func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x1(SB), $0-88 + // Loading all tables to registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x1_end + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + MOVQ in_base+24(FP), CX + MOVQ (CX), BX + MOVQ 24(CX), BP + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ $0x0000000f, R8 + MOVQ R8, X11 + VPBROADCASTB X11, Y11 + MOVQ start+72(FP), R8 + +mulAvxTwo_5x1_loop: + // Clear 1 outputs + VPXOR Y0, Y0, Y0 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX)(R8*1), Y12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y1, Y12 + VPSHUFB Y13, Y2, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BP)(R8*1), Y12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y3, Y12 + VPSHUFB Y13, Y4, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI)(R8*1), Y12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y5, Y12 + VPSHUFB Y13, Y6, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI)(R8*1), Y12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y7, Y12 + VPSHUFB Y13, Y8, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX)(R8*1), Y12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y9, Y12 + VPSHUFB Y13, Y10, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (DX)(R8*1) + + // Prepare for next loop + ADDQ $0x20, R8 + DECQ AX + JNZ mulAvxTwo_5x1_loop + VZEROUPPER + +mulAvxTwo_5x1_end: + RET + +// func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x2(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x2_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ in_base+24(FP), BP + MOVQ (BP), SI + MOVQ 24(BP), DI + MOVQ 48(BP), R8 + MOVQ 72(BP), R9 + MOVQ 96(BP), BP + MOVQ $0x0000000f, R10 + MOVQ R10, X2 + VPBROADCASTB X2, Y2 + MOVQ start+72(FP), R10 + +mulAvxTwo_5x2_loop: + // Clear 2 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (SI)(R10*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR 
Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (DI)(R10*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (R8)(R10*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R9)(R10*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (BP)(R10*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (BX)(R10*1) + VMOVDQU Y1, (DX)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_5x2_loop + VZEROUPPER + +mulAvxTwo_5x2_end: + RET + +// func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x3(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x3_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), DX + MOVQ in_base+24(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ $0x0000000f, R11 + MOVQ R11, X3 + VPBROADCASTB X3, Y3 + MOVQ start+72(FP), R11 + +mulAvxTwo_5x3_loop: + // Clear 3 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DI)(R11*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (R8)(R11*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // 
Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (R9)(R11*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R10)(R11*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (SI)(R11*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (BX)(R11*1) + VMOVDQU Y1, (BP)(R11*1) + VMOVDQU Y2, (DX)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_5x3_loop + VZEROUPPER + +mulAvxTwo_5x3_end: + RET + +// func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x4(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 49 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x4_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DX + MOVQ in_base+24(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ $0x0000000f, R12 + MOVQ R12, X4 + VPBROADCASTB X4, Y4 + MOVQ start+72(FP), R12 + +mulAvxTwo_5x4_loop: + // Clear 4 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (R8)(R12*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (R9)(R12*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + 
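+ // Accumulation is plain VPXOR: addition in GF(2^8) is carry-less
+ // XOR, so the partial products from every input shard can simply be
+ // folded into the output registers.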
VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (R10)(R12*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R11)(R12*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DI)(R12*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (BX)(R12*1) + VMOVDQU Y1, (BP)(R12*1) + VMOVDQU Y2, (SI)(R12*1) + VMOVDQU Y3, (DX)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_5x4_loop + VZEROUPPER + +mulAvxTwo_5x4_end: + RET + +// func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x5(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 60 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x5_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), DX + MOVQ in_base+24(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ $0x0000000f, R13 + MOVQ R13, X5 + VPBROADCASTB X5, Y5 + MOVQ start+72(FP), R13 + +mulAvxTwo_5x5_loop: + // Clear 5 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (R9)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + 
VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (R10)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (R11)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R12)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, 
(BX)(R13*1) + VMOVDQU Y1, (BP)(R13*1) + VMOVDQU Y2, (SI)(R13*1) + VMOVDQU Y3, (DI)(R13*1) + VMOVDQU Y4, (DX)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_5x5_loop + VZEROUPPER + +mulAvxTwo_5x5_end: + RET + +// func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x6(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 71 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x6_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), DX + MOVQ in_base+24(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + MOVQ start+72(FP), R14 + +mulAvxTwo_5x6_loop: + // Clear 6 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (R10)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (R11)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (R12)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 
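+	// The VMOVDQU pair above loaded this coefficient's low- and high-nibble lookup tables; the VPSHUFB/VPXOR sequence below turns them into 32 parallel GF(2^8) products and folds the result into the output accumulator.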
+ VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R13)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (BX)(R14*1) + VMOVDQU Y1, (BP)(R14*1) + VMOVDQU Y2, (SI)(R14*1) + VMOVDQU Y3, (DI)(R14*1) + VMOVDQU Y4, (R8)(R14*1) + VMOVDQU Y5, (DX)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_5x6_loop + VZEROUPPER + +mulAvxTwo_5x6_end: + RET + +// func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x7(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x7_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), R9 + MOVQ 144(DX), DX + MOVQ in_base+24(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ $0x0000000f, R15 + MOVQ R15, X7 + VPBROADCASTB X7, Y7 + MOVQ start+72(FP), R15 + +mulAvxTwo_5x7_loop: + // Clear 7 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (R11)(R15*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB 
Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (R12)(R15*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (R13)(R15*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R14)(R15*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, 
Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R10)(R15*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (BX)(R15*1) + VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y2, (SI)(R15*1) + VMOVDQU Y3, (DI)(R15*1) + VMOVDQU Y4, (R8)(R15*1) + VMOVDQU Y5, (R9)(R15*1) + VMOVDQU Y6, (DX)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_5x7_loop + VZEROUPPER + +mulAvxTwo_5x7_end: + RET + +// func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x8(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 93 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x8_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), BX + MOVQ $0x0000000f, R9 + MOVQ R9, X8 + VPBROADCASTB X8, Y8 + MOVQ start+72(FP), R9 + +mulAvxTwo_5x8_loop: + // Clear 8 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + VPXOR Y7, Y7, Y7 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BP)(R9*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + 
VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI)(R9*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI)(R9*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8)(R9*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, 
Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (BX)(R9*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (DX), R10 + VMOVDQU Y0, (R10)(R9*1) + MOVQ 24(DX), R10 + VMOVDQU Y1, (R10)(R9*1) + MOVQ 48(DX), R10 + VMOVDQU Y2, (R10)(R9*1) + MOVQ 72(DX), R10 + VMOVDQU Y3, (R10)(R9*1) + MOVQ 96(DX), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 120(DX), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 144(DX), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 168(DX), R10 + VMOVDQU Y7, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxTwo_5x8_loop + VZEROUPPER + +mulAvxTwo_5x8_end: + RET + +// func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x1(SB), $0-88 + // Loading all tables to registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x1_end + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + MOVQ in_base+24(FP), CX + MOVQ (CX), BX + MOVQ 24(CX), BP + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ $0x0000000f, R9 + MOVQ R9, X13 + VPBROADCASTB X13, Y13 + MOVQ start+72(FP), R9 + +mulAvxTwo_6x1_loop: + // Clear 1 outputs + VPXOR Y0, Y0, Y0 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX)(R9*1), Y14 + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y1, Y14 + VPSHUFB Y15, Y2, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BP)(R9*1), Y14 + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y3, Y14 + VPSHUFB Y15, Y4, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI)(R9*1), Y14 + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y5, Y14 + VPSHUFB Y15, Y6, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI)(R9*1), Y14 + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, 
Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y7, Y14 + VPSHUFB Y15, Y8, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8)(R9*1), Y14 + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y9, Y14 + VPSHUFB Y15, Y10, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX)(R9*1), Y14 + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y11, Y14 + VPSHUFB Y15, Y12, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (DX)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxTwo_6x1_loop + VZEROUPPER + +mulAvxTwo_6x1_end: + RET + +// func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x2(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 31 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x2_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ in_base+24(FP), BP + MOVQ (BP), SI + MOVQ 24(BP), DI + MOVQ 48(BP), R8 + MOVQ 72(BP), R9 + MOVQ 96(BP), R10 + MOVQ 120(BP), BP + MOVQ $0x0000000f, R11 + MOVQ R11, X2 + VPBROADCASTB X2, Y2 + MOVQ start+72(FP), R11 + +mulAvxTwo_6x2_loop: + // Clear 2 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (SI)(R11*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (DI)(R11*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (R8)(R11*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R9)(R11*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R10)(R11*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (BP)(R11*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 
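+	// Y5 now holds the low nibble of each of the 32 input bytes; the VPAND below reduces Y6 to the high nibbles, so both can index the 16-entry VPSHUFB multiply tables.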
+ VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (BX)(R11*1) + VMOVDQU Y1, (DX)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_6x2_loop + VZEROUPPER + +mulAvxTwo_6x2_end: + RET + +// func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x3(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x3_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), DX + MOVQ in_base+24(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ $0x0000000f, R12 + MOVQ R12, X3 + VPBROADCASTB X3, Y3 + MOVQ start+72(FP), R12 + +mulAvxTwo_6x3_loop: + // Clear 3 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DI)(R12*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (R8)(R12*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (R9)(R12*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R10)(R12*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R11)(R12*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 
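+	// Lookup tables are packed 64 bytes per coefficient (a 32-byte low-nibble table followed by a 32-byte high-nibble table), so each successive output advances the table offset by 64.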
+ VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (SI)(R12*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (BX)(R12*1) + VMOVDQU Y1, (BP)(R12*1) + VMOVDQU Y2, (DX)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_6x3_loop + VZEROUPPER + +mulAvxTwo_6x3_end: + RET + +// func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x4(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x4_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DX + MOVQ in_base+24(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ $0x0000000f, R13 + MOVQ R13, X4 + VPBROADCASTB X4, Y4 + MOVQ start+72(FP), R13 + +mulAvxTwo_6x4_loop: + // Clear 4 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (R8)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (R9)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (R10)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 
704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R11)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R12)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DI)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (BX)(R13*1) + VMOVDQU Y1, (BP)(R13*1) + VMOVDQU Y2, (SI)(R13*1) + VMOVDQU Y3, (DX)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_6x4_loop + VZEROUPPER + +mulAvxTwo_6x4_end: + RET + +// func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x5(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 70 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x5_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), DX + MOVQ in_base+24(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ $0x0000000f, R14 + MOVQ R14, X5 + VPBROADCASTB X5, Y5 + MOVQ start+72(FP), R14 + +mulAvxTwo_6x5_loop: + // Clear 5 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (R9)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + 
VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (R10)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (R11)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R12)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R13)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R8)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, 
Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (BX)(R14*1) + VMOVDQU Y1, (BP)(R14*1) + VMOVDQU Y2, (SI)(R14*1) + VMOVDQU Y3, (DI)(R14*1) + VMOVDQU Y4, (DX)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_6x5_loop + VZEROUPPER + +mulAvxTwo_6x5_end: + RET + +// func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x6(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x6_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), R8 + MOVQ 120(DX), DX + MOVQ in_base+24(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 + MOVQ start+72(FP), R15 + +mulAvxTwo_6x6_loop: + // Clear 6 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (R10)(R15*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (R11)(R15*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // 
Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (R12)(R15*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R13)(R15*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R14)(R15*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9)(R15*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + 
VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (BX)(R15*1) + VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y2, (SI)(R15*1) + VMOVDQU Y3, (DI)(R15*1) + VMOVDQU Y4, (R8)(R15*1) + VMOVDQU Y5, (DX)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_6x6_loop + VZEROUPPER + +mulAvxTwo_6x6_end: + RET + +// func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x7(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 96 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x7_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), BX + MOVQ $0x0000000f, R10 + MOVQ R10, X7 + VPBROADCASTB X7, Y7 + MOVQ start+72(FP), R10 + +mulAvxTwo_6x7_loop: + // Clear 7 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BP)(R10*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI)(R10*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI)(R10*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB 
Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8)(R10*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9)(R10*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (BX)(R10*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, 
Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (DX), R11 + VMOVDQU Y0, (R11)(R10*1) + MOVQ 24(DX), R11 + VMOVDQU Y1, (R11)(R10*1) + MOVQ 48(DX), R11 + VMOVDQU Y2, (R11)(R10*1) + MOVQ 72(DX), R11 + VMOVDQU Y3, (R11)(R10*1) + MOVQ 96(DX), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 120(DX), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 144(DX), R11 + VMOVDQU Y6, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_6x7_loop + VZEROUPPER + +mulAvxTwo_6x7_end: + RET + +// func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x8(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 109 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x8_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), BX + MOVQ $0x0000000f, R10 + MOVQ R10, X8 + VPBROADCASTB X8, Y8 + MOVQ start+72(FP), R10 + +mulAvxTwo_6x8_loop: + // Clear 8 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + VPXOR Y7, Y7, Y7 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BP)(R10*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI)(R10*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), 
Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI)(R10*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8)(R10*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9)(R10*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, 
Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (BX)(R10*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (DX), R11 + VMOVDQU Y0, (R11)(R10*1) + MOVQ 24(DX), R11 + VMOVDQU Y1, (R11)(R10*1) + MOVQ 48(DX), R11 + VMOVDQU Y2, (R11)(R10*1) + MOVQ 72(DX), R11 + VMOVDQU Y3, (R11)(R10*1) + MOVQ 96(DX), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 120(DX), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 144(DX), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 168(DX), R11 + VMOVDQU Y7, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_6x8_loop + VZEROUPPER + +mulAvxTwo_6x8_end: + RET + +// func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x1(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x1_end + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), BX + MOVQ $0x0000000f, R11 + MOVQ R11, X1 + VPBROADCASTB X1, Y1 + MOVQ start+72(FP), R11 + +mulAvxTwo_7x1_loop: + // Clear 1 outputs + VPXOR Y0, Y0, Y0 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BP)(R11*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI)(R11*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI)(R11*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB 
Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8)(R11*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9)(R11*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10)(R11*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (BX)(R11*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (DX)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_7x1_loop + VZEROUPPER + +mulAvxTwo_7x1_end: + RET + +// func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x2(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x2_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ in_base+24(FP), BP + MOVQ (BP), SI + MOVQ 24(BP), DI + MOVQ 48(BP), R8 + MOVQ 72(BP), R9 + MOVQ 96(BP), R10 + MOVQ 120(BP), R11 + MOVQ 144(BP), BP + MOVQ $0x0000000f, R12 + MOVQ R12, X2 + VPBROADCASTB X2, Y2 + MOVQ start+72(FP), R12 + +mulAvxTwo_7x2_loop: + // Clear 2 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (SI)(R12*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (DI)(R12*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (R8)(R12*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R9)(R12*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + 
VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R10)(R12*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R11)(R12*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (BP)(R12*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (BX)(R12*1) + VMOVDQU Y1, (DX)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x2_loop + VZEROUPPER + +mulAvxTwo_7x2_end: + RET + +// func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x3(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x3_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), DX + MOVQ in_base+24(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ $0x0000000f, R13 + MOVQ R13, X3 + VPBROADCASTB X3, Y3 + MOVQ start+72(FP), R13 + +mulAvxTwo_7x3_loop: + // Clear 3 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DI)(R13*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (R8)(R13*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (R9)(R13*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 
+ VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R10)(R13*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R11)(R13*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R12)(R13*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (SI)(R13*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (BX)(R13*1) + VMOVDQU Y1, (BP)(R13*1) + VMOVDQU Y2, (DX)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_7x3_loop + VZEROUPPER + +mulAvxTwo_7x3_end: + RET + +// func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x4(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x4_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DX + MOVQ in_base+24(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ $0x0000000f, R14 + MOVQ R14, X4 + VPBROADCASTB X4, Y4 + MOVQ start+72(FP), R14 + +mulAvxTwo_7x4_loop: + // Clear 4 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (R8)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + 
VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (R9)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (R10)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R11)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R12)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R13)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, 
Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DI)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (BX)(R14*1) + VMOVDQU Y1, (BP)(R14*1) + VMOVDQU Y2, (SI)(R14*1) + VMOVDQU Y3, (DX)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_7x4_loop + VZEROUPPER + +mulAvxTwo_7x4_end: + RET + +// func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x5(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 80 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x5_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DI + MOVQ 96(DX), DX + MOVQ in_base+24(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ $0x0000000f, R15 + MOVQ R15, X5 + VPBROADCASTB X5, Y5 + MOVQ start+72(FP), R15 + +mulAvxTwo_7x5_loop: + // Clear 5 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (R9)(R15*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (R10)(R15*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (R11)(R15*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + 
VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R12)(R15*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R13)(R15*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R14)(R15*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R8)(R15*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, 
Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (BX)(R15*1) + VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y2, (SI)(R15*1) + VMOVDQU Y3, (DI)(R15*1) + VMOVDQU Y4, (DX)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_7x5_loop + VZEROUPPER + +mulAvxTwo_7x5_end: + RET + +// func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x6(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 95 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x6_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), BX + MOVQ $0x0000000f, R11 + MOVQ R11, X6 + VPBROADCASTB X6, Y6 + MOVQ start+72(FP), R11 + +mulAvxTwo_7x6_loop: + // Clear 6 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BP)(R11*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI)(R11*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI)(R11*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 
1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8)(R11*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9)(R11*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10)(R11*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (BX)(R11*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 
+ VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + MOVQ (DX), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(DX), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(DX), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(DX), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(DX), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(DX), R12 + VMOVDQU Y5, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_7x6_loop + VZEROUPPER + +mulAvxTwo_7x6_end: + RET + +// func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x7(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 110 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x7_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), BX + MOVQ $0x0000000f, R11 + MOVQ R11, X7 + VPBROADCASTB X7, Y7 + MOVQ start+72(FP), R11 + +mulAvxTwo_7x7_loop: + // Clear 7 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BP)(R11*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI)(R11*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 
+ VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI)(R11*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8)(R11*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9)(R11*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10)(R11*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + 
VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (BX)(R11*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (DX), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(DX), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(DX), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(DX), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(DX), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(DX), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(DX), R12 + VMOVDQU Y6, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_7x7_loop + VZEROUPPER + +mulAvxTwo_7x7_end: + RET + +// func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x8(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 125 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x8_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), BX + MOVQ $0x0000000f, R11 + MOVQ R11, X8 + VPBROADCASTB X8, Y8 + MOVQ start+72(FP), R11 + +mulAvxTwo_7x8_loop: + // Clear 8 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + VPXOR Y7, Y7, Y7 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BP)(R11*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 
224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI)(R11*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI)(R11*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8)(R11*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + 
VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9)(R11*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10)(R11*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (BX)(R11*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 
3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (DX), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(DX), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(DX), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(DX), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(DX), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(DX), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(DX), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(DX), R12 + VMOVDQU Y7, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_7x8_loop + VZEROUPPER + +mulAvxTwo_7x8_end: + RET + +// func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x1(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x1_end + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), BX + MOVQ $0x0000000f, R12 + MOVQ R12, X1 + VPBROADCASTB X1, Y1 + MOVQ start+72(FP), R12 + +mulAvxTwo_8x1_loop: + // Clear 1 outputs + VPXOR Y0, Y0, Y0 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BP)(R12*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI)(R12*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI)(R12*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8)(R12*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9)(R12*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10)(R12*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR 
Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R11)(R12*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (BX)(R12*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 448(CX), Y2 + VMOVDQU 480(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (DX)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_8x1_loop + VZEROUPPER + +mulAvxTwo_8x1_end: + RET + +// func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x2(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 39 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x2_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ in_base+24(FP), BP + MOVQ (BP), SI + MOVQ 24(BP), DI + MOVQ 48(BP), R8 + MOVQ 72(BP), R9 + MOVQ 96(BP), R10 + MOVQ 120(BP), R11 + MOVQ 144(BP), R12 + MOVQ 168(BP), BP + MOVQ $0x0000000f, R13 + MOVQ R13, X2 + VPBROADCASTB X2, Y2 + MOVQ start+72(FP), R13 + +mulAvxTwo_8x2_loop: + // Clear 2 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (SI)(R13*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (DI)(R13*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (R8)(R13*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R9)(R13*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R10)(R13*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R11)(R13*1), Y5 + VPSRLQ $0x04, Y5, Y6 + 
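+	// The VPSRLQ above moved each byte's high nibble into the low bits of Y6;
+	// the VPAND pair below masks both copies to 4 bits with the 0x0f bytes broadcast in Y2.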
VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R12)(R13*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (BP)(R13*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (BX)(R13*1) + VMOVDQU Y1, (DX)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x2_loop + VZEROUPPER + +mulAvxTwo_8x2_end: + RET + +// func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x3(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x3_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), DX + MOVQ in_base+24(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ $0x0000000f, R14 + MOVQ R14, X3 + VPBROADCASTB X3, Y3 + MOVQ start+72(FP), R14 + +mulAvxTwo_8x3_loop: + // Clear 3 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DI)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (R8)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (R9)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + 
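+	// The two VPSHUFB lookups translated the low/high nibbles through their
+	// 16-entry multiplication tables; the VPXOR pair below combines the halves
+	// into the GF(2^8) product and folds it into the Y2 accumulator.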
VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R10)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R11)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R12)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R13)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (SI)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1344(CX), Y4 + VMOVDQU 1376(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1408(CX), Y4 + VMOVDQU 1440(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1472(CX), Y4 + VMOVDQU 1504(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (BX)(R14*1) + VMOVDQU Y1, (BP)(R14*1) + VMOVDQU Y2, (DX)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_8x3_loop + VZEROUPPER + +mulAvxTwo_8x3_end: + RET + +// func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x4(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 73 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x4_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), SI + MOVQ 72(DX), DX + MOVQ in_base+24(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ $0x0000000f, R15 + MOVQ R15, X4 + VPBROADCASTB X4, Y4 + MOVQ start+72(FP), 
R15 + +mulAvxTwo_8x4_loop: + // Clear 4 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (R8)(R15*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (R9)(R15*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (R10)(R15*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R11)(R15*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R12)(R15*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R13)(R15*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + 
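+	// VPSHUFB shuffles each 128-bit lane independently, so every 32-byte table
+	// carries the same 16-byte lookup in both lanes; the low-nibble lookup is
+	// done and the high-nibble lookup follows.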
VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R14)(R15*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DI)(R15*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (BX)(R15*1) + VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y2, (SI)(R15*1) + VMOVDQU Y3, (DX)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_8x4_loop + VZEROUPPER + +mulAvxTwo_8x4_end: + RET + +// func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x5(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x5_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), BX + MOVQ $0x0000000f, R12 + MOVQ R12, X5 + VPBROADCASTB X5, Y5 + MOVQ start+72(FP), R12 + +mulAvxTwo_8x5_loop: + // Clear 5 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BP)(R12*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 
outputs + VMOVDQU (SI)(R12*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI)(R12*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8)(R12*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9)(R12*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10)(R12*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, 
Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11)(R12*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (BX)(R12*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + MOVQ (DX), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(DX), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(DX), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(DX), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(DX), R13 + VMOVDQU Y4, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_8x5_loop + VZEROUPPER + +mulAvxTwo_8x5_end: + RET + +// func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x6(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 107 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x6_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), BX + MOVQ $0x0000000f, R12 + MOVQ R12, X6 + VPBROADCASTB X6, Y6 + MOVQ start+72(FP), R12 + +mulAvxTwo_8x6_loop: + // Clear 6 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BP)(R12*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, 
Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI)(R12*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI)(R12*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8)(R12*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9)(R12*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 
1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10)(R12*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11)(R12*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (BX)(R12*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + MOVQ (DX), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(DX), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(DX), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(DX), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(DX), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(DX), R13 + VMOVDQU Y5, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_8x6_loop + VZEROUPPER + +mulAvxTwo_8x6_end: + RET + +// func 
mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x7(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 124 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x7_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), BX + MOVQ $0x0000000f, R12 + MOVQ R12, X7 + VPBROADCASTB X7, Y7 + MOVQ start+72(FP), R12 + +mulAvxTwo_8x7_loop: + // Clear 7 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BP)(R12*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI)(R12*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI)(R12*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + 
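+	// Each (input, output) pair consumes 64 contiguous matrix bytes: a 32-byte
+	// low-nibble table followed by a 32-byte high-nibble table, so offsets from
+	// CX simply advance by 64 per output.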
VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8)(R12*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9)(R12*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10)(R12*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU 
(R11)(R12*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (BX)(R12*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (DX), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(DX), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(DX), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(DX), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(DX), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(DX), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(DX), R13 + VMOVDQU Y6, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_8x7_loop + VZEROUPPER + +mulAvxTwo_8x7_end: + RET + +// func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x8(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 141 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x8_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), BX + MOVQ $0x0000000f, R12 + MOVQ R12, X8 + VPBROADCASTB X8, Y8 + MOVQ start+72(FP), R12 + +mulAvxTwo_8x8_loop: + // Clear 8 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + VPXOR Y7, Y7, Y7 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BP)(R12*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + 
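+	// With 8 outputs the accumulators occupy Y0-Y7 for the entire loop body;
+	// there are no registers left for output pointers, so the store block
+	// reloads them from the out slice header at (DX) every iteration.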
VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI)(R12*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI)(R12*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8)(R12*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + 
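+	// Input 3's tables begin at 3*512 = 1536: with 8 outputs, each input owns
+	// 8 * 64 = 512 bytes of the matrix.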
VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9)(R12*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10)(R12*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11)(R12*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, 
Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (BX)(R12*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (DX), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(DX), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(DX), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(DX), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(DX), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(DX), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(DX), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(DX), R13 + VMOVDQU Y7, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_8x8_loop + VZEROUPPER + +mulAvxTwo_8x8_end: + RET + +// func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x1(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x1_end + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), R12 + MOVQ 192(BX), BX + MOVQ $0x0000000f, R13 + MOVQ R13, X1 + VPBROADCASTB X1, Y1 + MOVQ start+72(FP), R13 + +mulAvxTwo_9x1_loop: + // Clear 1 outputs + VPXOR Y0, Y0, Y0 + + // Load and process 32 bytes 
from input 0 to 1 outputs + VMOVDQU (BP)(R13*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI)(R13*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI)(R13*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8)(R13*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9)(R13*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10)(R13*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R11)(R13*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R12)(R13*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 448(CX), Y2 + VMOVDQU 480(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (BX)(R13*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 512(CX), Y2 + VMOVDQU 544(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (DX)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_9x1_loop + VZEROUPPER + +mulAvxTwo_9x1_end: + RET + +// func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x2(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 43 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x2_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ in_base+24(FP), BP + MOVQ (BP), SI + MOVQ 24(BP), DI + MOVQ 48(BP), R8 + MOVQ 72(BP), R9 + MOVQ 96(BP), R10 + MOVQ 120(BP), R11 + MOVQ 144(BP), R12 + MOVQ 168(BP), R13 + MOVQ 192(BP), BP + MOVQ $0x0000000f, R14 + MOVQ R14, X2 + VPBROADCASTB X2, Y2 + MOVQ start+72(FP), R14 + +mulAvxTwo_9x2_loop: + // Clear 2 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (SI)(R14*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR 
Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (DI)(R14*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (R8)(R14*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R9)(R14*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R10)(R14*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R11)(R14*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R12)(R14*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R13)(R14*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (BP)(R14*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 1024(CX), Y3 + VMOVDQU 1056(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 1088(CX), Y3 + VMOVDQU 1120(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (BX)(R14*1) + VMOVDQU Y1, (DX)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x2_loop + VZEROUPPER + +mulAvxTwo_9x2_end: + RET + +// func mulAvxTwo_9x3(matrix []byte, in 
[][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x3(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x3_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), BP + MOVQ 48(DX), DX + MOVQ in_base+24(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ $0x0000000f, R15 + MOVQ R15, X3 + VPBROADCASTB X3, Y3 + MOVQ start+72(FP), R15 + +mulAvxTwo_9x3_loop: + // Clear 3 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DI)(R15*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (R8)(R15*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (R9)(R15*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R10)(R15*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R11)(R15*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R12)(R15*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB 
Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R13)(R15*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R14)(R15*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1344(CX), Y4 + VMOVDQU 1376(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1408(CX), Y4 + VMOVDQU 1440(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1472(CX), Y4 + VMOVDQU 1504(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (SI)(R15*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1536(CX), Y4 + VMOVDQU 1568(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1600(CX), Y4 + VMOVDQU 1632(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1664(CX), Y4 + VMOVDQU 1696(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (BX)(R15*1) + VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y2, (DX)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_9x3_loop + VZEROUPPER + +mulAvxTwo_9x3_end: + RET + +// func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x4(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 81 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x4_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), R12 + MOVQ 192(BX), BX + MOVQ $0x0000000f, R13 + MOVQ R13, X4 + VPBROADCASTB X4, Y4 + MOVQ start+72(FP), R13 + +mulAvxTwo_9x4_loop: + // Clear 4 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BP)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + 
VMOVDQU (SI)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR 
Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (BX)(R13*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2048(CX), Y5 + VMOVDQU 2080(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 2112(CX), Y5 + VMOVDQU 2144(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 2176(CX), Y5 + VMOVDQU 2208(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 2240(CX), Y5 + VMOVDQU 2272(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + MOVQ (DX), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(DX), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(DX), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(DX), R14 + VMOVDQU Y3, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_9x4_loop + VZEROUPPER + +mulAvxTwo_9x4_end: + RET + +// func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x5(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 100 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x5_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), R12 + MOVQ 192(BX), BX + MOVQ $0x0000000f, R13 + MOVQ R13, X5 + VPBROADCASTB X5, Y5 + MOVQ start+72(FP), R13 + +mulAvxTwo_9x5_loop: + // Clear 5 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BP)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 
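+	// Split each input byte into 4-bit halves: Y8 holds the low nibbles, and Y9
+	// (after the VPAND below) the high nibbles; each half indexes a 16-entry
+	// VPSHUFB lookup table, one table pair per output.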
+ VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + 
VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (BX)(R13*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2560(CX), Y6 + VMOVDQU 2592(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2624(CX), Y6 + VMOVDQU 2656(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2688(CX), Y6 + VMOVDQU 2720(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2752(CX), Y6 + VMOVDQU 2784(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2816(CX), Y6 + VMOVDQU 2848(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + MOVQ (DX), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(DX), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(DX), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(DX), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(DX), R14 + VMOVDQU Y4, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_9x5_loop + VZEROUPPER + +mulAvxTwo_9x5_end: + RET + +// func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x6(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 119 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x6_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), R12 + MOVQ 192(BX), BX + MOVQ $0x0000000f, R13 + MOVQ R13, X6 + VPBROADCASTB X6, Y6 + MOVQ start+72(FP), R13 + +mulAvxTwo_9x6_loop: + // 
Clear 6 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BP)(R13*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI)(R13*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI)(R13*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8)(R13*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + 
VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9)(R13*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10)(R13*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11)(R13*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12)(R13*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, 
Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (BX)(R13*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3072(CX), Y7 + VMOVDQU 3104(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 3136(CX), Y7 + VMOVDQU 3168(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 3200(CX), Y7 + VMOVDQU 3232(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 3264(CX), Y7 + VMOVDQU 3296(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 3328(CX), Y7 + VMOVDQU 3360(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3392(CX), Y7 + VMOVDQU 3424(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + MOVQ (DX), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(DX), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(DX), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(DX), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(DX), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(DX), R14 + VMOVDQU Y5, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_9x6_loop + VZEROUPPER + +mulAvxTwo_9x6_end: + RET + +// func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x7(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 138 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x7_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), R12 + MOVQ 192(BX), BX + MOVQ $0x0000000f, R13 + MOVQ R13, X7 + VPBROADCASTB X7, Y7 + MOVQ start+72(FP), R13 + +mulAvxTwo_9x7_loop: + // Clear 7 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BP)(R13*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI)(R13*1), Y10 + VPSRLQ 
$0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI)(R13*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8)(R13*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9)(R13*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + 
VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10)(R13*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11)(R13*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12)(R13*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + 
VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (BX)(R13*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3584(CX), Y8 + VMOVDQU 3616(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3648(CX), Y8 + VMOVDQU 3680(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3712(CX), Y8 + VMOVDQU 3744(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3776(CX), Y8 + VMOVDQU 3808(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3840(CX), Y8 + VMOVDQU 3872(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3904(CX), Y8 + VMOVDQU 3936(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3968(CX), Y8 + VMOVDQU 4000(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (DX), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(DX), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(DX), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(DX), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(DX), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(DX), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(DX), R14 + VMOVDQU Y6, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_9x7_loop + VZEROUPPER + +mulAvxTwo_9x7_end: + RET + +// func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x8(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 157 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x8_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), R12 + MOVQ 192(BX), BX + MOVQ $0x0000000f, R13 + MOVQ R13, X8 + VPBROADCASTB X8, Y8 + MOVQ start+72(FP), R13 + +mulAvxTwo_9x8_loop: + // Clear 8 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + VPXOR Y7, Y7, Y7 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BP)(R13*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 448(CX), Y9 
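+	// Each (input, output) pair consumes 64 bytes of the expanded matrix: a
+	// 32-byte low-nibble table followed by a 32-byte high-nibble table, hence
+	// the 448(CX)/480(CX) pair for the eighth output of input 0 here.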
+ VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI)(R13*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI)(R13*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8)(R13*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + 
VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9)(R13*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10)(R13*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11)(R13*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), 
Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12)(R13*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (BX)(R13*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4096(CX), Y9 + VMOVDQU 4128(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 4160(CX), Y9 + VMOVDQU 4192(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 4224(CX), Y9 + VMOVDQU 4256(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 4288(CX), Y9 + VMOVDQU 4320(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 4352(CX), Y9 + VMOVDQU 4384(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 4416(CX), Y9 + VMOVDQU 4448(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 4480(CX), Y9 + VMOVDQU 4512(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4544(CX), Y9 + VMOVDQU 4576(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (DX), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(DX), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(DX), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(DX), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(DX), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(DX), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(DX), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(DX), R14 + VMOVDQU Y7, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_9x8_loop + VZEROUPPER + +mulAvxTwo_9x8_end: + RET + +// func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x1(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x1_end + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + 
MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), R12 + MOVQ 192(BX), R13 + MOVQ 216(BX), BX + MOVQ $0x0000000f, R14 + MOVQ R14, X1 + VPBROADCASTB X1, Y1 + MOVQ start+72(FP), R14 + +mulAvxTwo_10x1_loop: + // Clear 1 outputs + VPXOR Y0, Y0, Y0 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BP)(R14*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI)(R14*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI)(R14*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8)(R14*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9)(R14*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10)(R14*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R11)(R14*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R12)(R14*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 448(CX), Y2 + VMOVDQU 480(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R13)(R14*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 512(CX), Y2 + VMOVDQU 544(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (BX)(R14*1), Y4 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 576(CX), Y2 + VMOVDQU 608(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (DX)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_10x1_loop + VZEROUPPER + +mulAvxTwo_10x1_end: + RET + +// func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x2(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ 
mulAvxTwo_10x2_end + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ in_base+24(FP), BP + MOVQ (BP), SI + MOVQ 24(BP), DI + MOVQ 48(BP), R8 + MOVQ 72(BP), R9 + MOVQ 96(BP), R10 + MOVQ 120(BP), R11 + MOVQ 144(BP), R12 + MOVQ 168(BP), R13 + MOVQ 192(BP), R14 + MOVQ 216(BP), BP + MOVQ $0x0000000f, R15 + MOVQ R15, X2 + VPBROADCASTB X2, Y2 + MOVQ start+72(FP), R15 + +mulAvxTwo_10x2_loop: + // Clear 2 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (SI)(R15*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (DI)(R15*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (R8)(R15*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R9)(R15*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R10)(R15*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R11)(R15*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R12)(R15*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R13)(R15*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y4 + VPSHUFB Y5, Y3, Y3 
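+	// The low- and high-nibble VPSHUFB lookups are XORed together to form the
+	// full GF(2^8) byte product, which is then XORed into the running output
+	// accumulator for this output register.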
+ VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R14)(R15*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 1024(CX), Y3 + VMOVDQU 1056(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 1088(CX), Y3 + VMOVDQU 1120(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (BP)(R15*1), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 1152(CX), Y3 + VMOVDQU 1184(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 1216(CX), Y3 + VMOVDQU 1248(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (BX)(R15*1) + VMOVDQU Y1, (DX)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x2_loop + VZEROUPPER + +mulAvxTwo_10x2_end: + RET + +// func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x3(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x3_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), R12 + MOVQ 192(BX), R13 + MOVQ 216(BX), BX + MOVQ $0x0000000f, R14 + MOVQ R14, X3 + VPBROADCASTB X3, Y3 + MOVQ start+72(FP), R14 + +mulAvxTwo_10x3_loop: + // Clear 3 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BP)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 
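+	// Table pairs are consumed sequentially from the matrix: the pair for
+	// input j, output i begins (j*outputs+i)*64 bytes in (here 576 for
+	// input 3, output 0), laid out as 32 bytes of low-nibble table followed
+	// by 32 bytes of high-nibble table.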
+ VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1344(CX), Y4 + VMOVDQU 1376(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1408(CX), Y4 + VMOVDQU 1440(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1472(CX), Y4 + VMOVDQU 1504(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R13)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1536(CX), Y4 + VMOVDQU 1568(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1600(CX), Y4 + VMOVDQU 1632(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1664(CX), Y4 + VMOVDQU 1696(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (BX)(R14*1), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1728(CX), Y4 + VMOVDQU 1760(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1792(CX), Y4 + VMOVDQU 1824(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1856(CX), Y4 + VMOVDQU 1888(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + MOVQ (DX), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(DX), R15 
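+	// With ten input pointers pinned, no general-purpose registers are left
+	// for the output pointers, so each one is reloaded from the out slice
+	// headers (24 bytes apart) before its store; the smaller kernels keep
+	// these pointers in registers instead.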
+ VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(DX), R15 + VMOVDQU Y2, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_10x3_loop + VZEROUPPER + +mulAvxTwo_10x3_end: + RET + +// func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x4(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 89 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x4_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), R12 + MOVQ 192(BX), R13 + MOVQ 216(BX), BX + MOVQ $0x0000000f, R14 + MOVQ R14, X4 + VPBROADCASTB X4, Y4 + MOVQ start+72(FP), R14 + +mulAvxTwo_10x4_loop: + // Clear 4 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BP)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and 
process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2048(CX), Y5 + VMOVDQU 2080(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 2112(CX), Y5 + VMOVDQU 2144(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 2176(CX), Y5 + VMOVDQU 2208(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 2240(CX), Y5 + VMOVDQU 2272(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (BX)(R14*1), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2304(CX), Y5 + VMOVDQU 2336(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 2368(CX), Y5 + VMOVDQU 2400(CX), 
Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 2432(CX), Y5 + VMOVDQU 2464(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 2496(CX), Y5 + VMOVDQU 2528(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + MOVQ (DX), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(DX), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(DX), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(DX), R15 + VMOVDQU Y3, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_10x4_loop + VZEROUPPER + +mulAvxTwo_10x4_end: + RET + +// func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x5(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 110 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x5_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), R12 + MOVQ 192(BX), R13 + MOVQ 216(BX), BX + MOVQ $0x0000000f, R14 + MOVQ R14, X5 + VPBROADCASTB X5, Y5 + MOVQ start+72(FP), R14 + +mulAvxTwo_10x5_loop: + // Clear 5 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BP)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + 
VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, 
Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2560(CX), Y6 + VMOVDQU 2592(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2624(CX), Y6 + VMOVDQU 2656(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2688(CX), Y6 + VMOVDQU 2720(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2752(CX), Y6 + VMOVDQU 2784(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2816(CX), Y6 + VMOVDQU 2848(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (BX)(R14*1), Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2880(CX), Y6 + VMOVDQU 2912(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2944(CX), Y6 + VMOVDQU 2976(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 3008(CX), Y6 + VMOVDQU 3040(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 3072(CX), Y6 + VMOVDQU 3104(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 3136(CX), Y6 + VMOVDQU 3168(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + MOVQ (DX), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(DX), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(DX), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(DX), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(DX), R15 + VMOVDQU Y4, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_10x5_loop + VZEROUPPER + +mulAvxTwo_10x5_end: + RET + +// func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x6(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 131 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x6_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), R12 + MOVQ 192(BX), R13 + MOVQ 216(BX), BX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + MOVQ start+72(FP), R14 + +mulAvxTwo_10x6_loop: + // Clear 6 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BP)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + 
VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + 
VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, 
Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3072(CX), Y7 + VMOVDQU 3104(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 3136(CX), Y7 + VMOVDQU 3168(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 3200(CX), Y7 + VMOVDQU 3232(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 3264(CX), Y7 + VMOVDQU 3296(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 3328(CX), Y7 + VMOVDQU 3360(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3392(CX), Y7 + VMOVDQU 3424(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (BX)(R14*1), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3456(CX), Y7 + VMOVDQU 3488(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 3520(CX), Y7 + VMOVDQU 3552(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 3584(CX), Y7 + VMOVDQU 3616(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 3648(CX), Y7 + VMOVDQU 3680(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 3712(CX), Y7 + VMOVDQU 3744(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3776(CX), Y7 + VMOVDQU 3808(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + MOVQ (DX), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(DX), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(DX), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(DX), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(DX), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(DX), R15 + VMOVDQU Y5, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_10x6_loop + VZEROUPPER + +mulAvxTwo_10x6_end: + RET + +// func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x7(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 152 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x7_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), R12 + MOVQ 192(BX), R13 + MOVQ 216(BX), BX + MOVQ $0x0000000f, R14 + MOVQ R14, X7 + VPBROADCASTB X7, Y7 + MOVQ start+72(FP), R14 + +mulAvxTwo_10x7_loop: + // Clear 7 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BP)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + 
VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR 
Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + 
VPXOR Y8, Y1, Y1 + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3584(CX), Y8 + VMOVDQU 3616(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3648(CX), Y8 + VMOVDQU 3680(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3712(CX), Y8 + VMOVDQU 3744(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3776(CX), Y8 + VMOVDQU 3808(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3840(CX), Y8 + VMOVDQU 3872(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3904(CX), Y8 + VMOVDQU 3936(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3968(CX), Y8 + VMOVDQU 4000(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (BX)(R14*1), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 4032(CX), Y8 + VMOVDQU 4064(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 4096(CX), Y8 + VMOVDQU 4128(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 4160(CX), Y8 + VMOVDQU 4192(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 4224(CX), Y8 + VMOVDQU 4256(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 4288(CX), Y8 + VMOVDQU 4320(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 4352(CX), Y8 + VMOVDQU 4384(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 4416(CX), Y8 + VMOVDQU 4448(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (DX), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(DX), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(DX), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(DX), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(DX), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(DX), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(DX), R15 + VMOVDQU Y6, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_10x7_loop + VZEROUPPER + +mulAvxTwo_10x7_end: + RET + +// func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x8(SB), $0-88 + // Loading no tables to registers + // Full registers estimated 173 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ 
mulAvxTwo_10x8_end + MOVQ out_base+48(FP), DX + MOVQ in_base+24(FP), BX + MOVQ (BX), BP + MOVQ 24(BX), SI + MOVQ 48(BX), DI + MOVQ 72(BX), R8 + MOVQ 96(BX), R9 + MOVQ 120(BX), R10 + MOVQ 144(BX), R11 + MOVQ 168(BX), R12 + MOVQ 192(BX), R13 + MOVQ 216(BX), BX + MOVQ $0x0000000f, R14 + MOVQ R14, X8 + VPBROADCASTB X8, Y8 + MOVQ start+72(FP), R14 + +mulAvxTwo_10x8_loop: + // Clear 8 outputs + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + VPXOR Y6, Y6, Y6 + VPXOR Y7, Y7, Y7 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BP)(R14*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI)(R14*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI)(R14*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, 
Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8)(R14*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9)(R14*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10)(R14*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 
2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11)(R14*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12)(R14*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13)(R14*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4096(CX), Y9 + VMOVDQU 4128(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 4160(CX), Y9 + VMOVDQU 4192(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 4224(CX), Y9 + VMOVDQU 4256(CX), Y10 + VPSHUFB Y11, Y9, 
Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 4288(CX), Y9 + VMOVDQU 4320(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 4352(CX), Y9 + VMOVDQU 4384(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 4416(CX), Y9 + VMOVDQU 4448(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 4480(CX), Y9 + VMOVDQU 4512(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4544(CX), Y9 + VMOVDQU 4576(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (BX)(R14*1), Y11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4608(CX), Y9 + VMOVDQU 4640(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 4672(CX), Y9 + VMOVDQU 4704(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 4736(CX), Y9 + VMOVDQU 4768(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 4800(CX), Y9 + VMOVDQU 4832(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 4864(CX), Y9 + VMOVDQU 4896(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 4928(CX), Y9 + VMOVDQU 4960(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 4992(CX), Y9 + VMOVDQU 5024(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 5056(CX), Y9 + VMOVDQU 5088(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (DX), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(DX), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(DX), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(DX), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(DX), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(DX), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(DX), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(DX), R15 + VMOVDQU Y7, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_10x8_loop + VZEROUPPER + +mulAvxTwo_10x8_end: + RET diff --git a/galois_gen_none.go b/galois_gen_none.go new file mode 100644 index 00000000..b4917bc0 --- /dev/null +++ b/galois_gen_none.go @@ -0,0 +1,11 @@ +//+build !amd64 noasm appengine gccgo nogen + +package reedsolomon + +const maxAvx2Inputs = 0 +const maxAvx2Outputs = 0 +const avx2CodeGen = false + +func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { + panic("avx2 codegen not available") +} diff --git a/galois_gen_switch_amd64.go b/galois_gen_switch_amd64.go new file mode 100644 index 00000000..0b49a1e6 --- /dev/null +++ b/galois_gen_switch_amd64.go @@ -0,0 +1,293 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. 
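+//
+// galMulSlicesAvx2 below dispatches to the generated mulAvxTwo_<in>x<out>
+// kernels. The kernels handle whole 32-byte blocks only, so the byte count
+// is rounded down to a multiple of 32 and the number of bytes processed is
+// returned; the caller is expected to finish any tail with the generic
+// routines, roughly:
+//
+//	done := start + galMulSlicesAvx2(matrix, in, out, start, stop)
+//	// bytes [done, stop) still need the non-SIMD path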
+ +// +build !appengine +// +build !noasm +// +build gc +// +build !nogen + +package reedsolomon + +import "fmt" + +const avx2CodeGen = true +const maxAvx2Inputs = 10 +const maxAvx2Outputs = 8 + +func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { + n := stop - start + n = (n >> 5) << 5 + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxTwo_1x1(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_1x2(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_1x3(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_1x4(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_1x5(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_1x6(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_1x7(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_1x8(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxTwo_2x1(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_2x2(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_2x3(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_2x4(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_2x5(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_2x6(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_2x7(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_2x8(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxTwo_3x1(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_3x2(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_3x3(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_3x4(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_3x5(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_3x6(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_3x7(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_3x8(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxTwo_4x1(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_4x2(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_4x3(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_4x4(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_4x5(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_4x6(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_4x7(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_4x8(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxTwo_5x1(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_5x2(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_5x3(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_5x4(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_5x5(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_5x6(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_5x7(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_5x8(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxTwo_6x1(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_6x2(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_6x3(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_6x4(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_6x5(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_6x6(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_6x7(matrix, in, out, start, n) + return n + case 8: + 
mulAvxTwo_6x8(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxTwo_7x1(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_7x2(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_7x3(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_7x4(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_7x5(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_7x6(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_7x7(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_7x8(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxTwo_8x1(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_8x2(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_8x3(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_8x4(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_8x5(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_8x6(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_8x7(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_8x8(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxTwo_9x1(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_9x2(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_9x3(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_9x4(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_9x5(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_9x6(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_9x7(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_9x8(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxTwo_10x1(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_10x2(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_10x3(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_10x4(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_10x5(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_10x6(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_10x7(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_10x8(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} diff --git a/galois_notamd64.go b/galois_notamd64.go index 86441007..bd15e3a2 100644 --- a/galois_notamd64.go +++ b/galois_notamd64.go @@ -4,10 +4,10 @@ package reedsolomon -func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { panic("codeSomeShardsAvx512 should not be called if built without asm") } -func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { panic("codeSomeShardsAvx512P should not be called if built without asm") } diff --git a/gen.go b/gen.go new file mode 100644 index 00000000..6fc545c4 --- /dev/null +++ b/gen.go @@ -0,0 +1,249 @@ +//+build generate + +//go:generate go run gen.go -out galois_gen_amd64.s -stubs galois_gen_amd64.go +//go:generate gofmt -w galois_gen_switch_amd64.go + +package main + +import ( + "bufio" + "fmt" + "os" + + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/buildtags" + . 
"github.com/mmcloughlin/avo/operand" + "github.com/mmcloughlin/avo/reg" +) + +// Technically we can do slightly bigger, but we stay reasonable. +const inputMax = 10 +const outputMax = 8 + +var switchDefs [inputMax][outputMax]string +var switchDefsX [inputMax][outputMax]string + +const perLoopBits = 5 +const perLoop = 1 << perLoopBits + +func main() { + Constraint(buildtags.Not("appengine").ToConstraint()) + Constraint(buildtags.Not("noasm").ToConstraint()) + Constraint(buildtags.Not("nogen").ToConstraint()) + Constraint(buildtags.Term("gc").ToConstraint()) + + for i := 1; i <= inputMax; i++ { + for j := 1; j <= outputMax; j++ { + //genMulAvx2(fmt.Sprintf("mulAvxTwoXor_%dx%d", i, j), i, j, true) + genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%d", i, j), i, j, false) + } + } + f, err := os.Create("galois_gen_switch_amd64.go") + if err != nil { + panic(err) + } + defer f.Close() + w := bufio.NewWriter(f) + defer w.Flush() + w.WriteString(`// Code generated by command: go generate ` + os.Getenv("GOFILE") + `. DO NOT EDIT. + +// +build !appengine +// +build !noasm +// +build gc +// +build !nogen + +package reedsolomon + +import "fmt" + +`) + + w.WriteString("const avx2CodeGen = true\n") + w.WriteString(fmt.Sprintf("const maxAvx2Inputs = %d\nconst maxAvx2Outputs = %d\n", inputMax, outputMax)) + w.WriteString(` + +func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { + n := stop-start +`) + + w.WriteString(fmt.Sprintf("n = (n>>%d)<<%d\n\n", perLoopBits, perLoopBits)) + w.WriteString(`switch len(in) { +`) + for in, defs := range switchDefs[:] { + w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) + for out, def := range defs[:] { + w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) + w.WriteString(def) + } + w.WriteString("}\n") + } + w.WriteString(`} + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} +`) + Generate() +} + +func genMulAvx2(name string, inputs int, outputs int, xor bool) { + total := inputs * outputs + + doc := []string{ + fmt.Sprintf("%s takes %d inputs and produces %d outputs.", name, inputs, outputs), + } + if !xor { + doc = append(doc, "The output is initialized to 0.") + } + + // Load shuffle masks on every use. + var loadNone bool + // Use registers for destination registers. + var regDst = true + + // lo, hi, 1 in, 1 out, 2 tmp, 1 mask + est := total*2 + outputs + 5 + if outputs == 1 { + // We don't need to keep a copy of the input if only 1 output. + est -= 2 + } + + if est > 16 { + loadNone = true + // We run out of GP registers first, now. + if inputs+outputs > 12 { + regDst = false + } + } + + TEXT(name, 0, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)")) + + // SWITCH DEFINITION: + s := fmt.Sprintf(" mulAvxTwo_%dx%d(matrix, in, out, start, n)\n", inputs, outputs) + s += fmt.Sprintf("\t\t\t\treturn n\n") + switchDefs[inputs-1][outputs-1] = s + + if loadNone { + Comment("Loading no tables to registers") + } else { + // loadNone == false + Comment("Loading all tables to registers") + } + + Doc(doc...) 
+ Pragma("noescape") + Commentf("Full registers estimated %d YMM used", est) + + length := Load(Param("n"), GP64()) + matrixBase := GP64() + MOVQ(Param("matrix").Base().MustAddr(), matrixBase) + SHRQ(U8(perLoopBits), length) + TESTQ(length, length) + JZ(LabelRef(name + "_end")) + + dst := make([]reg.VecVirtual, outputs) + dstPtr := make([]reg.GPVirtual, outputs) + outBase := Param("out").Base().MustAddr() + outSlicePtr := GP64() + MOVQ(outBase, outSlicePtr) + for i := range dst { + dst[i] = YMM() + if !regDst { + continue + } + ptr := GP64() + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) + dstPtr[i] = ptr + } + + inLo := make([]reg.VecVirtual, total) + inHi := make([]reg.VecVirtual, total) + + for i := range inLo { + if loadNone { + break + } + tableLo := YMM() + tableHi := YMM() + VMOVDQU(Mem{Base: matrixBase, Disp: i * 64}, tableLo) + VMOVDQU(Mem{Base: matrixBase, Disp: i*64 + 32}, tableHi) + inLo[i] = tableLo + inHi[i] = tableHi + } + + inPtrs := make([]reg.GPVirtual, inputs) + inSlicePtr := GP64() + MOVQ(Param("in").Base().MustAddr(), inSlicePtr) + for i := range inPtrs { + ptr := GP64() + MOVQ(Mem{Base: inSlicePtr, Disp: i * 24}, ptr) + inPtrs[i] = ptr + } + + tmpMask := GP64() + MOVQ(U32(15), tmpMask) + lowMask := YMM() + MOVQ(tmpMask, lowMask.AsX()) + VPBROADCASTB(lowMask.AsX(), lowMask) + + offset := GP64() + MOVQ(Param("start").MustAddr(), offset) + Label(name + "_loop") + if xor { + Commentf("Load %d outputs", outputs) + } else { + Commentf("Clear %d outputs", outputs) + } + for i := range dst { + if xor { + if regDst { + VMOVDQU(Mem{Base: dstPtr[i], Index: offset, Scale: 1}, dst[i]) + continue + } + ptr := GP64() + MOVQ(outBase, ptr) + VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i]) + } else { + VPXOR(dst[i], dst[i], dst[i]) + } + } + + lookLow, lookHigh := YMM(), YMM() + inLow, inHigh := YMM(), YMM() + for i := range inPtrs { + Commentf("Load and process 32 bytes from input %d to %d outputs", i, outputs) + VMOVDQU(Mem{Base: inPtrs[i], Index: offset, Scale: 1}, inLow) + VPSRLQ(U8(4), inLow, inHigh) + VPAND(lowMask, inLow, inLow) + VPAND(lowMask, inHigh, inHigh) + for j := range dst { + if loadNone { + VMOVDQU(Mem{Base: matrixBase, Disp: 64 * (i*outputs + j)}, lookLow) + VMOVDQU(Mem{Base: matrixBase, Disp: 32 + 64*(i*outputs+j)}, lookHigh) + VPSHUFB(inLow, lookLow, lookLow) + VPSHUFB(inHigh, lookHigh, lookHigh) + } else { + VPSHUFB(inLow, inLo[i*outputs+j], lookLow) + VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh) + } + VPXOR(lookLow, lookHigh, lookLow) + VPXOR(lookLow, dst[j], dst[j]) + } + } + Commentf("Store %d outputs", outputs) + for i := range dst { + if regDst { + VMOVDQU(dst[i], Mem{Base: dstPtr[i], Index: offset, Scale: 1}) + continue + } + ptr := GP64() + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) + VMOVDQU(dst[i], Mem{Base: ptr, Index: offset, Scale: 1}) + } + Comment("Prepare for next loop") + ADDQ(U8(perLoop), offset) + DECQ(length) + JNZ(LabelRef(name + "_loop")) + VZEROUPPER() + + Label(name + "_end") + RET() +} diff --git a/go.mod b/go.mod index be07c05f..a059d867 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,6 @@ module github.com/klauspost/reedsolomon go 1.14 -require github.com/klauspost/cpuid v1.2.4 +require ( + github.com/klauspost/cpuid v1.2.4 +) diff --git a/go.sum b/go.sum index e5928c34..5a44d819 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,2 @@ -github.com/klauspost/cpuid v1.2.3 h1:CCtW0xUnWGVINKvE/WWOYKdsPV6mawAtvQuSl8guwQs= -github.com/klauspost/cpuid v1.2.3/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/cpuid 
v1.2.4 h1:EBfaK0SWSwk+fgk6efYFWdzl8MwRWoOO1gkmiaTXPW4= github.com/klauspost/cpuid v1.2.4/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= diff --git a/reedsolomon.go b/reedsolomon.go index bc8654e3..13a35d21 100644 --- a/reedsolomon.go +++ b/reedsolomon.go @@ -113,6 +113,7 @@ type reedSolomon struct { tree inversionTree parity [][]byte o options + mPool sync.Pool } // ErrInvShardNum will be returned by New, if you attempt to create @@ -339,6 +340,11 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { r.parity[i] = r.m[dataShards+i] } + if avx2CodeGen && r.o.useAVX2 { + r.mPool.New = func() interface{} { + return make([]byte, r.Shards*2*32) + } + } return &r, err } @@ -353,7 +359,7 @@ var ErrTooFewShards = errors.New("too few shards given") // Each shard is a byte array, and they must all be the same size. // The parity shards will always be overwritten and the data shards // will remain the same. -func (r reedSolomon) Encode(shards [][]byte) error { +func (r *reedSolomon) Encode(shards [][]byte) error { if len(shards) != r.Shards { return ErrTooFewShards } @@ -374,7 +380,7 @@ func (r reedSolomon) Encode(shards [][]byte) error { // ErrInvalidInput is returned if an invalid input parameter is given to Update. var ErrInvalidInput = errors.New("invalid input") -func (r reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error { +func (r *reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error { if len(shards) != r.Shards { return ErrTooFewShards } @@ -414,7 +420,7 @@ func (r reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error { return nil } -func (r reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) { if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize { r.updateParityShardsP(matrixRows, oldinputs, newinputs, outputs, outputCount, byteCount) return @@ -434,7 +440,7 @@ func (r reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, output } } -func (r reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) { var wg sync.WaitGroup do := byteCount / r.o.maxGoroutines if do < r.o.minSplitSize { @@ -468,7 +474,7 @@ func (r reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outpu // Verify returns true if the parity shards contain the right data. // The data is the same format as Encode. No data is modified. -func (r reedSolomon) Verify(shards [][]byte) (bool, error) { +func (r *reedSolomon) Verify(shards [][]byte) (bool, error) { if len(shards) != r.Shards { return false, ErrTooFewShards } @@ -493,7 +499,10 @@ func (r reedSolomon) Verify(shards [][]byte) (bool, error) { // The number of outputs computed, and the // number of matrix rows used, is determined by // outputCount, which is the number of outputs to compute.
-func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { + if len(outputs) == 0 { + return + } switch { case r.o.useAVX512 && r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize && len(inputs) >= 4 && len(outputs) >= 2: r.codeSomeShardsAvx512P(matrixRows, inputs, outputs, outputCount, byteCount) @@ -511,6 +520,13 @@ func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, output if end > len(inputs[0]) { end = len(inputs[0]) } + if avx2CodeGen && r.o.useAVX2 && byteCount >= 32 && len(inputs) > 1 && len(outputs) > 1 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs { + m := genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte)) + start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount) + r.mPool.Put(m) + end = len(inputs[0]) + } + for start < len(inputs[0]) { for c := 0; c < r.DataShards; c++ { in := inputs[c][start:end] @@ -532,7 +548,7 @@ func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, output // Perform the same as codeSomeShards, but split the workload into // several goroutines. -func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { var wg sync.WaitGroup do := byteCount / r.o.maxGoroutines if do < r.o.minSplitSize { @@ -541,6 +557,11 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu // Make sizes divisible by 64 do = (do + 63) & (^63) start := 0 + var avx2Matrix []byte + if avx2CodeGen && r.o.useAVX2 && byteCount >= 32 && len(inputs) > 1 && len(outputs) > 1 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs { + avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte)) + defer r.mPool.Put(avx2Matrix) + } for start < byteCount { if start+do > byteCount { do = byteCount - start @@ -548,6 +569,10 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu wg.Add(1) go func(start, stop int) { + if avx2CodeGen && r.o.useAVX2 && stop-start >= 32 && len(inputs) > 1 && len(outputs) > 1 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs { + start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop) + } + lstart, lstop := start, start+r.o.perRound if lstop > stop { lstop = stop @@ -579,7 +604,7 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu // checkSomeShards is mostly the same as codeSomeShards, // except this will check values and return // as soon as a difference is found. 
-func (r reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool { +func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool { if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize { return r.checkSomeShardsP(matrixRows, inputs, toCheck, outputCount, byteCount) } @@ -602,7 +627,7 @@ func (r reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outpu return true } -func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool { +func (r *reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool { same := true var mu sync.RWMutex // For above @@ -706,7 +731,7 @@ func shardSize(shards [][]byte) int { // // The reconstructed shard set is complete, but integrity is not verified. // Use the Verify function to check if the data set is ok. -func (r reedSolomon) Reconstruct(shards [][]byte) error { +func (r *reedSolomon) Reconstruct(shards [][]byte) error { return r.reconstruct(shards, false) } @@ -725,7 +750,7 @@ func (r reedSolomon) Reconstruct(shards [][]byte) error { // // As the reconstructed shard set may contain missing parity shards, // calling the Verify function is likely to fail. -func (r reedSolomon) ReconstructData(shards [][]byte) error { +func (r *reedSolomon) ReconstructData(shards [][]byte) error { return r.reconstruct(shards, true) } @@ -737,7 +762,7 @@ func (r reedSolomon) ReconstructData(shards [][]byte) error { // // If there are too few shards to reconstruct the missing // ones, ErrTooFewShards will be returned. -func (r reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error { +func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error { if len(shards) != r.Shards { return ErrTooFewShards } @@ -896,7 +921,7 @@ var ErrShortData = errors.New("not enough data to fill the number of requested s // // The data will not be copied, except for the last shard, so you // should not modify the data of the input slice afterwards. -func (r reedSolomon) Split(data []byte) ([][]byte, error) { +func (r *reedSolomon) Split(data []byte) ([][]byte, error) { if len(data) == 0 { return nil, ErrShortData } @@ -945,7 +970,7 @@ var ErrReconstructRequired = errors.New("reconstruction required as one or more // If there are too few shards given, ErrTooFewShards will be returned. // If the total data size is less than outSize, ErrShortData will be returned. // If one or more required data shards are nil, ErrReconstructRequired will be returned. -func (r reedSolomon) Join(dst io.Writer, shards [][]byte, outSize int) error { +func (r *reedSolomon) Join(dst io.Writer, shards [][]byte, outSize int) error { // Do we have enough shards? if len(shards) < r.DataShards { return ErrTooFewShards diff --git a/reedsolomon_test.go b/reedsolomon_test.go index 63cb6066..d5480666 100644 --- a/reedsolomon_test.go +++ b/reedsolomon_test.go @@ -180,7 +180,7 @@ func TestEncoding(t *testing.T) { // matrix sizes to test. // note that par1 matrix will fail on some combinations.
-var testSizes = [][2]int{{1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {14, 7}, {41, 17}, {49, 1}} +var testSizes = [][2]int{{1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}} var testDataSizes = []int{10, 100, 1000, 10001, 100003, 1000055} var testDataSizesShort = []int{10, 10001, 100003} @@ -1546,6 +1546,7 @@ func benchmarkParallel(b *testing.B, dataShards, parityShards, shardSize int) { }) } +func BenchmarkParallel_8x8x64K(b *testing.B) { benchmarkParallel(b, 8, 8, 64<<10) } func BenchmarkParallel_8x8x05M(b *testing.B) { benchmarkParallel(b, 8, 8, 512<<10) } func BenchmarkParallel_20x10x05M(b *testing.B) { benchmarkParallel(b, 20, 10, 512<<10) } func BenchmarkParallel_8x8x1M(b *testing.B) { benchmarkParallel(b, 8, 8, 1<<20) }
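
For reference, a minimal sketch (not part of the patch) of how the new fast path is exercised from user code; the shard counts and sizes are illustrative. With 8 data and 5 parity shards the switch in galMulSlicesAvx2 dispatches to mulAvxTwo_8x5 on AVX2-capable amd64, while builds with `-tags=nogen` (or other platforms) fall back through the stubs in galois_gen_none.go.

```
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/klauspost/reedsolomon"
)

func main() {
	// 8 data + 5 parity stays inside the generated 10x8 kernel range.
	enc, err := reedsolomon.New(8, 5)
	if err != nil {
		panic(err)
	}
	shards := make([][]byte, 8+5)
	for i := range shards {
		shards[i] = make([]byte, 1<<20) // 1 MiB per shard, well above the 32-byte minimum
		if i < 8 {
			// Fill the data shards; parity shards are overwritten by Encode.
			if _, err := rand.Read(shards[i]); err != nil {
				panic(err)
			}
		}
	}
	// Encode computes the parity shards, running the generated AVX2
	// kernel over 32-byte blocks where available.
	if err := enc.Encode(shards); err != nil {
		panic(err)
	}
	ok, err := enc.Verify(shards)
	fmt.Println("parity verified:", ok, err)
}
```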