Skip to content

Commit

Permalink
Generate AVX2 code (#141)
Browse files Browse the repository at this point in the history
Replaces the AVX2 code path for configurations up to 10x8 with specifically generated functions.

If code size is a concern, build with `-tags=nogen` to leave out the generated functions.

The biggest speedup is seen when the workload is not memory constrained.
```
benchmark                                old MB/s      new MB/s      speedup
BenchmarkEncode_8x5x8M                   5895.75       9648.18       1.64x
BenchmarkEncode_8x5x8M-4                 16773.41      17220.67      1.03x
BenchmarkEncode_8x5x8M-16                18263.12      17176.28      0.94x
BenchmarkEncode_8x6x8M                   5075.89       8548.39       1.68x
BenchmarkEncode_8x6x8M-4                 14559.83      15370.95      1.06x
BenchmarkEncode_8x6x8M-16                16183.37      15291.98      0.94x
BenchmarkEncode_8x7x8M                   4481.18       7015.60       1.57x
BenchmarkEncode_8x7x8M-4                 12835.35      13695.90      1.07x
BenchmarkEncode_8x7x8M-16                14246.94      13737.36      0.96x 
BenchmarkEncode_8x8x05M                  5569.95       7947.70       1.43x
BenchmarkEncode_8x8x05M-4                17334.91      25271.37      1.46x
BenchmarkEncode_8x8x05M-16               29349.42      35043.36      1.19x
BenchmarkEncode_8x8x1M                   4830.58       7891.32       1.63x
BenchmarkEncode_8x8x1M-4                 17531.36      27371.42      1.56x
BenchmarkEncode_8x8x1M-16                29593.98      39241.09      1.33x
BenchmarkEncode_8x8x8M                   3953.66       6584.26       1.67x
BenchmarkEncode_8x8x8M-4                 11527.34      12331.23      1.07x
BenchmarkEncode_8x8x8M-16                12718.89      12173.08      0.96x
BenchmarkEncode_8x8x32M                  3927.51       6195.91       1.58x
BenchmarkEncode_8x8x32M-4                11490.85      11424.39      0.99x
BenchmarkEncode_8x8x32M-16               12506.09      11888.55      0.95x

benchmark                          old MB/s     new MB/s     speedup
BenchmarkParallel_8x8x64K          5490.24      6959.57      1.27x
BenchmarkParallel_8x8x64K-4        21078.94     29557.51     1.40x
BenchmarkParallel_8x8x64K-16       57508.45     73672.54     1.28x
BenchmarkParallel_8x8x1M           4755.49      7667.84      1.61x
BenchmarkParallel_8x8x1M-4         11818.66     12013.49     1.02x
BenchmarkParallel_8x8x1M-16        12923.12     12109.42     0.94x
BenchmarkParallel_8x8x8M           3973.94      6525.85      1.64x
BenchmarkParallel_8x8x8M-4         11725.68     11312.46     0.96x
BenchmarkParallel_8x8x8M-16        12608.20     11484.98     0.91x
BenchmarkParallel_8x3x1M           14139.71     17993.04     1.27x
BenchmarkParallel_8x3x1M-4         21805.97     23053.92     1.06x
BenchmarkParallel_8x3x1M-16        24673.05     23596.71     0.96x
BenchmarkParallel_8x4x1M           10617.88     14474.54     1.36x
BenchmarkParallel_8x4x1M-4         18635.82     18965.65     1.02x
BenchmarkParallel_8x4x1M-16        21518.12     20171.47     0.94x
BenchmarkParallel_8x5x1M           8669.88      11833.96     1.36x
BenchmarkParallel_8x5x1M-4         16321.00     17500.30     1.07x
BenchmarkParallel_8x5x1M-16        17267.16     17191.04     1.00x
```
  • Loading branch information
klauspost authored May 20, 2020
1 parent 01b307e commit 7daa20b
Show file tree
Hide file tree
Showing 12 changed files with 19,565 additions and 23 deletions.
27 changes: 27 additions & 0 deletions galois.go
Original file line number Diff line number Diff line change
Expand Up @@ -900,3 +900,30 @@ func galExp(a byte, n int) byte {
}
return expTable[logResult]
}

// genAvx2Matrix flattens the leading outputs x inputs coefficients of
// matrixRows into a byte table of multiplication lookups.
//
// Every coefficient takes up one 64-byte record: its mulTableLow entry is
// written at record offsets 0 and 16, and its mulTableHigh entry at offsets
// 32 and 48 (the offsets suggest 16-byte tables - confirm against the table
// declarations). Records are ordered by input first, then by output, i.e. the
// record for (input j, output i) starts at byte (j*outputs+i)*64.
//
// dst is reused as backing storage when its capacity is large enough;
// otherwise a new slice is allocated. The returned slice always has length
// inputs*outputs*64.
//
// The function panics with "codegen not enabled" when avx2CodeGen is false.
func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte {
	if !avx2CodeGen {
		panic("codegen not enabled")
	}

	// One record per coefficient: two copies of the low table followed by
	// two copies of the high table.
	const recordSize = 32 * 2
	size := inputs * outputs * recordSize
	if cap(dst) >= size {
		dst = dst[:size]
	} else {
		dst = make([]byte, size)
	}

	rows := matrixRows[:outputs]
	for outIdx := range rows {
		coeffs := rows[outIdx][:inputs]
		for inIdx := range coeffs {
			c := coeffs[inIdx]
			low, high := mulTableLow[c][:], mulTableHigh[c][:]

			// Input-major layout: all outputs of input 0, then input 1, ...
			record := dst[(inIdx*outputs+outIdx)*recordSize:]
			copy(record, low)
			copy(record[16:], low)
			copy(record[32:], high)
			copy(record[48:], high)
		}
	}
	return dst
}
8 changes: 5 additions & 3 deletions galoisAvx512_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@

package reedsolomon

import "sync"
import (
"sync"
)

//go:noescape
func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
Expand Down Expand Up @@ -224,7 +226,7 @@ func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outp

// Perform the same as codeSomeShards, but taking advantage of
// AVX512 parallelism for up to 4x faster execution as compared to AVX2
func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
// Process using no goroutines
start, end := 0, r.o.perRound
if end > byteCount {
Expand Down Expand Up @@ -271,7 +273,7 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte,

// Perform the same as codeSomeShards, but taking advantage of
// AVX512 parallelism for up to 4x faster execution as compared to AVX2
func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
var wg sync.WaitGroup
do := byteCount / r.o.maxGoroutines
if do < r.o.minSplitSize {
Expand Down
Loading

0 comments on commit 7daa20b

Please sign in to comment.