Skip to content

Commit

Permalink
AVX512 accelerated version resulting in a 4x speed improvement over A…
Browse files Browse the repository at this point in the history
…VX2 (#91)

The performance on AVX512 has been accelerated for Intel CPUs. This gives speedups on a per-core basis of up to 4x compared to AVX2 as can be seen in the following table:

```
$ benchcmp avx2.txt avx512.txt
benchmark                      AVX2 MB/s    AVX512 MB/s   speedup
BenchmarkEncode8x8x1M-72       1681.35      4125.64       2.45x
BenchmarkEncode8x4x8M-72       1529.36      5507.97       3.60x
BenchmarkEncode8x8x8M-72        791.16      2952.29       3.73x
BenchmarkEncode8x8x32M-72       573.26      2168.61       3.78x
BenchmarkEncode12x4x12M-72     1234.41      4912.37       3.98x
BenchmarkEncode16x4x16M-72     1189.59      5138.01       4.32x
BenchmarkEncode24x8x24M-72      690.68      2583.70       3.74x
BenchmarkEncode24x8x48M-72      674.20      2643.31       3.92x
```
  • Loading branch information
fwessels authored and klauspost committed Feb 10, 2019
1 parent 8885f3a commit 79aee05
Show file tree
Hide file tree
Showing 11 changed files with 1,211 additions and 39 deletions.
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ go get -u github.com/klauspost/reedsolomon

# Changes

## February 8, 2019

AVX512 accelerated version added for Intel Skylake CPUs. This can give up to a 4x speed improvement as compared to AVX2. See [here](https://github.com/klauspost/reedsolomon#performance-on-avx512) for more details.

## December 18, 2018

Assembly code for ppc64le has been contributed; this boosts performance by about 10x on this platform.
Expand Down Expand Up @@ -253,6 +257,25 @@ BenchmarkReconstruct50x20x1M-8 1364.35 4189.79 3.07x
BenchmarkReconstruct10x4x16M-8 1484.35 5779.53 3.89x
```
# Performance on AVX512
Performance has been accelerated on Intel CPUs that support AVX512. This gives per-core speedups of up to 4x compared to AVX2, as can be seen in the following table:
```
$ benchcmp avx2.txt avx512.txt
benchmark AVX2 MB/s AVX512 MB/s speedup
BenchmarkEncode8x8x1M-72 1681.35 4125.64 2.45x
BenchmarkEncode8x4x8M-72 1529.36 5507.97 3.60x
BenchmarkEncode8x8x8M-72 791.16 2952.29 3.73x
BenchmarkEncode8x8x32M-72 573.26 2168.61 3.78x
BenchmarkEncode12x4x12M-72 1234.41 4912.37 3.98x
BenchmarkEncode16x4x16M-72 1189.59 5138.01 4.32x
BenchmarkEncode24x8x24M-72 690.68 2583.70 3.74x
BenchmarkEncode24x8x48M-72 674.20 2643.31 3.92x
```
This speedup has been achieved by computing multiple parity blocks in parallel as opposed to one after the other. In doing so it is possible to minimize the memory bandwidth required for loading all data shards. At the same time the calculations are performed in the 512-bit wide ZMM registers and the surplus of ZMM registers (32 in total) is used to keep more data around (most notably the matrix coefficients).
# Performance on ARM64 NEON
By exploiting NEON instructions the performance for ARM has been accelerated. Below are the performance numbers for a single core on an ARM Cortex-A53 CPU @ 1.2GHz (Debian 8.0 Jessie running Go: 1.7.4):
Expand Down
184 changes: 184 additions & 0 deletions galoisAvx512_amd64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
//+build !noasm
//+build !appengine
//+build !gccgo

// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2019, Minio, Inc.

package reedsolomon

// _galMulAVX512Parallel82 is declared without a Go body, so its implementation
// is supplied outside this file (presumably the accompanying amd64 assembly —
// confirm).
//
// The Go wrapper galMulAVX512Parallel82 calls it with at most dimIn input
// slices, at most dimOut82 output slices, the coefficient block filled in by
// setupMatrix82, and addTo set to true for every batch of inputs except the
// one starting at input 0.
//
// NOTE(review): the wrapper finishes the bytes past the last multiple of 64 in
// Go, which suggests this routine only processes whole 64-byte blocks — confirm
// against the implementation.
//
//go:noescape
func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)

// _galMulAVX512Parallel84 is declared without a Go body, so its implementation
// is supplied outside this file (presumably the accompanying amd64 assembly —
// confirm).
//
// The Go wrapper galMulAVX512Parallel84 calls it with at most dimIn input
// slices, at most dimOut84 output slices, the coefficient block filled in by
// setupMatrix84, and addTo set to true for every batch of inputs except the
// one starting at input 0.
//
// NOTE(review): the wrapper finishes the bytes past the last multiple of 64 in
// Go, which suggests this routine only processes whole 64-byte blocks — confirm
// against the implementation.
//
//go:noescape
func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool)

const (
	// dimIn is the number of input rows processed simultaneously.
	dimIn = 8

	// dimOut82 is the number of output rows processed simultaneously by the x2 routine.
	dimOut82 = 2

	// dimOut84 is the number of output rows processed simultaneously by the x4 routine.
	dimOut84 = 4

	// matrixSize82 is the size in bytes of the matrix coefficient block passed
	// into the x2 routine: one (16 + 16)-byte entry per input row / output row pair.
	matrixSize82 = dimIn * dimOut82 * (16 + 16)

	// matrixSize84 is the size in bytes of the matrix coefficient block passed
	// into the x4 routine: one (16 + 16)-byte entry per input row / output row pair.
	matrixSize84 = dimIn * dimOut84 * (16 + 16)
)

// setupMatrix82 constructs the block of matrix coefficients used to compute
// 2 output rows in parallel.
//
// It visits the dimIn input columns starting at inputOffset and, for each, the
// dimOut82 output rows starting at outputOffset. Every (row, column) pair owns
// one 32-byte slot of matrix: the mulTableLow entry of the coefficient is
// copied in starting at byte 0 of the slot and the mulTableHigh entry starting
// at byte 16. Slots are grouped by output row, so the pair (row r, column c),
// both counted from their offsets, lives in slot r*dimIn+c. A column that lies
// beyond the end of a matrix row gets an all-zero slot.
func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) {
	const slotSize = 16 + 16

	for col := 0; col < dimIn; col++ {
		c := inputOffset + col
		for row := 0; row < dimOut82; row++ {
			coeffs := matrixRows[outputOffset+row]
			base := (row*dimIn + col) * slotSize

			if c >= len(coeffs) {
				// Coefficients are not used for this input shard, so null them out.
				slot := matrix[base : base+slotSize]
				for i := range slot {
					slot[i] = 0
				}
				continue
			}

			coeff := coeffs[c]
			copy(matrix[base:], mulTableLow[coeff][:])
			copy(matrix[base+16:], mulTableHigh[coeff][:])
		}
	}
}

// setupMatrix84 constructs the block of matrix coefficients used to compute
// 4 output rows in parallel.
//
// It visits the dimIn input columns starting at inputOffset and, for each, the
// dimOut84 output rows starting at outputOffset. Every (row, column) pair owns
// one 32-byte slot of matrix: the mulTableLow entry of the coefficient is
// copied in starting at byte 0 of the slot and the mulTableHigh entry starting
// at byte 16. Slots are grouped by output row, so the pair (row r, column c),
// both counted from their offsets, lives in slot r*dimIn+c. A column that lies
// beyond the end of a matrix row gets an all-zero slot.
func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) {
	const slotSize = 16 + 16

	for col := 0; col < dimIn; col++ {
		c := inputOffset + col
		for row := 0; row < dimOut84; row++ {
			coeffs := matrixRows[outputOffset+row]
			base := (row*dimIn + col) * slotSize

			if c >= len(coeffs) {
				// Coefficients are not used for this input shard, so null them out.
				slot := matrix[base : base+slotSize]
				for i := range slot {
					slot[i] = 0
				}
				continue
			}

			coeff := coeffs[c]
			copy(matrix[base:], mulTableLow[coeff][:])
			copy(matrix[base+16:], mulTableHigh[coeff][:])
		}
	}
}

// galMulAVX512Parallel82 invokes the AVX512 routine that computes 2 output
// rows in parallel for one batch of up to dimIn inputs.
//
// in and out are the full sets of input and output shards; inputOffset and
// outputOffset select the batch. matrixRows holds the coefficients, indexed
// as matrixRows[outputRow][inputColumn]. The shard length is taken from
// in[0]; nothing is done when it is zero.
//
// Bytes past the last multiple of 64 are not left to the AVX512 routine: they
// are computed afterwards in Go with mulTable.
func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset int) {
	size := len(in[0])
	if size == 0 {
		return
	}

	// Clamp the batch to the shards that are actually present.
	inputEnd := inputOffset + dimIn
	if n := len(in); inputEnd > n {
		inputEnd = n
	}
	outputEnd := outputOffset + dimOut82
	if n := len(out); outputEnd > n {
		outputEnd = n
	}

	var matrix82 [matrixSize82]byte
	setupMatrix82(matrixRows, inputOffset, outputOffset, &matrix82)
	// Except for the first batch of inputs, add to the previous results.
	addTo := inputOffset != 0
	_galMulAVX512Parallel82(in[inputOffset:inputEnd], out[outputOffset:outputEnd], &matrix82, addTo)

	// Round the length down to a multiple of 64; anything beyond that is
	// handled byte by byte below.
	tail := size &^ 63
	if tail == size {
		return
	}

	for c := inputOffset; c < inputOffset+dimIn; c++ {
		for iRow := outputOffset; iRow < outputOffset+dimOut82; iRow++ {
			coeffs := matrixRows[iRow]
			if c >= len(coeffs) {
				continue
			}
			mt := mulTable[coeffs[c]]
			src, dst := in[c], out[iRow]

			if c == 0 {
				// Only the very first input column sets the output value...
				for i := tail; i < size; i++ {
					dst[i] = mt[src[i]]
				}
				continue
			}
			// ...every other column is added on top of it.
			for i := tail; i < size; i++ {
				dst[i] ^= mt[src[i]]
			}
		}
	}
}

// galMulAVX512Parallel84 invokes the AVX512 routine that computes 4 output
// rows in parallel for one batch of up to dimIn inputs.
//
// in and out are the full sets of input and output shards; inputOffset and
// outputOffset select the batch. matrixRows holds the coefficients, indexed
// as matrixRows[outputRow][inputColumn]. The shard length is taken from
// in[0]; nothing is done when it is zero.
//
// Bytes past the last multiple of 64 are not left to the AVX512 routine: they
// are computed afterwards in Go with mulTable.
func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset int) {
	size := len(in[0])
	if size == 0 {
		return
	}

	// Clamp the batch to the shards that are actually present.
	inputEnd := inputOffset + dimIn
	if n := len(in); inputEnd > n {
		inputEnd = n
	}
	outputEnd := outputOffset + dimOut84
	if n := len(out); outputEnd > n {
		outputEnd = n
	}

	var matrix84 [matrixSize84]byte
	setupMatrix84(matrixRows, inputOffset, outputOffset, &matrix84)
	// Except for the first batch of inputs, add to the previous results.
	addTo := inputOffset != 0
	_galMulAVX512Parallel84(in[inputOffset:inputEnd], out[outputOffset:outputEnd], &matrix84, addTo)

	// Round the length down to a multiple of 64; anything beyond that is
	// handled byte by byte below.
	tail := size &^ 63
	if tail == size {
		return
	}

	for c := inputOffset; c < inputOffset+dimIn; c++ {
		for iRow := outputOffset; iRow < outputOffset+dimOut84; iRow++ {
			coeffs := matrixRows[iRow]
			if c >= len(coeffs) {
				continue
			}
			mt := mulTable[coeffs[c]]
			src, dst := in[c], out[iRow]

			if c == 0 {
				// Only the very first input column sets the output value...
				for i := tail; i < size; i++ {
					dst[i] = mt[src[i]]
				}
				continue
			}
			// ...every other column is added on top of it.
			for i := tail; i < size; i++ {
				dst[i] ^= mt[src[i]]
			}
		}
	}
}

// codeSomeShardsAvx512 performs the same work as codeSomeShards, but takes
// advantage of AVX512 parallelism for up to 4x faster execution as compared
// to AVX2.
//
// The output rows are consumed in three stages: as many groups of dimOut84
// rows as fit, then at most one group of dimOut82 rows, and finally a single
// leftover row (for uneven parity) which goes through galMulSlice and
// galMulSliceXor. Within each group the inputs are walked in batches of dimIn.
//
// outputCount and byteCount are not used by this implementation.
func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
	numInputs, numOutputs := len(inputs), len(outputs)
	row := 0

	// Stage 1: (multiple) batches of 4 output rows in parallel.
	for row+dimOut84 <= numOutputs {
		for col := 0; col < numInputs; col += dimIn {
			galMulAVX512Parallel84(inputs, outputs, matrixRows, col, row)
		}
		row += dimOut84
	}

	// Stage 2: a (single) batch of 2 output rows in parallel.
	if row+dimOut82 <= numOutputs {
		for col := 0; col < numInputs; col += dimIn {
			galMulAVX512Parallel82(inputs, outputs, matrixRows, col, row)
		}
		row += dimOut82
	}

	// Stage 3: possibly one output row left over.
	if row >= numOutputs {
		return
	}
	for c := 0; c < r.DataShards; c++ {
		if c > 0 {
			// Later inputs are accumulated onto the row.
			galMulSliceXor(matrixRows[row][c], inputs[c], outputs[row], &r.o)
			continue
		}
		// The first input initialises the row.
		galMulSlice(matrixRows[row][c], inputs[c], outputs[row], &r.o)
	}
}
Loading

0 comments on commit 79aee05

Please sign in to comment.