diff --git a/README.md b/README.md index aceff056..c53188ec 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,10 @@ go get -u github.com/klauspost/reedsolomon # Changes +## December 18, 2018 + +Assembly code for ppc64le has been contributed, this boosts performance by about 10x on this platform. + ## November 18, 2017 Added [WithAutoGoroutines](https://godoc.org/github.com/klauspost/reedsolomon#WithAutoGoroutines) which will attempt to calculate the optimal number of goroutines to use based on your expected shard size and detected CPU. @@ -259,6 +263,18 @@ By exploiting NEON instructions the performance for ARM has been accelerated. Be | 10 | 2 | 20% | 188 | 1738 | 925% | | 10 | 4 | 40% | 96 | 839 | 877% | +# Performance on ppc64le + +The performance for ppc64le has been accelerated. This gives roughly a 10x performance improvement on this architecture as can been seen below: + +``` +benchmark old MB/s new MB/s speedup +BenchmarkGalois128K-160 948.87 8878.85 9.36x +BenchmarkGalois1M-160 968.85 9041.92 9.33x +BenchmarkGaloisXor128K-160 862.02 7905.00 9.17x +BenchmarkGaloisXor1M-160 784.60 6296.65 8.03x +``` + # asm2plan9s [asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents. diff --git a/galois_noasm.go b/galois_noasm.go index 7577d72f..81d5597e 100644 --- a/galois_noasm.go +++ b/galois_noasm.go @@ -1,5 +1,6 @@ //+build !amd64 noasm appengine gccgo //+build !arm64 noasm appengine gccgo +//+build !ppc64le noasm appengine gccgo // Copyright 2015, Klaus Post, see LICENSE for details. diff --git a/galois_ppc64le.go b/galois_ppc64le.go new file mode 100644 index 00000000..9033279c --- /dev/null +++ b/galois_ppc64le.go @@ -0,0 +1,67 @@ +//+build !noasm +//+build !appengine +//+build !gccgo + +// Copyright 2015, Klaus Post, see LICENSE for details. +// Copyright 2018, Minio, Inc. + +package reedsolomon + +//go:noescape +func galMulPpc(low, high, in, out []byte) + +//go:noescape +func galMulPpcXor(low, high, in, out []byte) + +// This is what the assembler routines do in blocks of 16 bytes: +/* +func galMulPpc(low, high, in, out []byte) { + for n, input := range in { + l := input & 0xf + h := input >> 4 + out[n] = low[l] ^ high[h] + } +} +func galMulPpcXor(low, high, in, out []byte) { + for n, input := range in { + l := input & 0xf + h := input >> 4 + out[n] ^= low[l] ^ high[h] + } +} +*/ + +func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) { + done := (len(in) >> 4) << 4 + if done > 0 { + galMulPpc(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out) + } + remain := len(in) - done + if remain > 0 { + mt := mulTable[c] + for i := done; i < len(in); i++ { + out[i] = mt[in[i]] + } + } +} + +func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) { + done := (len(in) >> 4) << 4 + if done > 0 { + galMulPpcXor(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out) + } + remain := len(in) - done + if remain > 0 { + mt := mulTable[c] + for i := done; i < len(in); i++ { + out[i] ^= mt[in[i]] + } + } +} + +// slice galois add +func sliceXor(in, out []byte, sse2 bool) { + for n, input := range in { + out[n] ^= input + } +} diff --git a/galois_ppc64le.s b/galois_ppc64le.s new file mode 100644 index 00000000..960087c3 --- /dev/null +++ b/galois_ppc64le.s @@ -0,0 +1,126 @@ +//+build !noasm !appengine !gccgo + +// Copyright 2015, Klaus Post, see LICENSE for details. +// Copyright 2018, Minio, Inc. + +#include "textflag.h" + +#define LOW R3 +#define HIGH R4 +#define IN R5 +#define LEN R6 +#define OUT R7 +#define CONSTANTS R8 +#define OFFSET R9 +#define OFFSET1 R10 +#define OFFSET2 R11 + +#define X6 VS34 +#define X6_ V2 +#define X7 VS35 +#define X7_ V3 +#define MSG VS36 +#define MSG_ V4 +#define MSG_HI VS37 +#define MSG_HI_ V5 +#define RESULT VS38 +#define RESULT_ V6 +#define ROTATE VS39 +#define ROTATE_ V7 +#define MASK VS40 +#define MASK_ V8 +#define FLIP VS41 +#define FLIP_ V9 + + +// func galMulPpc(low, high, in, out []byte) +TEXT ·galMulPpc(SB), NOFRAME|NOSPLIT, $0-96 + MOVD low+0(FP), LOW + MOVD high+24(FP), HIGH + MOVD in+48(FP), IN + MOVD in_len+56(FP), LEN + MOVD out+72(FP), OUT + + MOVD $16, OFFSET1 + MOVD $32, OFFSET2 + + MOVD $·constants(SB), CONSTANTS + LXVD2X (CONSTANTS)(R0), ROTATE + LXVD2X (CONSTANTS)(OFFSET1), MASK + LXVD2X (CONSTANTS)(OFFSET2), FLIP + + LXVD2X (LOW)(R0), X6 + LXVD2X (HIGH)(R0), X7 + VPERM X6_, V31, FLIP_, X6_ + VPERM X7_, V31, FLIP_, X7_ + + MOVD $0, OFFSET + +loop: + LXVD2X (IN)(OFFSET), MSG + + VSRB MSG_, ROTATE_, MSG_HI_ + VAND MSG_, MASK_, MSG_ + VPERM X6_, V31, MSG_, MSG_ + VPERM X7_, V31, MSG_HI_, MSG_HI_ + + VXOR MSG_, MSG_HI_, MSG_ + + STXVD2X MSG, (OUT)(OFFSET) + + ADD $16, OFFSET, OFFSET + CMP LEN, OFFSET + BGT loop + RET + + +// func galMulPpcXorlow, high, in, out []byte) +TEXT ·galMulPpcXor(SB), NOFRAME|NOSPLIT, $0-96 + MOVD low+0(FP), LOW + MOVD high+24(FP), HIGH + MOVD in+48(FP), IN + MOVD in_len+56(FP), LEN + MOVD out+72(FP), OUT + + MOVD $16, OFFSET1 + MOVD $32, OFFSET2 + + MOVD $·constants(SB), CONSTANTS + LXVD2X (CONSTANTS)(R0), ROTATE + LXVD2X (CONSTANTS)(OFFSET1), MASK + LXVD2X (CONSTANTS)(OFFSET2), FLIP + + LXVD2X (LOW)(R0), X6 + LXVD2X (HIGH)(R0), X7 + VPERM X6_, V31, FLIP_, X6_ + VPERM X7_, V31, FLIP_, X7_ + + MOVD $0, OFFSET + +loopXor: + LXVD2X (IN)(OFFSET), MSG + LXVD2X (OUT)(OFFSET), RESULT + + VSRB MSG_, ROTATE_, MSG_HI_ + VAND MSG_, MASK_, MSG_ + VPERM X6_, V31, MSG_, MSG_ + VPERM X7_, V31, MSG_HI_, MSG_HI_ + + VXOR MSG_, MSG_HI_, MSG_ + VXOR MSG_, RESULT_, RESULT_ + + STXVD2X RESULT, (OUT)(OFFSET) + + ADD $16, OFFSET, OFFSET + CMP LEN, OFFSET + BGT loopXor + RET + +DATA ·constants+0x0(SB)/8, $0x0404040404040404 +DATA ·constants+0x8(SB)/8, $0x0404040404040404 +DATA ·constants+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f +DATA ·constants+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f +DATA ·constants+0x20(SB)/8, $0x0706050403020100 +DATA ·constants+0x28(SB)/8, $0x0f0e0d0c0b0a0908 + +GLOBL ·constants(SB), 8, $48