Skip to content

Commit

Permalink
Make sure assembler is formatted (#145)
Browse files (browse the repository at this point in the history)
* Make sure assembler is formatted
  • Loading branch information
klauspost authored May 14, 2020
1 parent 27f8a7b commit f338110
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 40 deletions.
13 changes: 10 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ go:
install:
- go get ./...

script:
script:
- go vet ./...
- go test -cpu=1,2 .
- go test -tags=noasm -cpu=1,2 .
Expand All @@ -29,18 +29,25 @@ script:
- go build examples/stream-decoder.go
- go build examples/stream-encoder.go

stages:
- gofmt
- test
- deploy

jobs:
allow_failures:
- go: 'master'
- arch: s390x
fast_finish: true
include:
- stage: gofmt
go: 1.14.x
os: linux
arch: amd64
script:
- diff <(gofmt -d .) <("")
- diff <(gofmt -d .) <(printf "")
- diff <(gofmt -d ./examples) <(printf "")
- go install github.com/klauspost/asmfmt/cmd/asmfmt
- diff <(asmfmt -d .) <(printf "")
- stage: race
go: 1.14.x
os: linux
Expand Down
28 changes: 14 additions & 14 deletions galoisAvx512_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
VPTERNLOGD $0x96, LO, HI, OUT

// GALOIS fills LO and HI with 128-bit lanes of IN picked by the VSHUFI64X2
// immediates C1 and C2 respectively (IN is used as both shuffle sources), then
// expands GALOIS_MUL with LO and HI passed as both its first and second
// register pair and OUT as its last argument.
// NOTE(review): GALOIS_MUL is defined outside this view; presumably it
// accumulates the product into OUT - confirm against its definition.
#define GALOIS(C1, C2, IN, LO, HI, OUT) \
VSHUFI64X2 $C1, IN, IN, LO \
VSHUFI64X2 $C2, IN, IN, HI \
VSHUFI64X2 $C1, IN, IN, LO \
VSHUFI64X2 $C2, IN, IN, HI \
GALOIS_MUL(LO, HI, LO, HI, OUT)

//
Expand Down Expand Up @@ -73,49 +73,49 @@ TEXT ·_galMulAVX512Parallel81(SB), 7, $0
loopback_avx512_parallel81:
VMOVDQU64.Z (DX), K1, Z4

LOAD(0x00) // &in[0][0]
LOAD(0x00) // &in[0][0]
GALOIS_MUL(Z16, Z20, Z14, Z15, Z4)

CMPQ AX, $1
JE skip_avx512_parallel81

LOAD(0x18) // &in[1][0]
LOAD(0x18) // &in[1][0]
GALOIS_MUL(Z24, Z28, Z14, Z15, Z4)

CMPQ AX, $2
JE skip_avx512_parallel81

LOAD(0x30) // &in[2][0]
LOAD(0x30) // &in[2][0]
GALOIS_MUL(Z17, Z21, Z14, Z15, Z4)

CMPQ AX, $3
JE skip_avx512_parallel81

LOAD(0x48) // &in[3][0]
LOAD(0x48) // &in[3][0]
GALOIS_MUL(Z25, Z29, Z14, Z15, Z4)

CMPQ AX, $4
JE skip_avx512_parallel81

LOAD(0x60) // &in[4][0]
LOAD(0x60) // &in[4][0]
GALOIS_MUL(Z18, Z22, Z14, Z15, Z4)

CMPQ AX, $5
JE skip_avx512_parallel81

LOAD(0x78) // &in[5][0]
LOAD(0x78) // &in[5][0]
GALOIS_MUL(Z26, Z30, Z14, Z15, Z4)

CMPQ AX, $6
JE skip_avx512_parallel81

LOAD(0x90) // &in[6][0]
LOAD(0x90) // &in[6][0]
GALOIS_MUL(Z19, Z23, Z14, Z15, Z4)

CMPQ AX, $7
JE skip_avx512_parallel81

LOAD(0xa8) // &in[7][0]
LOAD(0xa8) // &in[7][0]
GALOIS_MUL(Z27, Z31, Z14, Z15, Z4)

skip_avx512_parallel81:
Expand Down Expand Up @@ -191,28 +191,28 @@ loopback_avx512_parallel82:
VMOVDQU64.Z (DX), K1, Z4
VMOVDQU64.Z (CX), K1, Z5

LOAD(0x00) // &in[0][0]
LOAD(0x00) // &in[0][0]
GALOIS_MUL(Z16, Z24, Z14, Z15, Z4)
GALOIS_MUL(Z20, Z27, Z12, Z13, Z5)

CMPQ AX, $1
JE skip_avx512_parallel82

LOAD(0x18) // &in[1][0]
LOAD(0x18) // &in[1][0]
GALOIS_MUL(Z25, Z26, Z14, Z15, Z4)
GALOIS_MUL(Z28, Z29, Z12, Z13, Z5)

CMPQ AX, $2
JE skip_avx512_parallel82

LOAD(0x30) // &in[2][0]
LOAD(0x30) // &in[2][0]
GALOIS_MUL(Z17, Z30, Z14, Z15, Z4)
GALOIS_MUL(Z21, Z8, Z12, Z13, Z5)

CMPQ AX, $3
JE skip_avx512_parallel82

LOAD(0x48) // &in[3][0]
LOAD(0x48) // &in[3][0]
GALOIS_MUL(Z31, Z11, Z14, Z15, Z4)
GALOIS_MUL(Z9, Z10, Z12, Z13, Z5)

Expand Down
46 changes: 23 additions & 23 deletions galois_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@

// func galMulNEON(low, high, in, out []byte)
TEXT ·galMulNEON(SB), 7, $0
MOVD in_base+48(FP), R1
MOVD in_len+56(FP), R2 // length of message
MOVD in_base+48(FP), R1
MOVD in_len+56(FP), R2 // length of message
MOVD out_base+72(FP), R5
SUBS $32, R2
BMI complete

MOVD low+0(FP), R10 // R10: &low
MOVD low+0(FP), R10 // R10: &low
MOVD high+24(FP), R11 // R11: &high
VLD1 (R10), [V6.B16]
VLD1 (R11), [V7.B16]
Expand All @@ -22,7 +22,7 @@ TEXT ·galMulNEON(SB), 7, $0
//
MOVD $0x0f, R3
VMOV R3, V8.B[0]
VDUP V8.B[0], V8.B16
VDUP V8.B[0], V8.B16

loop:
// Main loop
Expand All @@ -31,8 +31,8 @@ loop:
// Get low input and high input
VUSHR $4, V0.B16, V10.B16
VUSHR $4, V1.B16, V11.B16
VAND V8.B16, V0.B16, V0.B16
VAND V8.B16, V1.B16, V1.B16
VAND V8.B16, V0.B16, V0.B16
VAND V8.B16, V1.B16, V1.B16

// Mul low part and mul high part
VTBL V0.B16, [V6.B16], V4.B16
Expand All @@ -41,8 +41,8 @@ loop:
VTBL V11.B16, [V7.B16], V15.B16

// Combine results
VEOR V5.B16, V4.B16, V4.B16
VEOR V15.B16, V14.B16, V5.B16
VEOR V5.B16, V4.B16, V4.B16
VEOR V15.B16, V14.B16, V5.B16

// Store result
VST1.P [V4.D2, V5.D2], 32(R5)
Expand All @@ -53,16 +53,15 @@ loop:
complete:
RET


// func galMulXorNEON(low, high, in, out []byte)
TEXT ·galMulXorNEON(SB), 7, $0
MOVD in_base+48(FP), R1
MOVD in_len+56(FP), R2 // length of message
MOVD in_base+48(FP), R1
MOVD in_len+56(FP), R2 // length of message
MOVD out_base+72(FP), R5
SUBS $32, R2
BMI completeXor

MOVD low+0(FP), R10 // R10: &low
MOVD low+0(FP), R10 // R10: &low
MOVD high+24(FP), R11 // R11: &high
VLD1 (R10), [V6.B16]
VLD1 (R11), [V7.B16]
Expand All @@ -73,7 +72,7 @@ TEXT ·galMulXorNEON(SB), 7, $0
//
MOVD $0x0f, R3
VMOV R3, V8.B[0]
VDUP V8.B[0], V8.B16
VDUP V8.B[0], V8.B16

loopXor:
// Main loop
Expand All @@ -83,8 +82,8 @@ loopXor:
// Get low input and high input
VUSHR $4, V0.B16, V10.B16
VUSHR $4, V1.B16, V11.B16
VAND V8.B16, V0.B16, V0.B16
VAND V8.B16, V1.B16, V1.B16
VAND V8.B16, V0.B16, V0.B16
VAND V8.B16, V1.B16, V1.B16

// Mul low part and mul high part
VTBL V0.B16, [V6.B16], V4.B16
Expand All @@ -93,10 +92,10 @@ loopXor:
VTBL V11.B16, [V7.B16], V15.B16

// Combine results
VEOR V5.B16, V4.B16, V4.B16
VEOR V15.B16, V14.B16, V5.B16
VEOR V20.B16, V4.B16, V4.B16
VEOR V21.B16, V5.B16, V5.B16
VEOR V5.B16, V4.B16, V4.B16
VEOR V15.B16, V14.B16, V5.B16
VEOR V20.B16, V4.B16, V4.B16
VEOR V21.B16, V5.B16, V5.B16

// Store result
VST1.P [V4.D2, V5.D2], 32(R5)
Expand All @@ -109,8 +108,8 @@ completeXor:

// func galXorNEON(in, out []byte)
TEXT ·galXorNEON(SB), 7, $0
MOVD in_base+0(FP), R1
MOVD in_len+8(FP), R2 // length of message
MOVD in_base+0(FP), R1
MOVD in_len+8(FP), R2 // length of message
MOVD out_base+24(FP), R5
SUBS $32, R2
BMI completeXor
Expand All @@ -120,8 +119,8 @@ loopXor:
VLD1.P 32(R1), [V0.B16, V1.B16]
VLD1 (R5), [V20.B16, V21.B16]

VEOR V20.B16, V0.B16, V4.B16
VEOR V21.B16, V1.B16, V5.B16
VEOR V20.B16, V0.B16, V4.B16
VEOR V21.B16, V1.B16, V5.B16

// Store result
VST1.P [V4.D2, V5.D2], 32(R5)
Expand All @@ -131,3 +130,4 @@ loopXor:

completeXor:
RET

0 comments on commit f338110

Please sign in to comment.