From 98195d7a6d2b1d57768db98a5ffc85ead1ff438a Mon Sep 17 00:00:00 2001 From: Vlad Date: Wed, 9 May 2018 22:54:50 +0300 Subject: [PATCH] Optimize echo, whirlpool. Faster x16/x17/xevan --- kernel/wolf-echo.cl | 173 ++++++++++++-------------------------------- kernel/x16.cl | 137 +++++++++++------------------------ kernel/x17.cl | 32 +++----- kernel/xevan.cl | 10 +-- 4 files changed, 102 insertions(+), 250 deletions(-) diff --git a/kernel/wolf-echo.cl b/kernel/wolf-echo.cl index 9e85d249..087f0d40 100644 --- a/kernel/wolf-echo.cl +++ b/kernel/wolf-echo.cl @@ -19,97 +19,92 @@ uint4 Echo_AES_Round(const __local uint *AES0, const __local uint *AES1, const _ { uint4 Y; Y.s0 = AES0[(BYTE0(X.s0))] ^ AES1[BYTE1(X.s1)] ^ AES2[BYTE2(X.s2)] ^ AES3[BYTE3(X.s3)]; - Y.s1 = AES0[(BYTE0(X.s1))] ^ AES1[BYTE1(X.s2)] ^ AES2[BYTE2(X.s3)] ^ AES3[BYTE3(X.s0)]; - Y.s2 = AES0[(BYTE0(X.s2))] ^ AES1[BYTE1(X.s3)] ^ AES2[BYTE2(X.s0)] ^ AES3[BYTE3(X.s1)]; - Y.s3 = AES0[(BYTE0(X.s3))] ^ AES1[BYTE1(X.s0)] ^ AES2[BYTE2(X.s1)] ^ AES3[BYTE3(X.s2)]; - return(Y); + Y.s1 = AES0[(BYTE0(X.s1))] ^ AES1[BYTE1(X.s2)] ^ AES2[BYTE2(X.s3)] ^ AES3[BYTE3(X.s0)]; + Y.s2 = AES0[(BYTE0(X.s2))] ^ AES1[BYTE1(X.s3)] ^ AES2[BYTE2(X.s0)] ^ AES3[BYTE3(X.s1)]; + Y.s3 = AES0[(BYTE0(X.s3))] ^ AES1[BYTE1(X.s0)] ^ AES2[BYTE2(X.s1)] ^ AES3[BYTE3(X.s2)]; + return(Y); } uint4 Echo_AES_Round_Small(const __local uint *AES0, const uint4 X) { uint4 Y; Y.s0 = AES0[(BYTE0(X.s0))] ^ rotate(AES0[BYTE1(X.s1)], 8U) ^ rotate(AES0[BYTE2(X.s2)], 16U) ^ rotate(AES0[BYTE3(X.s3)], 24U); - Y.s1 = AES0[(BYTE0(X.s1))] ^ rotate(AES0[BYTE1(X.s2)], 8U) ^ rotate(AES0[BYTE2(X.s3)], 16U) ^ rotate(AES0[BYTE3(X.s0)], 24U); - Y.s2 = AES0[(BYTE0(X.s2))] ^ rotate(AES0[BYTE1(X.s3)], 8U) ^ rotate(AES0[BYTE2(X.s0)], 16U) ^ rotate(AES0[BYTE3(X.s1)], 24U); - Y.s3 = AES0[(BYTE0(X.s3))] ^ rotate(AES0[BYTE1(X.s0)], 8U) ^ rotate(AES0[BYTE2(X.s1)], 16U) ^ rotate(AES0[BYTE3(X.s2)], 24U); - return(Y); + Y.s1 = AES0[(BYTE0(X.s1))] ^ rotate(AES0[BYTE1(X.s2)], 8U) ^ rotate(AES0[BYTE2(X.s3)], 16U) ^ rotate(AES0[BYTE3(X.s0)], 24U); + Y.s2 = AES0[(BYTE0(X.s2))] ^ rotate(AES0[BYTE1(X.s3)], 8U) ^ rotate(AES0[BYTE2(X.s0)], 16U) ^ rotate(AES0[BYTE3(X.s1)], 24U); + Y.s3 = AES0[(BYTE0(X.s3))] ^ rotate(AES0[BYTE1(X.s0)], 8U) ^ rotate(AES0[BYTE2(X.s1)], 16U) ^ rotate(AES0[BYTE3(X.s2)], 24U); + return(Y); } -void BigSubBytesSmall(const __local uint *restrict AES0, uint4 *restrict W, uchar rnd) +void BigSubBytesSmall(const __local uint *restrict AES0, uint4 *restrict W, uint k0) { #pragma unroll for(int x = 0; x < 16; ++x) { uint4 tmp; tmp = Echo_AES_Round_Small(AES0, W[x]); - tmp.s0 ^= (rnd << 4) + x + 512; + tmp.s0 ^= k0 | x | 0x200; W[x] = Echo_AES_Round_Small(AES0, tmp); } } -void BigSubBytesSmall80(const __local uint *restrict AES0, uint4 *restrict W, uchar rnd) +void BigSubBytesSmall80(const __local uint *restrict AES0, uint4 *restrict W, uint k0) { #pragma unroll for(int x = 0; x < 16; ++x) { uint4 tmp; tmp = Echo_AES_Round_Small(AES0, W[x]); - tmp.s0 ^= (rnd << 4) + x + 640; + tmp.s0 ^= (k0 | x) + 0x280; W[x] = Echo_AES_Round_Small(AES0, tmp); } } -//#if defined(WOLF_X11_CL) -#if 0 -void BigSubBytes(const __local uint *restrict AES0, const __local uint *restrict AES1, const __local uint *restrict AES2, const __local uint *restrict AES3, uint *restrict W, uchar rnd) -#else -void BigSubBytes(const __local uint *restrict AES0, const __local uint *restrict AES1, const __local uint *restrict AES2, const __local uint *restrict AES3, uint4 *restrict W, uchar rnd) -#endif +void BigSubBytes0(const __local uint 
*restrict AES0, const __local uint *restrict AES1, const __local uint *restrict AES2, const __local uint *restrict AES3, uint4 *restrict W, uint k0) { - //#if defined(WOLF_X11_CL) - #if 0 - - #if defined(WOLF_X11_CL) - #pragma unroll 8 - #else - #pragma unroll 4 - #endif + #pragma unroll for(int x = 0; x < 16; ++x) { - const uint idx = x << 2; - uint tmp[4]; - - #if defined(WOLF_X11_CL) - #pragma unroll 2 - #else - #pragma unroll - #endif - for(int i = 0; i < 4; ++i) - tmp[i] = AES0[BYTE0(W[idx + i])] ^ AES1[BYTE1(W[idx + ((i + 1) & 3)])] ^ AES2[BYTE2(W[idx + ((i + 2) & 3)])] ^ AES3[BYTE3(W[idx + ((i + 3) & 3)])]; - - tmp[0] ^= (rnd << 4) + x + 512; - - #if defined(WOLF_X11_CL) - #pragma unroll 2 - #else - #pragma unroll - #endif - for(int i = 0; i < 4; ++i) - W[idx + i] = AES0[BYTE0(tmp[i])] ^ AES1[BYTE1(tmp[(i + 1) & 3])] ^ AES2[BYTE2(tmp[(i + 2) & 3])] ^ AES3[BYTE3(tmp[(i + 3) & 3])]; + uint4 tmp; + tmp = Echo_AES_Round(AES0, AES1, AES2, AES3, W[x]); + tmp.s0 ^= (k0 | x); + W[x] = Echo_AES_Round(AES0, AES1, AES2, AES3, tmp); } +} - #else +void BigSubBytes(const __local uint *restrict AES0, const __local uint *restrict AES1, const __local uint *restrict AES2, const __local uint *restrict AES3, uint4 *restrict W, uint k0) +{ + #pragma unroll + for(int x = 0; x < 16; ++x) + { + uint4 tmp; + tmp = Echo_AES_Round(AES0, AES1, AES2, AES3, W[x]); + tmp.s0 ^= k0 | x | 0x200; + W[x] = Echo_AES_Round(AES0, AES1, AES2, AES3, tmp); + } +} +void BigSubBytes80(const __local uint *restrict AES0, const __local uint *restrict AES1, const __local uint *restrict AES2, const __local uint *restrict AES3, uint4 *restrict W, uint k0) +{ #pragma unroll for(int x = 0; x < 16; ++x) { uint4 tmp; tmp = Echo_AES_Round(AES0, AES1, AES2, AES3, W[x]); - tmp.s0 ^= (rnd << 4) + x + 512; + tmp.s0 ^= (k0 | x) + 0x280; W[x] = Echo_AES_Round(AES0, AES1, AES2, AES3, tmp); } +} - #endif +void BigSubBytes128(const __local uint *restrict AES0, const __local uint *restrict AES1, const __local uint *restrict AES2, const __local uint *restrict AES3, uint4 *restrict W, uint k0) +{ + #pragma unroll + for(int x = 0; x < 16; ++x) + { + uint4 tmp; + tmp = Echo_AES_Round(AES0, AES1, AES2, AES3, W[x]); + tmp.s0 ^= (k0 | x) + 0x400; + W[x] = Echo_AES_Round(AES0, AES1, AES2, AES3, tmp); + } } void BigShiftRows(uint4 *WV) @@ -134,56 +129,12 @@ void BigShiftRows(uint4 *WV) WV[7] = tmp; } -uint NewIdx(uint absidx, uint shiftamt) -{ - return((absidx + (shiftamt << 2)) & 15); -} - -//#ifdef WOLF_X11_CL -#if 0 -void BigMixColumns(uint *W) -#else void BigMixColumns(uint4 *WV) -#endif { - //#ifdef WOLF_X11_CL - #if 0 - for(int y = 0; y < 64; y += 16) - { - #pragma unroll - for(int x = 0; x < 4; ++x) - { - const uint a = W[y + x]; - const uint b = W[y + x + 4]; - const uint c = W[y + x + 8]; - const uint d = W[y + x + 12]; - - const uint ab = a ^ b; - const uint bc = b ^ c; - const uint cd = c ^ d; - - const uint t1 = ab & 0x80808080U; - const uint t2 = bc & 0x80808080U; - const uint t3 = cd & 0x80808080U; - - const uint abx = ((t1 >> 7) * 27) ^ ((ab ^ t1) << 1); - const uint bcx = ((t2 >> 7) * 27) ^ ((bc ^ t2) << 1); - const uint cdx = ((t3 >> 7) * 27) ^ ((cd ^ t3) << 1); - - W[y + x] = abx ^ bc ^ d; - W[y + x + 4] = bcx ^ a ^ cd; - W[y + x + 8] = cdx ^ ab ^ d; - W[y + x + 12] = abx ^ bcx ^ cdx ^ ab ^ c; - } - } - - #else - #pragma unroll for(int x = 0; x < 16; x += 4) { const uint4 a = WV[x], b = WV[x + 1], c = WV[x + 2], d = WV[x + 3]; - //const uint4 a = WV[NewIdx(x + 0, 0)], b = WV[NewIdx(x + 1, 1)], c = WV[NewIdx(x + 2, 2)], d = WV[NewIdx(x + 
3, 3)]; const uint4 ab = a ^ b; const uint4 bc = b ^ c; @@ -202,40 +153,6 @@ void BigMixColumns(uint4 *WV) WV[x + 2] = cdx ^ ab ^ d; WV[x + 3] = abx ^ bcx ^ cdx ^ ab ^ c; } - - /*uint4 a[4], b[4], c[4], d[4]; - - #pragma unroll - for(int x = 0; x < 16; x += 4) - { - a[x >> 2] = WV[NewIdx(x + 0, 0)]; - b[x >> 2] = WV[NewIdx(x + 1, 1)]; - c[x >> 2] = WV[NewIdx(x + 2, 2)]; - d[x >> 2] = WV[NewIdx(x + 3, 3)]; - } - - #pragma unroll - for(int x = 0; x < 16; x += 4) - { - const uint4 ab = a[x >> 2] ^ b[x >> 2]; - const uint4 bc = b[x >> 2] ^ c[x >> 2]; - const uint4 cd = c[x >> 2] ^ d[x >> 2]; - - const uint4 t1 = ab & 0x80808080U; - const uint4 t2 = bc & 0x80808080U; - const uint4 t3 = cd & 0x80808080U; - - const uint4 abx = ((t1 >> 7) * 27) ^ ((ab ^ t1) << 1); - const uint4 bcx = ((t2 >> 7) * 27) ^ ((bc ^ t2) << 1); - const uint4 cdx = ((t3 >> 7) * 27) ^ ((cd ^ t3) << 1); - - WV[x] = abx ^ bc ^ d[x >> 2]; - WV[x + 1] = bcx ^ a[x >> 2] ^ cd; - WV[x + 2] = cdx ^ ab ^ d[x >> 2]; - WV[x + 3] = abx ^ bcx ^ cdx ^ ab ^ c[x >> 2]; - }*/ - - #endif } -#endif +#endif \ No newline at end of file diff --git a/kernel/x16.cl b/kernel/x16.cl index 343740ba..b6af5c76 100644 --- a/kernel/x16.cl +++ b/kernel/x16.cl @@ -108,7 +108,6 @@ ulong ROTL64_2(const uint2 vv, const int r) { return as_ulong((amd_bitalign((vv) #include "wolf-shavite.cl" #include "wolf-aes.cl" #include "simd.cl" -#include "echo.cl" #include "wolf-echo.cl" #include "hamsi.cl" #include "fugue.cl" @@ -2021,8 +2020,8 @@ __kernel void search21(__global hash_t* hashes) barrier(CLK_LOCAL_MEM_FENCE); #pragma unroll 1 - for(uchar i = 0; i < 10; ++i) { - BigSubBytesSmall(AES0, W, i); + for(uint k0 = 0; k0 < 160; k0 += 16) { + BigSubBytesSmall(AES0, W, k0); BigShiftRows(W); BigMixColumns(W); } @@ -2039,78 +2038,37 @@ __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search22(__global ulong* block, __global hash_t* hashes) { uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; + uint offset = get_global_offset(0); + __global hash_t *hash = &(hashes[gid-offset]); - int init = get_local_id(0); - int step = get_local_size(0); + __local uint AES0[256]; + for(int i = get_local_id(0), step = get_local_size(0); i < 256; i += step) + AES0[i] = AES0_C[i]; - for(int i = get_local_id(0); i < 256; i += step) - { - const uint tmp = AES0_C[i]; - AES0[i] = tmp; - AES1[i] = rotate(tmp, 8U); - AES2[i] = rotate(tmp, 16U); - AES3[i] = rotate(tmp, 24U); - } + uint4 W[16]; + + #pragma unroll + for(int i = 0; i < 8; ++i) W[i] = (uint4)(512, 0, 0, 0); + + ((uint16 *)W)[2] = vload16(0, (__global uint *)block); + + W[12] = (uint4)(as_uint2(block[8]).s0, as_uint2(block[8]).s1, as_uint2(block[9]).s0, gid); + W[13] = (uint4)(0x80, 0, 0, 0); + W[14] = (uint4)(0, 0, 0, 0x2000000); + W[15] = (uint4)(0x280, 0, 0, 0); barrier(CLK_LOCAL_MEM_FENCE); - // echo - sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1; - sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71; - Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL; - Vb01 = Vb11 = Vb21 = Vb31 = Vb41 = Vb51 = Vb61 = Vb71 = 0; - - sph_u32 K0 = 80 * 8; - sph_u32 K1 = 0; - sph_u32 K2 = 0; - sph_u32 K3 = 0; - - W00 = Vb00; - W01 = Vb01; - W10 = Vb10; - W11 = Vb11; - W20 = Vb20; - W21 = Vb21; - W30 = Vb30; - W31 = Vb31; - W40 = Vb40; - W41 = Vb41; 
- W50 = Vb50; - W51 = Vb51; - W60 = Vb60; - W61 = Vb61; - W70 = Vb70; - W71 = Vb71; - W80 = block[0]; - W81 = block[1]; - W90 = block[2]; - W91 = block[3]; - WA0 = block[4]; - WA1 = block[5]; - WB0 = block[6]; - WB1 = block[7]; - WC0 = block[8]; - WC1 = (block[9] & 0xffffffff) ^ ((ulong)gid << 32); - WD0 = 0x80; - WD1 = 0; - WE0 = 0; - WE1 = 0x200000000000000UL; - WF0 = 0x280; - WF1 = 0; - - for (unsigned u = 0; u < 10; u ++) - BIG_ROUND; - - hash->h8[0] = block[0] ^ Vb00 ^ W00 ^ W80; - hash->h8[1] = block[1] ^ Vb01 ^ W01 ^ W81; - hash->h8[2] = block[2] ^ Vb10 ^ W10 ^ W90; - hash->h8[3] = block[3] ^ Vb11 ^ W11 ^ W91; - hash->h8[4] = block[4] ^ Vb20 ^ W20 ^ WA0; - hash->h8[5] = block[5] ^ Vb21 ^ W21 ^ WA1; - hash->h8[6] = block[6] ^ Vb30 ^ W30 ^ WB0; - hash->h8[7] = block[7] ^ Vb31 ^ W31 ^ WB1; + #pragma unroll 1 + for(uint k0 = 0; k0 < 160; k0 += 16) { + BigSubBytesSmall80(AES0, W, k0); + BigShiftRows(W); + BigMixColumns(W); + } + + #pragma unroll + for(int i = 0; i < 4; ++i) + vstore4(vload4(i, (__global uint *)block) ^ W[i] ^ W[i + 8] ^ (uint4)(512, 0, 0, 0), i, hash->h4); barrier(CLK_GLOBAL_MEM_FENCE); } @@ -2717,22 +2675,13 @@ __kernel void search29(__global hash_t* hashes) h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0; - n0 ^= h0; - n1 ^= h1; - n2 ^= h2; - n3 ^= h3; - n4 ^= h4; - n5 ^= h5; - n6 ^= h6; - n7 ^= h7; - #pragma unroll 10 for (unsigned r = 0; r < 10; r ++) { sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]); + ROUND_KSCHED(LT, h, tmp, plain_RC[r]); TRANSFER(h, tmp); - ROUND_WENC(plain_T, n, h, tmp); + ROUND_WENC(LT, n, h, tmp); TRANSFER(n, tmp); } @@ -2746,7 +2695,6 @@ __kernel void search29(__global hash_t* hashes) state[7] = n7 ^ (hash->h8[7]); n0 = 0x80; - n1 = n2 = n3 = n4 = n5 = n6 = 0; n7 = 0x2000000000000; h0 = state[0]; @@ -2759,12 +2707,12 @@ __kernel void search29(__global hash_t* hashes) h7 = state[7]; n0 ^= h0; - n1 ^= h1; - n2 ^= h2; - n3 ^= h3; - n4 ^= h4; - n5 ^= h5; - n6 ^= h6; + n1 = h1; + n2 = h2; + n3 = h3; + n4 = h4; + n5 = h5; + n6 = h6; n7 ^= h7; #pragma unroll 10 @@ -2773,7 +2721,7 @@ __kernel void search29(__global hash_t* hashes) ROUND_KSCHED(LT, h, tmp, plain_RC[r]); TRANSFER(h, tmp); - ROUND_WENC(plain_T, n, h, tmp); + ROUND_WENC(LT, n, h, tmp); TRANSFER(n, tmp); } @@ -2835,13 +2783,11 @@ __kernel void search30(__global ulong* block, __global hash_t* hashes) h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0; #pragma unroll 10 - for (unsigned r = 0; r < 10; r ++) - { + for (unsigned r = 0; r < 10; r ++) { sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]); + ROUND_KSCHED(LT, h, tmp, plain_RC[r]); TRANSFER(h, tmp); - ROUND_WENC(plain_T, n, h, tmp); + ROUND_WENC(LT, n, h, tmp); TRANSFER(n, tmp); } @@ -2881,10 +2827,9 @@ __kernel void search30(__global ulong* block, __global hash_t* hashes) #pragma unroll 10 for (unsigned r = 0; r < 10; r ++) { sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(LT, h, tmp, plain_RC[r]); TRANSFER(h, tmp); - ROUND_WENC(plain_T, n, h, tmp); + ROUND_WENC(LT, n, h, tmp); TRANSFER(n, tmp); } diff --git a/kernel/x17.cl b/kernel/x17.cl index ef38b8a4..e168c192 100644 --- a/kernel/x17.cl +++ b/kernel/x17.cl @@ -1103,8 +1103,8 @@ __kernel void search10(__global hash_t* hashes) barrier(CLK_LOCAL_MEM_FENCE); #pragma unroll 1 - for(uchar i = 0; i < 10; ++i) { - BigSubBytesSmall(AES0, W, i); + for(uint k0 = 0; k0 < 160; k0 += 16) { + BigSubBytesSmall(AES0, W, k0); BigShiftRows(W); BigMixColumns(W); } @@ -1418,22 +1418,13 
@@ __kernel void search14(__global hash_t* hashes) h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0; - n0 ^= h0; - n1 ^= h1; - n2 ^= h2; - n3 ^= h3; - n4 ^= h4; - n5 ^= h5; - n6 ^= h6; - n7 ^= h7; - #pragma unroll 10 for (unsigned r = 0; r < 10; r ++) { sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]); + ROUND_KSCHED(LT, h, tmp, plain_RC[r]); TRANSFER(h, tmp); - ROUND_WENC(plain_T, n, h, tmp); + ROUND_WENC(LT, n, h, tmp); TRANSFER(n, tmp); } @@ -1447,7 +1438,6 @@ __kernel void search14(__global hash_t* hashes) state[7] = n7 ^ (hash->h8[7]); n0 = 0x80; - n1 = n2 = n3 = n4 = n5 = n6 = 0; n7 = 0x2000000000000; h0 = state[0]; @@ -1460,12 +1450,12 @@ __kernel void search14(__global hash_t* hashes) h7 = state[7]; n0 ^= h0; - n1 ^= h1; - n2 ^= h2; - n3 ^= h3; - n4 ^= h4; - n5 ^= h5; - n6 ^= h6; + n1 = h1; + n2 = h2; + n3 = h3; + n4 = h4; + n5 = h5; + n6 = h6; n7 ^= h7; #pragma unroll 10 @@ -1474,7 +1464,7 @@ __kernel void search14(__global hash_t* hashes) ROUND_KSCHED(LT, h, tmp, plain_RC[r]); TRANSFER(h, tmp); - ROUND_WENC(plain_T, n, h, tmp); + ROUND_WENC(LT, n, h, tmp); TRANSFER(n, tmp); } diff --git a/kernel/xevan.cl b/kernel/xevan.cl index bf111785..9bb2306e 100644 --- a/kernel/xevan.cl +++ b/kernel/xevan.cl @@ -1438,9 +1438,9 @@ void whirlpoolkernel(__global hash_t *hash, __local sph_u64 LT0[256], __local sp { sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]); + ROUND_KSCHED(LT, h, tmp, plain_RC[r]); TRANSFER(h, tmp); - ROUND_WENC(plain_T, n, h, tmp); + ROUND_WENC(LT, n, h, tmp); TRANSFER(n, tmp); } @@ -1478,9 +1478,9 @@ void whirlpoolkernel(__global hash_t *hash, __local sph_u64 LT0[256], __local sp { sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]); + ROUND_KSCHED(LT, h, tmp, plain_RC[r]); TRANSFER(h, tmp); - ROUND_WENC(plain_T, n, h, tmp); + ROUND_WENC(LT, n, h, tmp); TRANSFER(n, tmp); } @@ -1522,7 +1522,7 @@ void whirlpoolkernel(__global hash_t *hash, __local sph_u64 LT0[256], __local sp ROUND_KSCHED(LT, h, tmp, plain_RC[r]); TRANSFER(h, tmp); - ROUND_WENC(plain_T, n, h, tmp); + ROUND_WENC(LT, n, h, tmp); TRANSFER(n, tmp); }
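
Note (not part of the diff): two arithmetic rewrites above are easy to verify in isolation. The Echo round loops now count k0 = 0, 16, ..., 144 instead of rnd = 0..9, and the XORed constant (rnd << 4) + x + 512 becomes k0 | x | 0x200 (the 80-byte variant uses (k0 | x) + 0x280 in place of + 640); these are equal because k0 is a multiple of 16, x < 16, and k0 + x never reaches 0x200, so the ORs act as adds while dropping the per-iteration shift. BigMixColumns keeps the packed GF(2^8) doubling ((t >> 7) * 27) ^ ((v ^ t) << 1) on four bytes per uint. The plain-C sketch below is a host-side check of both identities, not kernel code; the helper names xtime and xtime_packed are introduced here for illustration only.

/*
 * Host-side sanity check (plain C, not part of the patch):
 *  1. the old and new Echo round-constant forms produce the same values
 *     for every round and every column index x;
 *  2. the packed GF(2^8) doubling used in BigMixColumns matches the
 *     byte-wise xtime() of textbook AES MixColumns.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t xtime(uint8_t b)                 /* textbook GF(2^8) doubling */
{
    return (uint8_t)((b << 1) ^ ((b & 0x80) ? 0x1b : 0x00));
}

static uint32_t xtime_packed(uint32_t v)        /* form used in BigMixColumns */
{
    const uint32_t t = v & 0x80808080u;
    return ((t >> 7) * 27u) ^ ((v ^ t) << 1);
}

int main(void)
{
    /* 1. Round-constant forms agree: old rnd-based add vs. new k0-based OR. */
    for (uint32_t rnd = 0; rnd < 10; ++rnd) {
        const uint32_t k0 = rnd << 4;           /* new loop counter: 0,16,...,144 */
        for (uint32_t x = 0; x < 16; ++x) {
            assert(((rnd << 4) + x + 512) == (k0 | x | 0x200));   /* 64-byte path */
            assert(((rnd << 4) + x + 640) == ((k0 | x) + 0x280)); /* 80-byte path */
        }
    }

    /* 2. Packed doubling equals per-byte xtime (sweep of all-equal-byte words). */
    for (uint64_t w = 0; w <= 0xFFFFFFFFu; w += 0x01010101u) {
        const uint32_t v = (uint32_t)w;
        uint32_t expect = 0;
        for (int i = 0; i < 4; ++i)
            expect |= (uint32_t)xtime((uint8_t)(v >> (8 * i))) << (8 * i);
        assert(xtime_packed(v) == expect);
    }

    printf("echo round constants and packed xtime check out\n");
    return 0;
}

Compiled on the host (e.g. cc -O2 check.c && ./a.out), this runs both checks and exits cleanly, which is why the kernel loops can hoist the shift into the loop counter without changing the hash output.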