-
Notifications
You must be signed in to change notification settings - Fork 21
/
SFMT-neon.h
131 lines (122 loc) · 3.89 KB
/
SFMT-neon.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/**
* @file SFMT-neon.h
* @brief SIMD oriented Fast Mersenne Twister(SFMT) for ARM with 128b NEON
*
* @author Masaki Ota
*
* @note We assume LITTLE ENDIAN in this file
*/
#ifndef SFMT_NEON_H
#define SFMT_NEON_H
#if defined(HAVE_SHA3)
#if !defined(__ARM_FEATURE_SHA3)
/*
* SHA3 missing intrinsics
*/
/**
 * Fallback for the veor3q_u32 intrinsic, used when HAVE_SHA3 is requested
 * but the compiler does not define __ARM_FEATURE_SHA3.
 *
 * Issues a single EOR3 instruction, i.e. the three-way XOR a ^ b ^ c over
 * the full 128-bit vectors (the same result the non-SHA3 EOR3() emulation
 * in this file builds from two veorq_u32 calls).
 *
 * Declared "inline static" like neon_recursion(): a plain (non-static)
 * inline definition in a header provides no external definition under
 * C99/C11 rules, so any call the compiler decides not to inline would
 * fail at link time.
 *
 * @param a first operand
 * @param b second operand
 * @param c third operand
 * @return a ^ b ^ c
 */
inline static uint32x4_t veor3q_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
{
    uint32x4_t r;
    /* NOTE(review): the "eor3.16b" mnemonic-suffix spelling is assembler
     * specific -- confirm the target toolchain accepts it. */
    __asm__ ("eor3.16b %0,%1,%2,%3" :"=w"(r) :"w"(a), "w"(b), "w"(c));
    return r;
}
/**
 * Fallback for the vbcaxq_u32 intrinsic, used when HAVE_SHA3 is requested
 * but the compiler does not define __ARM_FEATURE_SHA3.
 *
 * Issues a single BCAX (bit clear and XOR) instruction: a ^ (b & ~c) over
 * the full 128-bit vectors (the same result the non-SHA3 BCAX() emulation
 * in this file builds from vbicq_u32 + veorq_u32).
 *
 * Declared "inline static" like neon_recursion(): a plain (non-static)
 * inline definition in a header provides no external definition under
 * C99/C11 rules, so any call the compiler decides not to inline would
 * fail at link time.
 *
 * @param a value that is XORed with the masked operand
 * @param b value whose bits are selectively cleared
 * @param c bits set here are cleared from b before the XOR
 * @return a ^ (b & ~c)
 */
inline static uint32x4_t vbcaxq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
{
    uint32x4_t r;
    /* NOTE(review): the "bcax.16b" mnemonic-suffix spelling is assembler
     * specific -- confirm the target toolchain accepts it. */
    __asm__ ("bcax.16b %0,%1,%2,%3" :"=w"(r) :"w"(a), "w"(b), "w"(c));
    return r;
}
#endif
/* HAVE_SHA3 build: the three-operand helpers forward directly to the
 * veor3q_u32 / vbcaxq_u32 intrinsics (or their asm fallbacks above). */
#define EOR3(x,y,z) veor3q_u32(x,y,z)
#define BCAX(x,y,z) vbcaxq_u32(x,y,z)
#else /* NO SHA3 */
/*
 * Emulate the SHA3 three-operand helpers with two basic NEON operations:
 *   EOR3(x,y,z) = x ^ (y ^ z)
 *   BCAX(x,y,z) = x ^ (y & ~z)      (vbicq_u32(y,z) computes y & ~z)
 */
#define EOR3(x,y,z) veorq_u32(x, veorq_u32(y,z))
#define BCAX(x,y,z) veorq_u32(x, vbicq_u32(y,z))
#endif
inline static void neon_recursion(uint32x4_t * r, uint32x4_t a, uint32x4_t b,
                                  uint32x4_t c, uint32x4_t d);

/**
 * Evaluates the recursion formula for one 128-bit word.
 *
 * The value stored through r is the XOR of five terms:
 *   - a itself,
 *   - a moved up by SFMT_SL2 bytes across the whole vector (zero filled),
 *   - b shifted right by SFMT_SR1 bits in each 32-bit lane, ANDed with
 *     {SFMT_MSK1, SFMT_MSK2, SFMT_MSK3, SFMT_MSK4},
 *   - c moved down by SFMT_SR2 bytes across the whole vector (zero filled),
 *   - d shifted left by SFMT_SL1 bits in each 32-bit lane.
 *
 * @param r receives the result
 * @param a a 128-bit part of the internal state array
 * @param b a 128-bit part of the internal state array
 * @param c a 128-bit part of the internal state array
 * @param d a 128-bit part of the internal state array
 */
inline static void neon_recursion(uint32x4_t * r, uint32x4_t a, uint32x4_t b,
                                  uint32x4_t c, uint32x4_t d)
{
    static const uint32x4_t all_zero = {0,0,0,0};
    /* The mask is stored complemented because BCAX clears the bits that are
     * set in its third operand: y & ~(~MSK) == y & MSK. */
    static const uint32x4_t inverted_mask = {~SFMT_MSK1, ~SFMT_MSK2, ~SFMT_MSK3, ~SFMT_MSK4};

/* Takes 16 consecutive bytes, starting at byte index C, from the pair A:B. */
#define rotate_bytes(A, B, C) vreinterpretq_u32_u8(vextq_u8(vreinterpretq_u8_u32(A),vreinterpretq_u8_u32(B),(C)))

    /* Whole-vector byte shifts, built by extracting against a zero vector. */
    const uint32x4_t a_up   = rotate_bytes(all_zero, a, 16-SFMT_SL2);
    const uint32x4_t c_down = rotate_bytes(c, all_zero, SFMT_SR2);

    /* Per-lane bit shifts. */
    const uint32x4_t b_shr = vshrq_n_u32(b, SFMT_SR1);
    const uint32x4_t d_shl = vshlq_n_u32(d, SFMT_SL1);

    uint32x4_t acc = EOR3(a_up, a, c_down);
    acc = BCAX(acc, b_shr, inverted_mask);
    *r = veorq_u32(acc, d_shl);
}
/**
 * Regenerates the whole internal state array in place, replacing each of
 * its SFMT_N 128-bit words with the next pseudorandom word.
 *
 * @param sfmt SFMT internal state
 */
void sfmt_gen_rand_all(sfmt_t * sfmt)
{
    w128_t * const state = sfmt->state;
    /* Index from which "idx + SFMT_POS1" would run past the array end. */
    const int wrap_at = SFMT_N - SFMT_POS1;
    /* The recursion consumes the two most recently produced words; seed
     * them with the last two words of the current state. */
    uint32x4_t older = state[SFMT_N - 2].si;
    uint32x4_t newer = state[SFMT_N - 1].si;
    int idx = 0;

    /* Partner word lies ahead of idx and has not been rewritten yet. */
    while (idx < wrap_at) {
        neon_recursion(&state[idx].si, state[idx].si,
                       state[idx + SFMT_POS1].si, older, newer);
        older = newer;
        newer = state[idx].si;
        idx++;
    }

    /* Partner index wraps around to words already rewritten in this pass. */
    while (idx < SFMT_N) {
        neon_recursion(&state[idx].si, state[idx].si,
                       state[idx - wrap_at].si, older, newer);
        older = newer;
        newer = state[idx].si;
        idx++;
    }
}
/**
 * Fills a caller-supplied array with pseudorandom 128-bit words and leaves
 * the last SFMT_N generated words in the internal state.
 *
 * NOTE(review): array[0..SFMT_N-1] is written unconditionally and the state
 * copy loop runs up to index 2*SFMT_N-size-1, so size is presumably required
 * to be at least SFMT_N -- confirm against the callers.
 *
 * @param sfmt SFMT internal state.
 * @param array a 128-bit array to be filled with pseudorandom numbers.
 * @param size number of 128-bit pseudorandom numbers to be generated.
 */
static void gen_rand_array(sfmt_t * sfmt, w128_t * array, int size)
{
    w128_t * const state = sfmt->state;
    const int wrap_at = SFMT_N - SFMT_POS1;   /* first index whose partner wraps */
    const int tail_from = size - SFMT_N;      /* start of the last SFMT_N outputs */
    const int carried = 2 * SFMT_N - size;    /* state words copied, not recomputed */
    uint32x4_t older = state[SFMT_N - 2].si;
    uint32x4_t newer = state[SFMT_N - 1].si;
    int out = 0;
    int slot;

    /* Phase 1: both inputs still come from the old state. */
    while (out < wrap_at) {
        neon_recursion(&array[out].si, state[out].si,
                       state[out + SFMT_POS1].si, older, newer);
        older = newer;
        newer = array[out].si;
        out++;
    }

    /* Phase 2: the partner word is now one already written to array. */
    while (out < SFMT_N) {
        neon_recursion(&array[out].si, state[out].si,
                       array[out - wrap_at].si, older, newer);
        older = newer;
        newer = array[out].si;
        out++;
    }

    /* Phase 3: every input comes from array; stop SFMT_N words before the end. */
    while (out < tail_from) {
        neon_recursion(&array[out].si, array[out - SFMT_N].si,
                       array[out - wrap_at].si, older, newer);
        older = newer;
        newer = array[out].si;
        out++;
    }

    /* Phase 4: when size < 2*SFMT_N, part of the final state consists of
     * words that were generated above; copy them over. */
    for (slot = 0; slot < carried; slot++) {
        state[slot] = array[slot + tail_from];
    }

    /* Phase 5: generate the remaining words, mirroring each into the state. */
    for (; out < size; out++, slot++) {
        neon_recursion(&array[out].si, array[out - SFMT_N].si,
                       array[out - wrap_at].si, older, newer);
        state[slot].si = array[out].si;
        older = newer;
        newer = array[out].si;
    }
}
#endif