diff --git a/arm/curve25519/bignum_mod_n25519.S b/arm/curve25519/bignum_mod_n25519.S
new file mode 100644
index 0000000000..5a256ed133
--- /dev/null
+++ b/arm/curve25519/bignum_mod_n25519.S
@@ -0,0 +1,186 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Reduce modulo basepoint order, z := x mod n_25519
+// Input x[k]; output z[4]
+//
+//    extern void bignum_mod_n25519
+//     (uint64_t z[static 4], uint64_t k, uint64_t *x);
+//
+// Reduction is modulo the order of the curve25519/edwards25519 basepoint,
+// which is n_25519 = 2^252 + 27742317777372353535851937790883648493
+//
+// Standard ARM ABI: X0 = z, X1 = k, X2 = x
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n25519)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n25519)
+        .text
+        .balign 4
+
+#define z x0
+#define k x1
+#define x x2
+
+#define m0 x3
+#define m1 x4
+#define m2 x5
+#define m3 x6
+
+#define t0 x7
+#define t1 x8
+#define t2 x9
+#define t3 x10
+
+#define n0 x11
+#define n1 x12
+
+// These two are aliased: we only load d when finished with q
+
+#define q x13
+#define d x13
+
+// Loading large constants
+
+#define movbig(nn,n3,n2,n1,n0) \
+        movz nn, n0; \
+        movk nn, n1, lsl #16; \
+        movk nn, n2, lsl #32; \
+        movk nn, n3, lsl #48
+
+S2N_BN_SYMBOL(bignum_mod_n25519):
+
+// If the input is already <= 3 words long, go to a trivial "copy" path
+
+        cmp     k, #4
+        bcc     short
+
+// Otherwise load the top 4 digits (top-down) and reduce k by 4.
+// This [m3;m2;m1;m0] is the initial x where we begin reduction.
+
+        sub     k, k, #4
+        lsl     t0, k, #3
+        add     t0, t0, x
+        ldp     m2, m3, [t0, #16]
+        ldp     m0, m1, [t0]
+
+// Load the complicated two words of n_25519 = 2^252 + [n1; n0]
+
+        movbig( n0, #0x5812, #0x631a, #0x5cf5, #0xd3ed)
+        movbig( n1, #0x14de, #0xf9de, #0xa2f7, #0x9cd6)
+
+// Get the quotient estimate q = floor(x/2^252).
+// Also delete it from m3, in effect doing x' = x - q * 2^252
+
+        lsr     q, m3, #60
+        and     m3, m3, #0x0FFFFFFFFFFFFFFF
+
+// Multiply [t2;t1;t0] = q * [n1;n0]
+
+        mul     t0, n0, q
+        mul     t1, n1, q
+        umulh   t2, n0, q
+        adds    t1, t1, t2
+        umulh   t2, n1, q
+        adc     t2, t2, xzr
+
+// Subtract [m3;m2;m1;m0] = x' - q * [n1;n0] = x - q * n_25519
+
+        subs    m0, m0, t0
+        sbcs    m1, m1, t1
+        sbcs    m2, m2, t2
+        sbcs    m3, m3, xzr
+
+// If this borrows (CF = 0 because of inversion), add back n_25519.
+// The masked n3 digit exploits the fact that bit 60 of n0 is set.
+
+        csel    t0, n0, xzr, cc
+        csel    t1, n1, xzr, cc
+        adds    m0, m0, t0
+        adcs    m1, m1, t1
+        and     t0, t0, #0x1000000000000000
+        adcs    m2, m2, xzr
+        adc     m3, m3, t0
+
+// Now do (k-4) iterations of 5->4 word modular reduction. Each one
+// is similar to the sequence above except for the more refined quotient
+// estimation process.
+
+        cbz     k, writeback
+
+loop:
+
+// Assume that the new 5-digit x is 2^64 * previous_x + next_digit.
+// Get the quotient estimate q = min (floor(x/2^252)) (2^64 - 1)
+// and first compute x' = x - 2^252 * q.
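+// (Concretely, the extr below picks out the low 64 bits of [m3;m2] >> 60,
+// i.e. of floor(x/2^252), which can no longer be assumed to fit in 64 bits
+// now that x spans five digits. The sub therefore backs the estimate off
+// by floor(m3/2^60), and the and/add pair moves the top nibble of m3 into
+// the just-cleared top nibble of m2, in effect leaving that multiple of
+// 2^252 in the remainder rather than over-counting it in the quotient.)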
+
+        extr    q, m3, m2, #60
+        and     m2, m2, #0x0FFFFFFFFFFFFFFF
+        sub     q, q, m3, lsr #60
+        and     m3, m3, #0xF000000000000000
+        add     m2, m2, m3
+
+// Multiply [t2;t1;t0] = q * [n1;n0]
+
+        mul     t0, n0, q
+        mul     t1, n1, q
+        umulh   t2, n0, q
+        adds    t1, t1, t2
+        umulh   t2, n1, q
+        adc     t2, t2, xzr
+
+// Decrement k and load the next digit (note that d aliases to q)
+
+        sub     k, k, #1
+        ldr     d, [x, k, lsl #3]
+
+// Subtract [t3;t2;t1;t0] = x' - q * [n1;n0] = x - q * n_25519
+
+        subs    t0, d, t0
+        sbcs    t1, m0, t1
+        sbcs    t2, m1, t2
+        sbcs    t3, m2, xzr
+
+// If this borrows (CF = 0 because of inversion), add back n_25519.
+// The masked n3 digit exploits the fact that bit 60 of n1 is set.
+
+        csel    m0, n0, xzr, cc
+        csel    m1, n1, xzr, cc
+        adds    m0, t0, m0
+        and     m3, m1, #0x1000000000000000
+        adcs    m1, t1, m1
+        adcs    m2, t2, xzr
+        adc     m3, t3, m3
+
+        cbnz    k, loop
+
+// Finally write back [m3;m2;m1;m0] and return
+
+writeback:
+        stp     m0, m1, [z]
+        stp     m2, m3, [z, #16]
+        ret
+
+// Short case: just copy the input with zero-padding
+
+short:
+        mov     m0, xzr
+        mov     m1, xzr
+        mov     m2, xzr
+        mov     m3, xzr
+
+        cbz     k, writeback
+        ldr     m0, [x]
+        subs    k, k, #1
+        beq     writeback
+        ldr     m1, [x, #8]
+        subs    k, k, #1
+        beq     writeback
+        ldr     m2, [x, #16]
+        b       writeback
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/arm/curve25519/curve25519_x25519.S b/arm/curve25519/curve25519_x25519.S
index 0b7ec7a111..7514dac33a 100644
--- a/arm/curve25519/curve25519_x25519.S
+++ b/arm/curve25519/curve25519_x25519.S
@@ -850,356 +850,1046 @@ curve25519_x25519_scalarloop:
         mul_p25519(zn,p,e)

// The projective result of the scalar multiplication is now (xn,zn).

-// First set up the constant sn = 2^255 - 19 for the modular inverse.
-
-        mov     x0, #-19
-        mov     x1, #-1
-        mov     x2, #0x7fffffffffffffff
-        stp     x0, x1, [sn]
-        stp     x1, x2, [sn+16]
-
-// Prepare to call the modular inverse function to get zm = 1/zn
-
-        mov     x0, #4
-        add     x1, zm
-        add     x2, zn
-        add     x3, sn
-        add     x4, p
-
-// Inline copy of bignum_modinv, identical except for stripping out the
-// prologue and epilogue saving and restoring registers and the initial
-// test for k = 0 (which is trivially false here since k = 4). For more
-// details and explanations see "arm/generic/bignum_modinv.S".
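A quick reference model for the new bignum_mod_n25519 above (an illustrative Python sketch, not part of the patch): it computes the same reduction both directly on a big integer and in the top-down, one-digit-at-a-time fashion of the assembly loop, so the two must agree.

```python
# Hypothetical reference model for bignum_mod_n25519; names are illustrative.
N25519 = 2**252 + 27742317777372353535851937790883648493  # basepoint order
MASK64 = 2**64 - 1

def mod_n25519_direct(x_digits):
    """x_digits: little-endian 64-bit digits of x; returns 4 digits of x mod n."""
    x = sum(d << (64 * i) for i, d in enumerate(x_digits))
    r = x % N25519
    return [(r >> (64 * i)) & MASK64 for i in range(4)]

def mod_n25519_wordwise(x_digits):
    """Same result, absorbing one digit per step from the top, as the loop does."""
    r = 0
    for d in reversed(x_digits):
        r = ((r << 64) | d) % N25519
    return [(r >> (64 * i)) & MASK64 for i in range(4)]

assert mod_n25519_direct([5, 4, 3, 2, 1]) == mod_n25519_wordwise([5, 4, 3, 2, 1])
```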
- - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_wmontend -curve25519_x25519_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_wmontloop -curve25519_x25519_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, 
x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_zmontend -curve25519_x25519_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_zmontloop -curve25519_x25519_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_negskip1 -curve25519_x25519_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_negloop1 -curve25519_x25519_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_negskip2 -curve25519_x25519_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_negloop2 -curve25519_x25519_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519_zfliploop: - ldr x11, [x3, 
x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519_outerloop +// Prepare to call the modular inverse function to get xm = 1/zn + + add x0, xm + add x1, zn + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, xm and zn. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519_invmidloop +curve25519_x25519_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, 
x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr 
x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, 
#21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, 
#0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge 
+ cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // Since we eventually want to return 0 when the result is the point at // infinity, we force xn = 0 whenever zn = 0. This avoids building in a @@ -1222,7 +1912,7 @@ curve25519_x25519_zfliploop: // Now the result is xn * (1/zn), fully reduced modulo p. 
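Functionally, everything from here to the end amounts to res = xn * zn^(-1) mod p with p = 2^255 - 19, with the point-at-infinity case (zn = 0) forced to return 0. A minimal Python sketch of that contract (illustrative only; the function name is hypothetical, and the code above uses the inlined bignum_inv_p25519 rather than a Fermat-style pow):

```python
P25519 = 2**255 - 19

def x25519_finalize(xn, zn):
    # Point at infinity: the surrounding code zeroes xn when zn is zero,
    # so the overall result is 0.
    if zn % P25519 == 0:
        return 0
    # Otherwise res = xn * (1/zn) mod p, here via Fermat's little theorem.
    return (xn * pow(zn, P25519 - 2, P25519)) % P25519
```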
- mul_p25519(resx,xn,zm) + mul_p25519(resx,xn,xm) // Restore stack and registers diff --git a/arm/curve25519/curve25519_x25519_alt.S b/arm/curve25519/curve25519_x25519_alt.S index 3a521a602f..261b82c90a 100644 --- a/arm/curve25519/curve25519_x25519_alt.S +++ b/arm/curve25519/curve25519_x25519_alt.S @@ -634,356 +634,1046 @@ curve25519_x25519_alt_scalarloop: mul_p25519(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// First set up the constant sn = 2^255 - 19 for the modular inverse. - - mov x0, #-19 - mov x1, #-1 - mov x2, #0x7fffffffffffffff - stp x0, x1, [sn] - stp x1, x2, [sn+16] - -// Prepare to call the modular inverse function to get zm = 1/zn - - mov x0, #4 - add x1, zm - add x2, zn - add x3, sn - add x4, p - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". - - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519_alt_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_alt_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519_alt_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_alt_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_alt_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519_alt_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519_alt_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_alt_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - 
mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_alt_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_alt_wmontend -curve25519_x25519_alt_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_wmontloop -curve25519_x25519_alt_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519_alt_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519_alt_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_alt_zmontend -curve25519_x25519_alt_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_zmontloop -curve25519_x25519_alt_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519_alt_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519_alt_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_alt_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_alt_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_alt_negskip1 -curve25519_x25519_alt_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - 
extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_alt_negloop1 -curve25519_x25519_alt_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_alt_negskip2 -curve25519_x25519_alt_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_alt_negloop2 -curve25519_x25519_alt_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519_alt_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519_alt_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519_alt_outerloop +// Prepare to call the modular inverse function to get xm = 1/zn + + add x0, xm + add x1, zn + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, xm and zn. 
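For orientation: bignum_inv_p25519 computes the inverse with a divstep (safegcd-style) iteration rather than by exponentiation, and the batched signed matrix updates below are its constant-time word-level form. As a conceptual, variable-time sketch of the underlying recurrence only (a bit-at-a-time toy, nothing like the production algorithm):

```python
# Conceptual divstep-based inverse mod p = 2^255 - 19; variable-time sketch.
P = 2**255 - 19
INV2 = (P + 1) // 2                        # 1/2 mod P

def inv_p25519(x):
    delta, f, g = 1, P, x % P
    d, e = 0, 1                            # invariants: f == d*x, g == e*x (mod P)
    while g != 0:
        if delta > 0 and g & 1:            # "swap" divstep
            delta, f, g = 1 - delta, g, (g - f) >> 1
            d, e = e, ((e - d) * INV2) % P
        elif g & 1:                        # add f, then halve
            delta, f, g = 1 + delta, f, (g + f) >> 1
            d, e = d, ((e + d) * INV2) % P
        else:                              # g even: just halve
            delta, f, g = 1 + delta, f, g >> 1
            d, e = d, (e * INV2) % P
    return d % P if f == 1 else (-d) % P   # f ends at +/-gcd(P, x) = +/-1

assert inv_p25519(12345) * 12345 % P == 1
```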
+ + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519_alt_invmidloop +curve25519_x25519_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, 
x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg 
x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + 
asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, 
#0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + 
add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // Since we eventually want to return 0 when the result is the point at // infinity, we force xn = 0 whenever zn = 0. This avoids building in a @@ -1006,7 +1696,7 @@ curve25519_x25519_zfliploop: // Now the result is xn * (1/zn), fully reduced modulo p. - mul_p25519(resx,xn,zm) + mul_p25519(resx,xn,xm) // Restore stack and registers diff --git a/arm/curve25519/curve25519_x25519_byte.S b/arm/curve25519/curve25519_x25519_byte.S index 6162a38082..7837118421 100644 --- a/arm/curve25519/curve25519_x25519_byte.S +++ b/arm/curve25519/curve25519_x25519_byte.S @@ -968,356 +968,1046 @@ curve25519_x25519_byte_scalarloop: mul_p25519(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// First set up the constant sn = 2^255 - 19 for the modular inverse.
- - mov x0, #-19 - mov x1, #-1 - mov x2, #0x7fffffffffffffff - stp x0, x1, [sn] - stp x1, x2, [sn+16] - -// Prepare to call the modular inverse function to get zm = 1/zn - - mov x0, #4 - add x1, zm - add x2, zn - add x3, sn - add x4, p - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". - - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519_byte_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_byte_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519_byte_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_byte_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_byte_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519_byte_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519_byte_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_byte_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_byte_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz 
x11, curve25519_x25519_byte_wmontend -curve25519_x25519_byte_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_wmontloop -curve25519_x25519_byte_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519_byte_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519_byte_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_byte_zmontend -curve25519_x25519_byte_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_zmontloop -curve25519_x25519_byte_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519_byte_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519_byte_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_byte_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_byte_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_byte_negskip1 -curve25519_x25519_byte_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_byte_negloop1 -curve25519_x25519_byte_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_byte_negskip2 -curve25519_x25519_byte_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, 
x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_byte_negloop2 -curve25519_x25519_byte_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519_byte_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519_byte_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519_byte_outerloop +// Prepare to call the modular inverse function to get xm = 1/zn + + add x0, xm + add x1, zn + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, xm and zn. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519_byte_invmidloop +curve25519_x25519_byte_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, 
x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, 
#80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519_byte_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr 
x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, 
x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne 
+ ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519_byte_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov 
x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // Since we eventually want to return 0 when the result is the point at // infinity, we force xn = 0 whenever zn = 0. This avoids building in a @@ -1340,7 +2030,7 @@ curve25519_x25519_byte_zfliploop: // Now the result is xn * (1/zn), fully reduced modulo p. - mul_p25519(zn,xn,zm) + mul_p25519(zn,xn,xm) ldp x10, x11, [zn] strb w10, [resx] diff --git a/arm/curve25519/curve25519_x25519_byte_alt.S b/arm/curve25519/curve25519_x25519_byte_alt.S index f59e611467..6523822d2c 100644 --- a/arm/curve25519/curve25519_x25519_byte_alt.S +++ b/arm/curve25519/curve25519_x25519_byte_alt.S @@ -752,356 +752,1046 @@ curve25519_x25519_byte_alt_scalarloop: mul_p25519(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// First set up the constant sn = 2^255 - 19 for the modular inverse. - - mov x0, #-19 - mov x1, #-1 - mov x2, #0x7fffffffffffffff - stp x0, x1, [sn] - stp x1, x2, [sn+16] - -// Prepare to call the modular inverse function to get zm = 1/zn - - mov x0, #4 - add x1, zm - add x2, zn - add x3, sn - add x4, p - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". - - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519_byte_alt_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_byte_alt_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519_byte_alt_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_byte_alt_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_byte_alt_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519_byte_alt_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, 
cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519_byte_alt_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_byte_alt_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_byte_alt_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_byte_alt_wmontend -curve25519_x25519_byte_alt_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_wmontloop -curve25519_x25519_byte_alt_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519_byte_alt_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519_byte_alt_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_byte_alt_zmontend -curve25519_x25519_byte_alt_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_zmontloop -curve25519_x25519_byte_alt_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519_byte_alt_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519_byte_alt_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr 
-curve25519_x25519_byte_alt_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_byte_alt_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_byte_alt_negskip1 -curve25519_x25519_byte_alt_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_byte_alt_negloop1 -curve25519_x25519_byte_alt_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_byte_alt_negskip2 -curve25519_x25519_byte_alt_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_byte_alt_negloop2 -curve25519_x25519_byte_alt_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519_byte_alt_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519_byte_alt_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519_byte_alt_outerloop +// Prepare to call the modular inverse function to get xm = 1/zn + + add x0, xm + add x1, zn + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, xm and zn. 
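+// In outline (a sketch, not part of the formal spec of the routine): the +// inlined code computes a divstep-based inverse rather than a Fermat +// exponentiation. Register x21 counts the ten outer iterations, each pass +// through curve25519_x25519_byte_alt_invmidloop derives signed update +// coefficients from the low 20-bit windows of the running f and g values +// (kept at [sp] and [sp, #32]), and the multiply blocks then apply those +// coefficients to the accumulators at [sp, #64] and [sp, #96], with a +// final sign correction and reduction modulo 2^255 - 19 before writeback.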
+ + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519_byte_alt_invmidloop +curve25519_x25519_byte_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + 
adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519_byte_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, 
xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, 
#0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, 
#0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, 
#0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // Since we eventually want to return 0 when the result is the point at // infinity, we force xn = 0 whenever zn = 0. This avoids building in a @@ -1124,7 +1814,7 @@ curve25519_x25519_zfliploop: // Now the result is xn * (1/zn), fully reduced modulo p. mul_p25519(zn,xn,zm) ldp x10, x11, [zn] strb w10, [resx] diff --git a/arm/curve25519/curve25519_x25519base.S b/arm/curve25519/curve25519_x25519base.S index 030fa08e24..b9c3b8e34a 100644 --- a/arm/curve25519/curve25519_x25519base.S +++ b/arm/curve25519/curve25519_x25519base.S @@ -907,360 +907,1058 @@ curve25519_x25519base_scalarloop: // // First the addition and subtraction: - add_twice4(y_3,x_3,w_3) - sub_twice4(z_3,x_3,w_3) + add_twice4(t1,x_3,w_3) + sub_twice4(t2,x_3,w_3) -// Prepare to call the modular inverse function to get x_3 = 1/z_3 +// Prepare to call the modular inverse function to get t0 = 1/t2 // Note that this works for the weakly normalized z_3 equally well. // The non-coprime case z_3 == 0 (mod p_25519) cannot arise anyway.
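As a reference model of what this inversion step computes (not part of the patch; the names P25519 and inv_p25519 are illustrative), a minimal Python sketch shows why a weakly normalized input, i.e. any 256-bit representative of the residue class, inverts to the same canonical value:

    P25519 = 2**255 - 19  # the field prime p_25519

    def inv_p25519(z):
        # Modular inverse over GF(p_25519); pow(x, -1, m) needs Python >= 3.8.
        # The divstep-based assembly computes the same (unique) value.
        # z == 0 (mod p_25519) would raise ValueError here, but as the
        # comment above notes, that non-coprime case cannot arise.
        return pow(z % P25519, -1, P25519)

    z = 0x1234567890abcdef1234567890abcdef
    assert inv_p25519(z) == inv_p25519(z + P25519)  # weak normalization is harmless
    assert (z * inv_p25519(z)) % P25519 == 1        # it really is 1/z mod p_25519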
- mov x0, 4 - add x1, x_3 - add x2, z_3 - adr x3, curve25519_x25519base_p_25519 - add x4, tmpspace - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". - - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519base_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519base_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519base_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519base_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_wmontend -curve25519_x25519base_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, 
x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_wmontloop -curve25519_x25519base_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519base_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_zmontend -curve25519_x25519base_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_zmontloop -curve25519_x25519base_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519base_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_negskip1 -curve25519_x25519base_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_negloop1 -curve25519_x25519base_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_negskip2 -curve25519_x25519base_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, 
curve25519_x25519base_negloop2 -curve25519_x25519base_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519base_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519base_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519base_outerloop + add x0, t0 + add x1, t2 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, t0, t1, t2. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519base_invmidloop +curve25519_x25519base_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, 
x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + 
adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519base_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, 
x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr 
x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel 
x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519base_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr 
+ sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // The final result is (X + T) / (X - T) // This is the only operation in the whole computation that // fully reduces modulo p_25519 since now we want the canonical // answer as output. - mul_p25519(resx,y_3,x_3) + mul_p25519(resx,t1,t0) // Restore stack and registers @@ -1279,14 +1977,6 @@ curve25519_x25519base_zfliploop: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -curve25519_x25519base_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 2^254 * G and (2^254 + 8) * G in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/arm/curve25519/curve25519_x25519base_alt.S b/arm/curve25519/curve25519_x25519base_alt.S index 97d2e9c54f..22de69f4c3 100644 --- a/arm/curve25519/curve25519_x25519base_alt.S +++ b/arm/curve25519/curve25519_x25519base_alt.S @@ -749,360 +749,1058 @@ curve25519_x25519base_alt_scalarloop: // // First the addition and subtraction: - add_twice4(y_3,x_3,w_3) - sub_twice4(z_3,x_3,w_3) + add_twice4(t1,x_3,w_3) + sub_twice4(t2,x_3,w_3) -// Prepare to call the modular inverse function to get x_3 = 1/z_3 +// Prepare to call the modular inverse function to get t0 = 1/t2 // Note that this works for the weakly normalized z_3 equally well. // The non-coprime case z_3 == 0 (mod p_25519) cannot arise anyway. - mov x0, 4 - add x1, x_3 - add x2, z_3 - adr x3, curve25519_x25519base_alt_p_25519 - add x4, tmpspace - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". 
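The closing computation above, resx = t1 * t0 = (X + T)/(X - T) mod p_25519, is repeated verbatim in this _alt variant. As a reference model (not part of the patch; final_affine_x is an illustrative name), a short Python sketch captures why only that final multiplication needs to be fully reduced, assuming X - T is nonzero mod p_25519 as the surrounding comments guarantee:

    P25519 = 2**255 - 19

    def final_affine_x(X, T):
        # t1 and t2 come from add_twice4/sub_twice4 and need only be correct
        # mod p_25519, not canonical; reducing them here models that freedom.
        t1 = (X + T) % P25519                    # t1 = X + T
        t0 = pow((X - T) % P25519, -1, P25519)   # t0 = 1/t2 = 1/(X - T)
        # mul_p25519 is the single fully reducing step: result in [0, p_25519)
        return (t1 * t0) % P25519

    X, T = 2**200 + 12345, 2**190 + 67890
    # Different representatives of the same residue classes, same canonical answer:
    assert final_affine_x(X, T) == final_affine_x(X + P25519, T + P25519)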
- - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519base_alt_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_alt_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519base_alt_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_alt_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_alt_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519base_alt_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519base_alt_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_alt_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_alt_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_alt_wmontend -curve25519_x25519base_alt_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_wmontloop -curve25519_x25519base_alt_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, 
x15, lsl #3] - negs x10, xzr -curve25519_x25519base_alt_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519base_alt_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_alt_zmontend -curve25519_x25519base_alt_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_zmontloop -curve25519_x25519base_alt_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_alt_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519base_alt_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_alt_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_alt_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_alt_negskip1 -curve25519_x25519base_alt_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_alt_negloop1 -curve25519_x25519base_alt_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_alt_negskip2 -curve25519_x25519base_alt_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_alt_negloop2 -curve25519_x25519base_alt_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519base_alt_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr 
x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519base_alt_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519base_alt_outerloop + add x0, t0 + add x1, t2 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, t0, t1, t2. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519base_alt_invmidloop +curve25519_x25519base_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, 
[sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr 
x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519base_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + 
add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, 
#0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 
+ tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519base_alt_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // The final result is (X + T) / (X - T) // This is the only operation in the whole 
computation that
// fully reduces modulo p_25519 since now we want the canonical
// answer as output.

-        mul_p25519(resx,y_3,x_3)
+        mul_p25519(resx,t1,t0)

// Restore stack and registers

@@ -1121,14 +1819,6 @@ curve25519_x25519base_alt_zfliploop:
// .section .rodata
// ****************************************************************************

-// The modulus p_25519 = 2^255 - 19, for the modular inverse
-
-curve25519_x25519base_alt_p_25519:
-        .quad   0xffffffffffffffed
-        .quad   0xffffffffffffffff
-        .quad   0xffffffffffffffff
-        .quad   0x7fffffffffffffff
-
// 2^254 * G and (2^254 + 8) * G in extended-projective coordinates
// but with Z = 1 assumed and hence left out, so they are (X,Y,T) only.

diff --git a/arm/curve25519/curve25519_x25519base_byte.S b/arm/curve25519/curve25519_x25519base_byte.S
index b6d95f58c9..aecc693c66 100644
--- a/arm/curve25519/curve25519_x25519base_byte.S
+++ b/arm/curve25519/curve25519_x25519base_byte.S
@@ -966,360 +966,1058 @@ curve25519_x25519base_byte_scalarloop:
//
// First the addition and subtraction:

-        add_twice4(y_3,x_3,w_3)
-        sub_twice4(z_3,x_3,w_3)
+        add_twice4(t1,x_3,w_3)
+        sub_twice4(t2,x_3,w_3)

-// Prepare to call the modular inverse function to get x_3 = 1/z_3
+// Prepare to call the modular inverse function to get t0 = 1/t2
// Note that this works for the weakly normalized z_3 equally well.
// The non-coprime case z_3 == 0 (mod p_25519) cannot arise anyway.

-        mov x0, 4
-        add x1, x_3
-        add x2, z_3
-        adr x3, curve25519_x25519base_byte_p_25519
-        add x4, tmpspace
-
-// Inline copy of bignum_modinv, identical except for stripping out the
-// prologue and epilogue saving and restoring registers and the initial
-// test for k = 0 (which is trivially false here since k = 4). For more
-// details and explanations see "arm/generic/bignum_modinv.S".
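As a cross-check on what the inlined inverse computes, here is a minimal
Python sketch (the helper name inv_p25519 is illustrative only, not part of
these sources): the routine's net effect is t0 = t2^-1 mod p_25519, and a
weakly normalized input is acceptable because the value is reduced up front.

    # Minimal sketch, assuming only that the inlined routine computes a
    # modular inverse with respect to p_25519 = 2^255 - 19.
    p_25519 = 2**255 - 19

    def inv_p25519(t2):
        # Any representative of the residue class works ("weakly normalized"
        # input, as the comment above notes). pow(x, -1, m) needs Python >= 3.8
        # and raises ValueError when x is not invertible, the non-coprime case
        # that the comment above says cannot arise here.
        return pow(t2 % p_25519, -1, p_25519)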
- - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519base_byte_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_byte_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519base_byte_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_byte_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_byte_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519base_byte_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519base_byte_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_byte_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_byte_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_byte_wmontend -curve25519_x25519base_byte_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_wmontloop -curve25519_x25519base_byte_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - 
str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_byte_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519base_byte_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_byte_zmontend -curve25519_x25519base_byte_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_zmontloop -curve25519_x25519base_byte_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_byte_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519base_byte_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_byte_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_byte_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_byte_negskip1 -curve25519_x25519base_byte_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_byte_negloop1 -curve25519_x25519base_byte_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_byte_negskip2 -curve25519_x25519base_byte_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_byte_negloop2 -curve25519_x25519base_byte_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 
-curve25519_x25519base_byte_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519base_byte_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519base_byte_outerloop + add x0, t0 + add x1, t2 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, t0, t1, t2. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519base_byte_invmidloop +curve25519_x25519base_byte_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + 
mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 
+ smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519base_byte_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + 
ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add 
x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg 
x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519base_byte_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // 
The final result is (X + T) / (X - T)
// This is the only operation in the whole computation that
// fully reduces modulo p_25519 since now we want the canonical
// answer as output.

-        mul_p25519(x_1,y_3,x_3)
+        mul_p25519(x_1,t1,t0)

        ldp x10, x11, [x_1]
        strb w10, [resx]

@@ -1405,14 +2103,6 @@ curve25519_x25519base_byte_zfliploop:
// .section .rodata
// ****************************************************************************

-// The modulus p_25519 = 2^255 - 19, for the modular inverse
-
-curve25519_x25519base_byte_p_25519:
-        .quad   0xffffffffffffffed
-        .quad   0xffffffffffffffff
-        .quad   0xffffffffffffffff
-        .quad   0x7fffffffffffffff
-
// 2^254 * G and (2^254 + 8) * G in extended-projective coordinates
// but with Z = 1 assumed and hence left out, so they are (X,Y,T) only.

diff --git a/arm/curve25519/curve25519_x25519base_byte_alt.S b/arm/curve25519/curve25519_x25519base_byte_alt.S
index 6e61199732..9c9dca518c 100644
--- a/arm/curve25519/curve25519_x25519base_byte_alt.S
+++ b/arm/curve25519/curve25519_x25519base_byte_alt.S
@@ -805,363 +805,1059 @@ curve25519_x25519base_byte_alt_scalarloop:
// the Montgomery point at infinity, and Edwards (0,-1) which maps to
// Montgomery (0,0) [this is the 2-torsion point] are both by definition
// mapped to 0 by the X coordinate mapping used to define curve25519.
-//
-// First the addition and subtraction:

-        add_twice4(y_3,x_3,w_3)
-        sub_twice4(z_3,x_3,w_3)
+        add_twice4(t1,x_3,w_3)
+        sub_twice4(t2,x_3,w_3)

-// Prepare to call the modular inverse function to get x_3 = 1/z_3
+// Prepare to call the modular inverse function to get t0 = 1/t2
// Note that this works for the weakly normalized z_3 equally well.
// The non-coprime case z_3 == 0 (mod p_25519) cannot arise anyway.

-        mov x0, 4
-        add x1, x_3
-        add x2, z_3
-        adr x3, curve25519_x25519base_byte_alt_p_25519
-        add x4, tmpspace
-
-// Inline copy of bignum_modinv, identical except for stripping out the
-// prologue and epilogue saving and restoring registers and the initial
-// test for k = 0 (which is trivially false here since k = 4). For more
-// details and explanations see "arm/generic/bignum_modinv.S".
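The bignum_inv_p25519 code inlined in place of bignum_modinv batches many
"divstep" iterations (the long tst/csel/cneg/asr chains) and then applies each
batch to the full-width operands as an accumulated 2x2 integer matrix (the
mul/madd and mneg/msub blocks). As a hedged Python sketch, assuming the
standard Bernstein-Yang formulation that this style of code follows (the
assembly appears to track a scaled version of delta in x1/x22), a single step
looks like:

    # One textbook divstep on (delta, f, g), with f odd as an invariant.
    # Iterating enough steps drives g to 0, while the accumulated transition
    # matrix yields the modular inverse up to a fixed scaling factor.
    def divstep(delta, f, g):
        if delta > 0 and g & 1:
            return 1 - delta, g, (g - f) // 2
        return 1 + delta, f, (g + (g & 1) * f) // 2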
- - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519base_byte_alt_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_byte_alt_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519base_byte_alt_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_byte_alt_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_byte_alt_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519base_byte_alt_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519base_byte_alt_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_byte_alt_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_byte_alt_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_byte_alt_wmontend -curve25519_x25519base_byte_alt_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_wmontloop -curve25519_x25519base_byte_alt_wmontend: - adcs x16, 
x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_byte_alt_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519base_byte_alt_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_byte_alt_zmontend -curve25519_x25519base_byte_alt_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_zmontloop -curve25519_x25519base_byte_alt_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_byte_alt_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519base_byte_alt_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_byte_alt_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_byte_alt_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_byte_alt_negskip1 -curve25519_x25519base_byte_alt_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_byte_alt_negloop1 -curve25519_x25519base_byte_alt_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_byte_alt_negskip2 -curve25519_x25519base_byte_alt_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_byte_alt_negloop2 -curve25519_x25519base_byte_alt_negskip2: - extr x15, x14, 
x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519base_byte_alt_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519base_byte_alt_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519base_byte_alt_outerloop + add x0, t0 + add x1, t2 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, t0, t1, t2. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519base_byte_alt_invmidloop +curve25519_x25519base_byte_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh 
x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + 
extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519base_byte_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge 
+ cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, 
#1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, 
x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519base_byte_alt_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr 
+ sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // The final result is (X + T) / (X - T) // This is the only operation in the whole computation that // fully reduces modulo p_25519 since now we want the canonical // answer as output. - mul_p25519(x_1,y_3,x_3) + mul_p25519(x_1,t1,t0) ldp x10, x11, [x_1] strb w10, [resx] @@ -1229,6 +1925,7 @@ curve25519_x25519base_byte_alt_zfliploop: lsr x13, x13, #8 strb w13, [resx+31] + // Restore stack and registers add sp, sp, #NSPACE @@ -1246,14 +1943,6 @@ curve25519_x25519base_byte_alt_zfliploop: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -curve25519_x25519base_byte_alt_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 2^254 * G and (2^254 + 8) * G in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/arm/curve25519/edwards25519_decode.S b/arm/curve25519/edwards25519_decode.S new file mode 100644 index 0000000000..9161768db7 --- /dev/null +++ b/arm/curve25519/edwards25519_decode.S @@ -0,0 +1,700 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Decode compressed 256-bit form of edwards25519 point +// Input c[32] (bytes); output function return and z[8] +// +// extern uint64_t edwards25519_decode(uint64_t z[static 8],uint8_t c[static 32]); +// +// This interprets the input byte string as a little-endian number +// representing a point (x,y) on the edwards25519 curve, encoded as +// 2^255 * x_0 + y where x_0 is the least significant bit of x. It +// returns the full pair of coordinates x (at z) and y (at z+4). The +// return code is 0 for success and 1 for failure, which means that +// the input does not correspond to the encoding of any edwards25519 +// point. 
This can happen for three reasons, where y = the lowest +// 255 bits of the input: +// +// * y >= p_25519 +// Input y coordinate is not reduced +// * (y^2 - 1) * (1 + d_25519 * y^2) has no modular square root +// There is no x such that (x,y) is on the curve +// * y^2 = 1 and top bit of input is set +// Cannot be the canonical encoding of (0,1) or (0,-1) +// +// Standard ARM ABI: X0 = z, X1 = c +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_decode) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_decode) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define y sp, #0 +#define s sp, #(4*N) +#define t sp, #(8*N) +#define u sp, #(12*N) +#define v sp, #(16*N) +#define w sp, #(20*N) + +// Other temporary variables in register + +#define res x19 +#define sgnbit x20 +#define badun x21 + +// Total size to reserve on the stack + +#define NSPACE #(24*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0; \ + movk nn, n1, lsl #16; \ + movk nn, n2, lsl #32; \ + movk nn, n3, lsl #48 + +// Macros wrapping up calls to the local subroutines + +#define mulp(dest,src1,src2) \ + add x0, dest; \ + add x1, src1; \ + add x2, src2; \ + bl edwards25519_decode_mul_p25519 + +#define nsqr(dest,n,src) \ + add x0, dest; \ + mov x1, n; \ + add x2, src; \ + bl edwards25519_decode_nsqr_p25519 + +S2N_BN_SYMBOL(edwards25519_decode): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x30, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Load the inputs, using byte operations in case of big-endian setting. +// Let y be the lowest 255 bits of the input and sgnbit the desired parity. +// If y >= p_25519 then already flag the input as invalid (badun = 1). 
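+// (Editorial note: the field-level computation carried out by the
+// straight-line code below can be summarized by a small Python model.
+// This is a sketch for exposition only, with hypothetical names; the
+// constants are the standard p_25519, d_25519 and j_25519 = sqrt(-1).
+//
+//     P = 2**255 - 19
+//     D = 37095705934669439343138083508754565189542113879843219016388785533085940283555
+//     J = pow(2, (P - 1) // 4, P)   # sqrt(-1) mod P, as 2 is a non-residue
+//
+//     def decode(c: bytes):
+//         n = int.from_bytes(c, 'little')
+//         sgnbit, y = n >> 255, n & (2**255 - 1)
+//         badun = y >= P                            # reason 1: y not reduced
+//         u, v = (y * y - 1) % P, (1 + D * y * y) % P
+//         s = u * pow(u * v, (P - 5) // 8, P) % P   # candidate sqrt(u/v)
+//         if (s * s * v - u) % P != 0:
+//             s = s * J % P                         # other candidate s * j_25519
+//         if (s * s * v - u) % P != 0:
+//             badun = True                          # reason 2: no square root
+//         if s == 0 and sgnbit:
+//             badun = True                          # reason 3: sign bit on x = 0
+//         if s % 2 != sgnbit:
+//             s = P - s                             # match encoded parity of x
+//         return badun, s, y                        # x = s, y as loaded
+//
+// The exponent (P - 5) // 8 is exactly the 2^252 - 3 computed by the
+// power tower further down.)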
+ + ldrb w0, [x1] + lsl x4, x0, #56 + ldrb w0, [x1, #1] + extr x4, x0, x4, #8 + ldrb w0, [x1, #2] + extr x4, x0, x4, #8 + ldrb w0, [x1, #3] + extr x4, x0, x4, #8 + ldrb w0, [x1, #4] + extr x4, x0, x4, #8 + ldrb w0, [x1, #5] + extr x4, x0, x4, #8 + ldrb w0, [x1, #6] + extr x4, x0, x4, #8 + ldrb w0, [x1, #7] + extr x4, x0, x4, #8 + + ldrb w0, [x1, #8] + lsl x5, x0, #56 + ldrb w0, [x1, #9] + extr x5, x0, x5, #8 + ldrb w0, [x1, #10] + extr x5, x0, x5, #8 + ldrb w0, [x1, #11] + extr x5, x0, x5, #8 + ldrb w0, [x1, #12] + extr x5, x0, x5, #8 + ldrb w0, [x1, #13] + extr x5, x0, x5, #8 + ldrb w0, [x1, #14] + extr x5, x0, x5, #8 + ldrb w0, [x1, #15] + extr x5, x0, x5, #8 + + ldrb w0, [x1, #16] + lsl x6, x0, #56 + ldrb w0, [x1, #17] + extr x6, x0, x6, #8 + ldrb w0, [x1, #18] + extr x6, x0, x6, #8 + ldrb w0, [x1, #19] + extr x6, x0, x6, #8 + ldrb w0, [x1, #20] + extr x6, x0, x6, #8 + ldrb w0, [x1, #21] + extr x6, x0, x6, #8 + ldrb w0, [x1, #22] + extr x6, x0, x6, #8 + ldrb w0, [x1, #23] + extr x6, x0, x6, #8 + + ldrb w0, [x1, #24] + lsl x7, x0, #56 + ldrb w0, [x1, #25] + extr x7, x0, x7, #8 + ldrb w0, [x1, #26] + extr x7, x0, x7, #8 + ldrb w0, [x1, #27] + extr x7, x0, x7, #8 + ldrb w0, [x1, #28] + extr x7, x0, x7, #8 + ldrb w0, [x1, #29] + extr x7, x0, x7, #8 + ldrb w0, [x1, #30] + extr x7, x0, x7, #8 + ldrb w0, [x1, #31] + extr x7, x0, x7, #8 + + stp x4, x5, [y] + lsr sgnbit, x7, #63 + and x7, x7, #0x7FFFFFFFFFFFFFFF + stp x6, x7, [y+16] + + adds xzr, x4, #19 + adcs xzr, x5, xzr + adcs xzr, x6, xzr + adcs xzr, x7, xzr + cset badun, mi + +// u = y^2 - 1 (actually y^2 + 2^255-20, not reduced modulo) +// v = 1 + d * y^2 (not reduced modulo from the +1) +// w = u * v + + nsqr(v,1,y) + ldp x0, x1, [v] + ldp x2, x3, [v+16] + mov x4, #0x8000000000000000 + subs x0, x0, #20 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x3, x3, x4 + stp x0, x1, [u] + stp x2, x3, [u+16] + + movbig(x0,#0x75eb,#0x4dca,#0x1359,#0x78a3) + movbig(x1,#0x0070,#0x0a4d,#0x4141,#0xd8ab) + movbig(x2,#0x8cc7,#0x4079,#0x7779,#0xe898) + movbig(x3,#0x5203,#0x6cee,#0x2b6f,#0xfe73) + stp x0, x1, [w] + stp x2, x3, [w+16] + mulp(v,w,v) + ldp x0, x1, [v] + ldp x2, x3, [v+16] + adds x0, x0, #1 + adcs x1, x1, xzr + adcs x2, x2, xzr + adcs x3, x3, xzr + stp x0, x1, [v] + stp x2, x3, [v+16] + + mulp(w,u,v) + +// Get s = w^{2^252-3} as a candidate inverse square root 1/sqrt(w). +// This power tower computation is the same as bignum_invsqrt_p25519 + + nsqr(t,1,w) + mulp(t,t,w) + nsqr(s,2,t) + mulp(t,s,t) + nsqr(s,1,t) + mulp(v,s,w) + nsqr(s,5,v) + mulp(t,s,v) + nsqr(s,10,t) + mulp(t,s,t) + nsqr(s,5,t) + mulp(v,s,v) + nsqr(s,25,v) + mulp(t,s,v) + nsqr(s,50,t) + mulp(t,s,t) + nsqr(s,25,t) + mulp(v,s,v) + nsqr(s,125,v) + mulp(v,s,v) + nsqr(s,2,v) + mulp(s,s,w) + +// Compute v' = s^2 * w to discriminate whether the square root sqrt(u/v) +// exists, in which case we should get 0, 1 or -1. + + nsqr(v,1,s) + mulp(v,v,w) + +// Get the two candidates for sqrt(u / v), one being s = u * w^{2^252-3} +// and the other being t = s * j_25519 where j_25519 = sqrt(-1). + + mulp(s,u,s) + movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) + movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) + movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) + movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) + stp x0, x1, [t] + stp x2, x3, [t+16] + mulp(t,s,t) + +// x4 = 0 <=> s^2 * w = 0 or 1 + + ldp x0, x1, [v] + ldp x2, x3, [v+16] + bic x4, x0, #1 + orr x4, x4, x1 + orr x5, x2, x3 + orr x4, x4, x5 + +// x0 = 0 <=> s^2 * w = -1 (mod p_25519, i.e.
s^2 * w = 2^255 - 20) + + add x0, x0, #20 + add x1, x1, #1 + orr x0, x0, x1 + add x2, x2, #1 + eor x3, x3, #0x7FFFFFFFFFFFFFFF + orr x2, x2, x3 + orr x0, x0, x2 + +// If s^2 * w is not 0 or 1 then replace s by t + + cmp x4, xzr + ldp x10, x11, [s] + ldp x14, x15, [t] + csel x10, x10, x14, eq + csel x11, x11, x15, eq + ldp x12, x13, [s+16] + ldp x16, x17, [t+16] + csel x12, x12, x16, eq + csel x13, x13, x17, eq + stp x10, x11, [s] + stp x12, x13, [s+16] + +// Check invalidity, occurring if s^2 * w is not in {0,1,-1} + + ccmp x0, xzr, 4, ne + cset x0, ne + orr badun, badun, x0 + +// Let [x3;x2;x1;x0] = s and [x7;x6;x5;x4] = p_25519 - s + + ldp x0, x1, [s] + ldp x2, x3, [s+16] + mov x4, #-19 + subs x4, x4, x0 + mov x6, #-1 + sbcs x5, x6, x1 + sbcs x6, x6, x2 + mov x7, #0x7FFFFFFFFFFFFFFF + sbc x7, x7, x3 + +// Decide whether a flip is apparently indicated, s_0 <=> sgnbit +// Decide also if s = 0 by OR-ing its digits. Now if a flip is indicated: +// - if s = 0 then mark as invalid +// - if s <> 0 then indeed flip + + and x9, x0, #1 + eor sgnbit, x9, sgnbit + orr x8, x0, x1 + orr x9, x2, x3 + orr x8, x8, x9 + orr x10, badun, sgnbit + cmp x8, xzr + csel badun, x10, badun, eq + ccmp sgnbit, xzr, #4, ne + +// Actual selection of x as s or -s, copying of y and return of validity + + csel x0, x0, x4, eq + csel x1, x1, x5, eq + csel x2, x2, x6, eq + csel x3, x3, x7, eq + ldp x8, x9, [y] + ldp x10, x11, [y+16] + + stp x0, x1, [res] + stp x2, x3, [res, #16] + stp x8, x9, [res, #32] + stp x10, x11, [res, #48] + + mov x0, badun + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +edwards25519_decode_mul_p25519: + ldp x3, x4, [x1] + ldp x5, x6, [x2] + umull x7, w3, w5 + lsr x17, x3, #32 + umull x15, w17, w5 + lsr x16, x5, #32 + umull x8, w16, w17 + umull x16, w3, w16 + adds x7, x7, x15, lsl #32 + lsr x15, x15, #32 + adc x8, x8, x15 + adds x7, x7, x16, lsl #32 + lsr x16, x16, #32 + adc x8, x8, x16 + mul x9, x4, x6 + umulh x10, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x16, lo + adds x9, x9, x8 + adc x10, x10, xzr + subs x3, x5, x6 + cneg x3, x3, lo + cinv x16, x16, lo + mul x15, x4, x3 + umulh x3, x4, x3 + adds x8, x7, x9 + adcs x9, x9, x10 + adc x10, x10, xzr + cmn x16, #1 + eor x15, x15, x16 + adcs x8, x15, x8 + eor x3, x3, x16 + adcs x9, x3, x9 + adc x10, x10, x16 + ldp x3, x4, [x1, #16] + ldp x5, x6, [x2, #16] + umull x11, w3, w5 + lsr x17, x3, #32 + umull x15, w17, w5 + lsr x16, x5, #32 + umull x12, w16, w17 + umull x16, w3, w16 + adds x11, x11, x15, lsl #32 + lsr x15, x15, #32 + adc x12, x12, x15 + adds x11, x11, x16, lsl #32 + lsr x16, x16, #32 + adc x12, x12, x16 + mul x13, x4, x6 + umulh x14, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x16, lo + adds x13, x13, x12 + adc x14, x14, xzr + subs x3, x5, x6 + cneg x3, x3, lo + cinv x16, x16, lo + mul x15, x4, x3 + umulh x3, x4, x3 + adds x12, x11, x13 + adcs x13, x13, x14 + adc x14, x14, xzr + cmn x16, #1 + eor x15, x15, x16 + adcs x12, x15, x12 + eor x3, x3, x16 + adcs x13, x3, x13 + adc x14, x14, x16 + ldp x3, x4, [x1, #16] + ldp x15, x16, [x1] + subs x3, x3, x15 + sbcs x4, x4, x16 + csetm x16, lo + ldp x15, x17, [x2] + subs x5, x15, x5 + sbcs x6, x17, x6 + csetm x17, lo + eor x3, x3, x16 + subs x3, x3, x16 + eor x4, x4, x16 + sbc x4, x4, x16 + eor x5, x5, x17 + subs x5, x5, x17 + eor x6, x6, x17 + sbc x6, x6, x17 + eor 
x16, x17, x16 + adds x11, x11, x9 + adcs x12, x12, x10 + adcs x13, x13, xzr + adc x14, x14, xzr + mul x2, x3, x5 + umulh x17, x3, x5 + mul x15, x4, x6 + umulh x1, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x9, lo + adds x15, x15, x17 + adc x1, x1, xzr + subs x6, x5, x6 + cneg x6, x6, lo + cinv x9, x9, lo + mul x5, x4, x6 + umulh x6, x4, x6 + adds x17, x2, x15 + adcs x15, x15, x1 + adc x1, x1, xzr + cmn x9, #1 + eor x5, x5, x9 + adcs x17, x5, x17 + eor x6, x6, x9 + adcs x15, x6, x15 + adc x1, x1, x9 + adds x9, x11, x7 + adcs x10, x12, x8 + adcs x11, x13, x11 + adcs x12, x14, x12 + adcs x13, x13, xzr + adc x14, x14, xzr + cmn x16, #1 + eor x2, x2, x16 + adcs x9, x2, x9 + eor x17, x17, x16 + adcs x10, x17, x10 + eor x15, x15, x16 + adcs x11, x15, x11 + eor x1, x1, x16 + adcs x12, x1, x12 + adcs x13, x13, x16 + adc x14, x14, x16 + mov x3, #38 + umull x4, w11, w3 + add x4, x4, w7, uxtw + lsr x7, x7, #32 + lsr x11, x11, #32 + umaddl x11, w11, w3, x7 + mov x7, x4 + umull x4, w12, w3 + add x4, x4, w8, uxtw + lsr x8, x8, #32 + lsr x12, x12, #32 + umaddl x12, w12, w3, x8 + mov x8, x4 + umull x4, w13, w3 + add x4, x4, w9, uxtw + lsr x9, x9, #32 + lsr x13, x13, #32 + umaddl x13, w13, w3, x9 + mov x9, x4 + umull x4, w14, w3 + add x4, x4, w10, uxtw + lsr x10, x10, #32 + lsr x14, x14, #32 + umaddl x14, w14, w3, x10 + mov x10, x4 + lsr x17, x14, #31 + mov x5, #19 + umaddl x5, w5, w17, x5 + add x7, x7, x5 + adds x7, x7, x11, lsl #32 + extr x3, x12, x11, #32 + adcs x8, x8, x3 + extr x3, x13, x12, #32 + adcs x9, x9, x3 + extr x3, x14, x13, #32 + lsl x5, x17, #63 + eor x10, x10, x5 + adc x10, x10, x3 + mov x3, #19 + tst x10, #0x8000000000000000 + csel x3, x3, xzr, pl + subs x7, x7, x3 + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbc x10, x10, xzr + and x10, x10, #0x7fffffffffffffff + stp x7, x8, [x0] + stp x9, x10, [x0, #16] + ret + +// ************************************************************* +// Local z = x^(2^n) +// ************************************************************* + +edwards25519_decode_nsqr_p25519: + +// Copy input argument into [x13;x12;x11;x10] + + ldp x10, x11, [x2] + ldp x12, x13, [x2, #16] + +// Main squaring loop, accumulating in [x13;x12;x11;x10] consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +edwards25519_decode_loop: + umull x2, w10, w10 + lsr x14, x10, #32 + umull x3, w14, w14 + umull x14, w10, w14 + adds x2, x2, x14, lsl #33 + lsr x14, x14, #31 + adc x3, x3, x14 + umull x4, w11, w11 + lsr x14, x11, #32 + umull x5, w14, w14 + umull x14, w11, w14 + mul x15, x10, x11 + umulh x16, x10, x11 + adds x4, x4, x14, lsl #33 + lsr x14, x14, #31 + adc x5, x5, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x5, x5, xzr + adds x3, x3, x15 + adcs x4, x4, x16 + adc x5, x5, xzr + umull x6, w12, w12 + lsr x14, x12, #32 + umull x7, w14, w14 + umull x14, w12, w14 + adds x6, x6, x14, lsl #33 + lsr x14, x14, #31 + adc x7, x7, x14 + umull x8, w13, w13 + lsr x14, x13, #32 + umull x9, w14, w14 + umull x14, w13, w14 + mul x15, x12, x13 + umulh x16, x12, x13 + adds x8, x8, x14, lsl #33 + lsr x14, x14, #31 + adc x9, x9, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x9, x9, xzr + adds x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, xzr + subs x10, x10, x12 + sbcs x11, x11, x13 + csetm x16, lo + eor x10, x10, x16 + subs x10, x10, x16 + eor x11, x11, x16 + sbc x11, x11, x16 + adds x6, x6, x4 + adcs x7, x7, x5 + adcs x8, x8, xzr + adc x9, x9, xzr + umull x12, w10, w10 + lsr x5, x10, #32 + umull x13, w5, w5 + umull x5, w10, w5 + adds x12, x12, x5, lsl #33 + lsr
x5, x5, #31 + adc x13, x13, x5 + umull x15, w11, w11 + lsr x5, x11, #32 + umull x14, w5, w5 + umull x5, w11, w5 + mul x4, x10, x11 + umulh x16, x10, x11 + adds x15, x15, x5, lsl #33 + lsr x5, x5, #31 + adc x14, x14, x5 + adds x4, x4, x4 + adcs x16, x16, x16 + adc x14, x14, xzr + adds x13, x13, x4 + adcs x15, x15, x16 + adc x14, x14, xzr + adds x4, x2, x6 + adcs x5, x3, x7 + adcs x6, x6, x8 + adcs x7, x7, x9 + csetm x16, lo + subs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x15 + sbcs x7, x7, x14 + adcs x8, x8, x16 + adc x9, x9, x16 + mov x10, #38 + umull x12, w6, w10 + add x12, x12, w2, uxtw + lsr x2, x2, #32 + lsr x6, x6, #32 + umaddl x6, w6, w10, x2 + mov x2, x12 + umull x12, w7, w10 + add x12, x12, w3, uxtw + lsr x3, x3, #32 + lsr x7, x7, #32 + umaddl x7, w7, w10, x3 + mov x3, x12 + umull x12, w8, w10 + add x12, x12, w4, uxtw + lsr x4, x4, #32 + lsr x8, x8, #32 + umaddl x8, w8, w10, x4 + mov x4, x12 + umull x12, w9, w10 + add x12, x12, w5, uxtw + lsr x5, x5, #32 + lsr x9, x9, #32 + umaddl x9, w9, w10, x5 + mov x5, x12 + lsr x13, x9, #31 + mov x11, #19 + umull x11, w11, w13 + add x2, x2, x11 + adds x10, x2, x6, lsl #32 + extr x12, x7, x6, #32 + adcs x11, x3, x12 + extr x12, x8, x7, #32 + adcs x12, x4, x12 + extr x14, x9, x8, #32 + lsl x15, x13, #63 + eor x5, x5, x15 + adc x13, x5, x14 + +// Loop as applicable + + subs x1, x1, #1 + bne edwards25519_decode_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "pl" condition. + + adds x6, x10, #19 + adcs x7, x11, xzr + adcs x8, x12, xzr + adcs x9, x13, xzr + + csel x10, x10, x6, pl + csel x11, x11, x7, pl + csel x12, x12, x8, pl + csel x13, x13, x9, pl + bic x13, x13, #0x8000000000000000 + +// Copy result back into destination and return + + stp x10, x11, [x0] + stp x12, x13, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/curve25519/edwards25519_decode_alt.S b/arm/curve25519/edwards25519_decode_alt.S new file mode 100644 index 0000000000..c77a191744 --- /dev/null +++ b/arm/curve25519/edwards25519_decode_alt.S @@ -0,0 +1,563 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Decode compressed 256-bit form of edwards25519 point +// Input c[32] (bytes); output function return and z[8] +// +// extern uint64_t edwards25519_decode_alt(uint64_t z[static 8],uint8_t c[static 32]); +// +// This interprets the input byte string as a little-endian number +// representing a point (x,y) on the edwards25519 curve, encoded as +// 2^255 * x_0 + y where x_0 is the least significant bit of x. It +// returns the full pair of coordinates x (at z) and y (at z+4). The +// return code is 0 for success and 1 for failure, which means that +// the input does not correspond to the encoding of any edwards25519 +// point. 
This can happen for three reasons, where y = the lowest +// 255 bits of the input: +// +// * y >= p_25519 +// Input y coordinate is not reduced +// * (y^2 - 1) * (1 + d_25519 * y^2) has no modular square root +// There is no x such that (x,y) is on the curve +// * y^2 = 1 and top bit of input is set +// Cannot be the canonical encoding of (0,1) or (0,-1) +// +// Standard ARM ABI: X0 = z, X1 = c +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_decode_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_decode_alt) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define y sp, #0 +#define s sp, #(4*N) +#define t sp, #(8*N) +#define u sp, #(12*N) +#define v sp, #(16*N) +#define w sp, #(20*N) + +// Other temporary variables in register + +#define res x19 +#define sgnbit x20 +#define badun x21 + +// Total size to reserve on the stack + +#define NSPACE #(24*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0; \ + movk nn, n1, lsl #16; \ + movk nn, n2, lsl #32; \ + movk nn, n3, lsl #48 + +// Macros wrapping up calls to the local subroutines + +#define mulp(dest,src1,src2) \ + add x0, dest; \ + add x1, src1; \ + add x2, src2; \ + bl edwards25519_decode_alt_mul_p25519 + +#define nsqr(dest,n,src) \ + add x0, dest; \ + mov x1, n; \ + add x2, src; \ + bl edwards25519_decode_alt_nsqr_p25519 + +S2N_BN_SYMBOL(edwards25519_decode_alt): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x30, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Load the inputs, using byte operations in case of big-endian setting. +// Let y be the lowest 255 bits of the input and sgnbit the desired parity. +// If y >= p_25519 then already flag the input as invalid (badun = 1). 
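+// (Editorial note: the test below does not compare y with p_25519 digit
+// by digit; it adds 19 and checks for a carry into bit 255, using the
+// equivalence y >= 2^255 - 19 <=> y + 19 >= 2^255 for 0 <= y < 2^255.
+// A minimal Python rendering, with a hypothetical helper name:
+//
+//     P = 2**255 - 19
+//
+//     def y_not_reduced(y):              # y = low 255 bits of the input
+//         assert 0 <= y < 2**255
+//         return (y + 19) >> 255 != 0    # same truth value as y >= P
+//
+// In the code this carry shows up as the sign ("mi") of the top limb
+// after the adds/adcs chain, since bit 255 lands in bit 63 of x7 + carry.)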
+ + ldrb w0, [x1] + lsl x4, x0, #56 + ldrb w0, [x1, #1] + extr x4, x0, x4, #8 + ldrb w0, [x1, #2] + extr x4, x0, x4, #8 + ldrb w0, [x1, #3] + extr x4, x0, x4, #8 + ldrb w0, [x1, #4] + extr x4, x0, x4, #8 + ldrb w0, [x1, #5] + extr x4, x0, x4, #8 + ldrb w0, [x1, #6] + extr x4, x0, x4, #8 + ldrb w0, [x1, #7] + extr x4, x0, x4, #8 + + ldrb w0, [x1, #8] + lsl x5, x0, #56 + ldrb w0, [x1, #9] + extr x5, x0, x5, #8 + ldrb w0, [x1, #10] + extr x5, x0, x5, #8 + ldrb w0, [x1, #11] + extr x5, x0, x5, #8 + ldrb w0, [x1, #12] + extr x5, x0, x5, #8 + ldrb w0, [x1, #13] + extr x5, x0, x5, #8 + ldrb w0, [x1, #14] + extr x5, x0, x5, #8 + ldrb w0, [x1, #15] + extr x5, x0, x5, #8 + + ldrb w0, [x1, #16] + lsl x6, x0, #56 + ldrb w0, [x1, #17] + extr x6, x0, x6, #8 + ldrb w0, [x1, #18] + extr x6, x0, x6, #8 + ldrb w0, [x1, #19] + extr x6, x0, x6, #8 + ldrb w0, [x1, #20] + extr x6, x0, x6, #8 + ldrb w0, [x1, #21] + extr x6, x0, x6, #8 + ldrb w0, [x1, #22] + extr x6, x0, x6, #8 + ldrb w0, [x1, #23] + extr x6, x0, x6, #8 + + ldrb w0, [x1, #24] + lsl x7, x0, #56 + ldrb w0, [x1, #25] + extr x7, x0, x7, #8 + ldrb w0, [x1, #26] + extr x7, x0, x7, #8 + ldrb w0, [x1, #27] + extr x7, x0, x7, #8 + ldrb w0, [x1, #28] + extr x7, x0, x7, #8 + ldrb w0, [x1, #29] + extr x7, x0, x7, #8 + ldrb w0, [x1, #30] + extr x7, x0, x7, #8 + ldrb w0, [x1, #31] + extr x7, x0, x7, #8 + + stp x4, x5, [y] + lsr sgnbit, x7, #63 + and x7, x7, #0x7FFFFFFFFFFFFFFF + stp x6, x7, [y+16] + + adds xzr, x4, #19 + adcs xzr, x5, xzr + adcs xzr, x6, xzr + adcs xzr, x7, xzr + cset badun, mi + +// u = y^2 - 1 (actually y^2 + 2^255-20, not reduced modulo) +// v = 1 + d * y^2 (not reduced modulo from the +1) +// w = u * v + + nsqr(v,1,y) + ldp x0, x1, [v] + ldp x2, x3, [v+16] + mov x4, #0x8000000000000000 + subs x0, x0, #20 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x3, x3, x4 + stp x0, x1, [u] + stp x2, x3, [u+16] + + movbig(x0,#0x75eb,#0x4dca,#0x1359,#0x78a3) + movbig(x1,#0x0070,#0x0a4d,#0x4141,#0xd8ab) + movbig(x2,#0x8cc7,#0x4079,#0x7779,#0xe898) + movbig(x3,#0x5203,#0x6cee,#0x2b6f,#0xfe73) + stp x0, x1, [w] + stp x2, x3, [w+16] + mulp(v,w,v) + ldp x0, x1, [v] + ldp x2, x3, [v+16] + adds x0, x0, #1 + adcs x1, x1, xzr + adcs x2, x2, xzr + adcs x3, x3, xzr + stp x0, x1, [v] + stp x2, x3, [v+16] + + mulp(w,u,v) + +// Get s = w^{2^252-3} as a candidate inverse square root 1/sqrt(w). +// This power tower computation is the same as bignum_invsqrt_p25519 + + nsqr(t,1,w) + mulp(t,t,w) + nsqr(s,2,t) + mulp(t,s,t) + nsqr(s,1,t) + mulp(v,s,w) + nsqr(s,5,v) + mulp(t,s,v) + nsqr(s,10,t) + mulp(t,s,t) + nsqr(s,5,t) + mulp(v,s,v) + nsqr(s,25,v) + mulp(t,s,v) + nsqr(s,50,t) + mulp(t,s,t) + nsqr(s,25,t) + mulp(v,s,v) + nsqr(s,125,v) + mulp(v,s,v) + nsqr(s,2,v) + mulp(s,s,w) + +// Compute v' = s^2 * w to discriminate whether the square root sqrt(u/v) +// exists, in which case we should get 0, 1 or -1. + + nsqr(v,1,s) + mulp(v,v,w) + +// Get the two candidates for sqrt(u / v), one being s = u * w^{2^252-3} +// and the other being t = s * j_25519 where j_25519 = sqrt(-1). + + mulp(s,u,s) + movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) + movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) + movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) + movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) + stp x0, x1, [t] + stp x2, x3, [t+16] + mulp(t,s,t) + +// x4 = 0 <=> s^2 * w = 0 or 1 + + ldp x0, x1, [v] + ldp x2, x3, [v+16] + bic x4, x0, #1 + orr x4, x4, x1 + orr x5, x2, x3 + orr x4, x4, x5 + +// x0 = 0 <=> s^2 * w = -1 (mod p_25519, i.e.
s^2 * w = 2^255 - 20) + + add x0, x0, #20 + add x1, x1, #1 + orr x0, x0, x1 + add x2, x2, #1 + eor x3, x3, #0x7FFFFFFFFFFFFFFF + orr x2, x2, x3 + orr x0, x0, x2 + +// If s^2 * w is not 0 or 1 then replace s by t + + cmp x4, xzr + ldp x10, x11, [s] + ldp x14, x15, [t] + csel x10, x10, x14, eq + csel x11, x11, x15, eq + ldp x12, x13, [s+16] + ldp x16, x17, [t+16] + csel x12, x12, x16, eq + csel x13, x13, x17, eq + stp x10, x11, [s] + stp x12, x13, [s+16] + +// Check invalidity, occurring if s^2 * w is not in {0,1,-1} + + ccmp x0, xzr, 4, ne + cset x0, ne + orr badun, badun, x0 + +// Let [x3;x2;x1;x0] = s and [x7;x6;x5;x4] = p_25519 - s + + ldp x0, x1, [s] + ldp x2, x3, [s+16] + mov x4, #-19 + subs x4, x4, x0 + mov x6, #-1 + sbcs x5, x6, x1 + sbcs x6, x6, x2 + mov x7, #0x7FFFFFFFFFFFFFFF + sbc x7, x7, x3 + +// Decide whether a flip is apparently indicated, s_0 <=> sgnbit +// Decide also if s = 0 by OR-ing its digits. Now if a flip is indicated: +// - if s = 0 then mark as invalid +// - if s <> 0 then indeed flip + + and x9, x0, #1 + eor sgnbit, x9, sgnbit + orr x8, x0, x1 + orr x9, x2, x3 + orr x8, x8, x9 + orr x10, badun, sgnbit + cmp x8, xzr + csel badun, x10, badun, eq + ccmp sgnbit, xzr, #4, ne + +// Actual selection of x as s or -s, copying of y and return of validity + + csel x0, x0, x4, eq + csel x1, x1, x5, eq + csel x2, x2, x6, eq + csel x3, x3, x7, eq + ldp x8, x9, [y] + ldp x10, x11, [y+16] + + stp x0, x1, [res] + stp x2, x3, [res, #16] + stp x8, x9, [res, #32] + stp x10, x11, [res, #48] + + mov x0, badun + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +edwards25519_decode_alt_mul_p25519: + ldp x3, x4, [x1] + ldp x7, x8, [x2] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x2, #16] + mul x11, x3, x9 + umulh x15, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x16, x3, x10 + adcs x15, x15, x11 + adc x16, x16, xzr + ldp x5, x6, [x1, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x15, x15, x11 + mul x11, x4, x10 + adcs x16, x16, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x15, x15, x11 + umulh x11, x4, x9 + adcs x16, x16, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x15, x15, x11 + mul x11, x5, x9 + adcs x16, x16, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x15, x15, x11 + umulh x11, x5, x8 + adcs x16, x16, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x15, x15, x11 + mul x11, x6, x8 + adcs x16, x16, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x16, x16, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + mov x7, #38 + mul x11, x7, x16 + umulh x9, x7, x16 + adds x12, x12, x11 + mul x11, x7, x3 + umulh x3, x7, x3 + adcs x13, x13, x11 + mul x11, x7, x4 + umulh x4, x7, x4 + adcs x14, x14, x11 + mul x11, x7, x5 + umulh x5, x7, x5 + adcs x15, x15, x11 + cset x16, hs + adds x15, x15, x4 + adc x16, x16, x5 + cmn x15, x15 + orr x15, x15, #0x8000000000000000 + adc x8, x16, x16 + mov x7, #19 + 
madd x11, x7, x8, x7 + adds x12, x12, x11 + adcs x13, x13, x9 + adcs x14, x14, x3 + adcs x15, x15, xzr + csel x7, x7, xzr, lo + subs x12, x12, x7 + sbcs x13, x13, xzr + sbcs x14, x14, xzr + sbc x15, x15, xzr + and x15, x15, #0x7fffffffffffffff + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + ret + +// ************************************************************* +// Local z = x^(2^n) +// ************************************************************* + +edwards25519_decode_alt_nsqr_p25519: + +// Copy input argument into [x5;x4;x3;x2] (overwriting input pointer x2) + + ldp x6, x3, [x2] + ldp x4, x5, [x2, #16] + mov x2, x6 + +// Main squaring loop, accumulating in [x5;x4;x3;x2] consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +edwards25519_decode_alt_loop: + mul x9, x2, x3 + umulh x10, x2, x3 + mul x11, x2, x5 + umulh x12, x2, x5 + mul x7, x2, x4 + umulh x6, x2, x4 + adds x10, x10, x7 + adcs x11, x11, x6 + mul x7, x3, x4 + umulh x6, x3, x4 + adc x6, x6, xzr + adds x11, x11, x7 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x6 + mul x7, x3, x5 + umulh x6, x3, x5 + adc x6, x6, xzr + adds x12, x12, x7 + adcs x13, x13, x6 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x6, hs + umulh x7, x2, x2 + mul x8, x2, x2 + adds x9, x9, x7 + mul x7, x3, x3 + adcs x10, x10, x7 + umulh x7, x3, x3 + adcs x11, x11, x7 + mul x7, x4, x4 + adcs x12, x12, x7 + umulh x7, x4, x4 + adcs x13, x13, x7 + mul x7, x5, x5 + adcs x14, x14, x7 + umulh x7, x5, x5 + adc x6, x6, x7 + mov x3, #38 + mul x7, x3, x12 + umulh x4, x3, x12 + adds x8, x8, x7 + mul x7, x3, x13 + umulh x13, x3, x13 + adcs x9, x9, x7 + mul x7, x3, x14 + umulh x14, x3, x14 + adcs x10, x10, x7 + mul x7, x3, x6 + umulh x6, x3, x6 + adcs x11, x11, x7 + cset x12, hs + adds x11, x11, x14 + adc x12, x12, x6 + cmn x11, x11 + bic x11, x11, #0x8000000000000000 + adc x2, x12, x12 + mov x3, #0x13 + mul x7, x3, x2 + adds x2, x8, x7 + adcs x3, x9, x4 + adcs x4, x10, x13 + adc x5, x11, xzr + +// Loop as applicable + + subs x1, x1, #1 + bne edwards25519_decode_alt_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "pl" condition. + + adds x6, x2, #19 + adcs x7, x3, xzr + adcs x8, x4, xzr + adcs x9, x5, xzr + + csel x2, x2, x6, pl + csel x3, x3, x7, pl + csel x4, x4, x8, pl + csel x5, x5, x9, pl + bic x5, x5, #0x8000000000000000 + +// Copy result back into destination and return + + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/curve25519/edwards25519_encode.S b/arm/curve25519/edwards25519_encode.S new file mode 100644 index 0000000000..4cf301a227 --- /dev/null +++ b/arm/curve25519/edwards25519_encode.S @@ -0,0 +1,131 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Encode edwards25519 point into compressed form as 256-bit number +// Input p[8]; output z[32] (bytes) +// +// extern void edwards25519_encode +// (uint8_t z[static 32], uint64_t p[static 8]); +// +// This assumes that the input buffer p points to a pair of 256-bit +// numbers x (at p) and y (at p+4) representing a point (x,y) on the +// edwards25519 curve.
It is assumed that both x and y are < p_25519 +// but there is no checking of this, nor of the fact that (x,y) is +// in fact on the curve. +// +// The output in z is a little-endian array of bytes corresponding to +// the standard compressed encoding of a point as 2^255 * x_0 + y +// where x_0 is the least significant bit of x. +// See "https://datatracker.ietf.org/doc/html/rfc8032#section-5.1.2" +// In this implementation, y is simply truncated to 255 bits, but if +// it is reduced mod p_25519 as expected this does not affect values. +// +// Standard ARM ABI: X0 = z, X1 = p +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_encode) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_encode) + .text + .balign 4 + +#define z x0 +#define p x1 + +#define y0 x2 +#define y1 x3 +#define y2 x4 +#define y3 x5 +#define y0short w2 +#define y1short w3 +#define y2short w4 +#define y3short w5 +#define xb x6 + +S2N_BN_SYMBOL(edwards25519_encode): + +// Load lowest word of x coordinate in xb and full y as [y3;y2;y1;y0]. + + ldr xb, [p] + ldp y0, y1, [p, #32] + ldp y2, y3, [p, #48] + +// Compute the encoded form, making the LSB of x the MSB of the encoding + + and y3, y3, #0x7FFFFFFFFFFFFFFF + orr y3, y3, xb, lsl #63 + +// Write back in a byte-oriented fashion to be independent of endianness + + strb y0short, [z] + lsr y0, y0, #8 + strb y0short, [z, #1] + lsr y0, y0, #8 + strb y0short, [z, #2] + lsr y0, y0, #8 + strb y0short, [z, #3] + lsr y0, y0, #8 + strb y0short, [z, #4] + lsr y0, y0, #8 + strb y0short, [z, #5] + lsr y0, y0, #8 + strb y0short, [z, #6] + lsr y0, y0, #8 + strb y0short, [z, #7] + + strb y1short, [z, #8] + lsr y1, y1, #8 + strb y1short, [z, #9] + lsr y1, y1, #8 + strb y1short, [z, #10] + lsr y1, y1, #8 + strb y1short, [z, #11] + lsr y1, y1, #8 + strb y1short, [z, #12] + lsr y1, y1, #8 + strb y1short, [z, #13] + lsr y1, y1, #8 + strb y1short, [z, #14] + lsr y1, y1, #8 + strb y1short, [z, #15] + + strb y2short, [z, #16] + lsr y2, y2, #8 + strb y2short, [z, #17] + lsr y2, y2, #8 + strb y2short, [z, #18] + lsr y2, y2, #8 + strb y2short, [z, #19] + lsr y2, y2, #8 + strb y2short, [z, #20] + lsr y2, y2, #8 + strb y2short, [z, #21] + lsr y2, y2, #8 + strb y2short, [z, #22] + lsr y2, y2, #8 + strb y2short, [z, #23] + + strb y3short, [z, #24] + lsr y3, y3, #8 + strb y3short, [z, #25] + lsr y3, y3, #8 + strb y3short, [z, #26] + lsr y3, y3, #8 + strb y3short, [z, #27] + lsr y3, y3, #8 + strb y3short, [z, #28] + lsr y3, y3, #8 + strb y3short, [z, #29] + lsr y3, y3, #8 + strb y3short, [z, #30] + lsr y3, y3, #8 + strb y3short, [z, #31] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/arm/curve25519/edwards25519_scalarmulbase.S b/arm/curve25519/edwards25519_scalarmulbase.S index 6ca092489f..8c9d0f9193 100644 --- a/arm/curve25519/edwards25519_scalarmulbase.S +++ b/arm/curve25519/edwards25519_scalarmulbase.S @@ -956,346 +956,1045 @@ edwards25519_scalarmulbase_scalarloop: // (X,Y,Z,W) back to the affine form (x,y) = (X/Z,Y/Z). This means // first calling the modular inverse to get w_3 = 1/z_3. - mov x0, 4 - add x1, w_3 - add x2, z_3 - adr x3, edwards25519_scalarmulbase_p_25519 - add x4, tmpspace - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). 
For more -// details and explanations see "arm/generic/bignum_modinv.S". - - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -edwards25519_scalarmulbase_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmulbase_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -edwards25519_scalarmulbase_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmulbase_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -edwards25519_scalarmulbase_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, edwards25519_scalarmulbase_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmulbase_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmulbase_wmontend -edwards25519_scalarmulbase_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_wmontloop 
-edwards25519_scalarmulbase_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmulbase_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -edwards25519_scalarmulbase_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmulbase_zmontend -edwards25519_scalarmulbase_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_zmontloop -edwards25519_scalarmulbase_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmulbase_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -edwards25519_scalarmulbase_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmulbase_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmulbase_negskip1 -edwards25519_scalarmulbase_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmulbase_negloop1 -edwards25519_scalarmulbase_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmulbase_negskip2 -edwards25519_scalarmulbase_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmulbase_negloop2 -edwards25519_scalarmulbase_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs 
x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -edwards25519_scalarmulbase_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -edwards25519_scalarmulbase_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_zfliploop - subs x2, x2, #0x3a - b.hi edwards25519_scalarmulbase_outerloop + add x0, w_3 + add x1, z_3 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, w_3, x_3 +// and y_3. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b edwards25519_scalarmulbase_invmidloop +edwards25519_scalarmulbase_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr 
x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add 
x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +edwards25519_scalarmulbase_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, 
x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + 
cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, 
#0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne edwards25519_scalarmulbase_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, 
#0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // The final result is x = X * inv(Z), y = Y * inv(Z). // These are the only operations in the whole computation that @@ -1322,14 +2021,6 @@ edwards25519_scalarmulbase_zfliploop: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmulbase_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 0 * B = 0 and 2^251 * B in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/arm/curve25519/edwards25519_scalarmulbase_alt.S b/arm/curve25519/edwards25519_scalarmulbase_alt.S index e8dd9114a4..03e5598f2c 100644 --- a/arm/curve25519/edwards25519_scalarmulbase_alt.S +++ b/arm/curve25519/edwards25519_scalarmulbase_alt.S @@ -798,346 +798,1045 @@ edwards25519_scalarmulbase_alt_scalarloop: // (X,Y,Z,W) back to the affine form (x,y) = (X/Z,Y/Z). This means // first calling the modular inverse to get w_3 = 1/z_3. - mov x0, 4 - add x1, w_3 - add x2, z_3 - adr x3, edwards25519_scalarmulbase_alt_p_25519 - add x4, tmpspace - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". - - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -edwards25519_scalarmulbase_alt_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmulbase_alt_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -edwards25519_scalarmulbase_alt_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_alt_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmulbase_alt_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -edwards25519_scalarmulbase_alt_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - 
add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, edwards25519_scalarmulbase_alt_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_alt_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmulbase_alt_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmulbase_alt_wmontend -edwards25519_scalarmulbase_alt_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_wmontloop -edwards25519_scalarmulbase_alt_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmulbase_alt_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -edwards25519_scalarmulbase_alt_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmulbase_alt_zmontend -edwards25519_scalarmulbase_alt_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_zmontloop -edwards25519_scalarmulbase_alt_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmulbase_alt_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -edwards25519_scalarmulbase_alt_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_alt_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, 
[x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmulbase_alt_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmulbase_alt_negskip1 -edwards25519_scalarmulbase_alt_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmulbase_alt_negloop1 -edwards25519_scalarmulbase_alt_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmulbase_alt_negskip2 -edwards25519_scalarmulbase_alt_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmulbase_alt_negloop2 -edwards25519_scalarmulbase_alt_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -edwards25519_scalarmulbase_alt_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -edwards25519_scalarmulbase_alt_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_zfliploop - subs x2, x2, #0x3a - b.hi edwards25519_scalarmulbase_alt_outerloop + add x0, w_3 + add x1, z_3 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, w_3, x_3 +// and y_3. 
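Functionally, the inlined block below just computes w_3 = z_3^(-1) (mod p_25519). For intuition only, here is a minimal Python model of that contract, using Fermat's little theorem rather than the iterative algorithm the assembly actually implements; the name inv_p25519 and the test value are illustrative assumptions, not part of the source.

    # Contract of the inlined inverse: given z not divisible by p_25519,
    # return w with (w * z) % p_25519 == 1 (this model also returns 0
    # when z % p_25519 == 0).
    p_25519 = 2**255 - 19

    def inv_p25519(z):
        # Fermat: z**(p-2) is the modular inverse of z for prime p
        return pow(z, p_25519 - 2, p_25519)

    z = 0x123456789abcdef0123456789abcdef   # arbitrary nonzero test value
    w = inv_p25519(z)
    assert (w * z) % p_25519 == 1           # defining property of the inverse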
+ + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b edwards25519_scalarmulbase_alt_invmidloop +edwards25519_scalarmulbase_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, 
x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +edwards25519_scalarmulbase_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + 
ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add 
x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and 
x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add 
x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne edwards25519_scalarmulbase_alt_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // The final result is x = X * inv(Z), y = Y * inv(Z). // These are the only operations in the whole computation that @@ -1164,14 +1863,6 @@ edwards25519_scalarmulbase_alt_zfliploop: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmulbase_alt_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 0 * B = 0 and 2^251 * B in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/arm/curve25519/edwards25519_scalarmuldouble.S b/arm/curve25519/edwards25519_scalarmuldouble.S new file mode 100644 index 0000000000..00ea37eaaf --- /dev/null +++ b/arm/curve25519/edwards25519_scalarmuldouble.S @@ -0,0 +1,3157 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Double scalar multiplication for edwards25519, fresh and base point +// Input scalar[4], point[8], bscalar[4]; output res[8] +// +// extern void edwards25519_scalarmuldouble +// (uint64_t res[static 8],uint64_t scalar[static 4], +// uint64_t point[static 8],uint64_t bscalar[static 4]); +// +// Given scalar = n, point = P and bscalar = m, returns in res +// the point (X,Y) = n * P + m * B where B = (...,4/5) is +// the standard basepoint for the edwards25519 (Ed25519) curve. +// +// Both 256-bit coordinates of the input point P are implicitly +// reduced modulo 2^255-19 if they are not already in reduced form, +// but the conventional usage is that they *are* already reduced. +// The scalars can be arbitrary 256-bit numbers but may also be +// considered as implicitly reduced modulo the group order. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point, X3 = bscalar +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_scalarmuldouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_scalarmuldouble) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable home for the input result argument during the whole body + +#define res x25 + +// Additional pointer variables for local subroutines + +#define p0 x22 +#define p1 x23 +#define p2 x24 + +// Other variables that are only needed prior to the modular inverse. + +#define i x19 +#define bf x20 +#define cf x21 + +// Pointer-offset pairs for result and temporaries on stack with some aliasing. + +#define resx res, #(0*NUMSIZE) +#define resy res, #(1*NUMSIZE) + +#define scalar sp, #(0*NUMSIZE) +#define bscalar sp, #(1*NUMSIZE) + +#define btabent sp, #(2*NUMSIZE) +#define acc sp, #(5*NUMSIZE) +#define acc_x sp, #(5*NUMSIZE) +#define acc_y sp, #(6*NUMSIZE) +#define acc_z sp, #(7*NUMSIZE) +#define acc_w sp, #(8*NUMSIZE) + +#define tabent sp, #(9*NUMSIZE) + +#define tab sp, #(13*NUMSIZE) + +// Total size to reserve on the stack (excluding local subroutines) + +#define NSPACE (45*NUMSIZE) + +// Sub-references used in local subroutines with local stack + +#define x_0 p0, #0 +#define y_0 p0, #NUMSIZE +#define z_0 p0, #(2*NUMSIZE) +#define w_0 p0, #(3*NUMSIZE) + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) +#define w_1 p1, #(3*NUMSIZE) + +#define x_2 p2, #0 +#define y_2 p2, #NUMSIZE +#define z_2 p2, #(2*NUMSIZE) +#define w_2 p2, #(3*NUMSIZE) + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) +#define t5 sp, #(5*NUMSIZE) + +// Load 64-bit immediate into a register + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0; \ + movk nn, n1, lsl #16; \ + movk nn, n2, lsl #32; \ + movk nn, n3, lsl #48 + +// Macro wrapping up the basic field operation bignum_mul_p25519, only +// trivially different from a pure function call to that subroutine. 
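Both mul_p25519 below and the looser mul_4 variant after it reduce a 512-bit product using the identity 2^256 ≡ 38 (mod p_25519), which is why the constant 0x26 = 38 appears in their bodies. A hedged Python model of the two contracts, for intuition only (the function names are illustrative):

    p_25519 = 2**255 - 19

    def mul_p25519(a, b):
        # Contract of the full macro: the completely reduced product
        return (a * b) % p_25519

    def fold(t):
        # One folding step on a 512-bit product: since 2^256 == 38 (mod p),
        # lo + 38*hi is congruent to t but fits well below 2^262
        lo, hi = t & (2**256 - 1), t >> 256
        return lo + 38 * hi

    a, b = 2**255 - 20, 2**254 + 12345      # arbitrary test operands
    assert fold(a * b) % p_25519 == mul_p25519(a, b)

mul_4 stops one correction short of this full reduction and only guarantees a result below 2 * p_25519.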
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + umull x7, w3, w5; \ + lsr x0, x3, #32; \ + umull x15, w0, w5; \ + lsr x16, x5, #32; \ + umull x8, w16, w0; \ + umull x16, w3, w16; \ + adds x7, x7, x15, lsl #32; \ + lsr x15, x15, #32; \ + adc x8, x8, x15; \ + adds x7, x7, x16, lsl #32; \ + lsr x16, x16, #32; \ + adc x8, x8, x16; \ + mul x9, x4, x6; \ + umulh x10, x4, x6; \ + subs x4, x4, x3; \ + cneg x4, x4, cc; \ + csetm x16, cc; \ + adds x9, x9, x8; \ + adc x10, x10, xzr; \ + subs x3, x5, x6; \ + cneg x3, x3, cc; \ + cinv x16, x16, cc; \ + mul x15, x4, x3; \ + umulh x3, x4, x3; \ + adds x8, x7, x9; \ + adcs x9, x9, x10; \ + adc x10, x10, xzr; \ + cmn x16, #0x1; \ + eor x15, x15, x16; \ + adcs x8, x15, x8; \ + eor x3, x3, x16; \ + adcs x9, x3, x9; \ + adc x10, x10, x16; \ + ldp x3, x4, [P1+16]; \ + ldp x5, x6, [P2+16]; \ + umull x11, w3, w5; \ + lsr x0, x3, #32; \ + umull x15, w0, w5; \ + lsr x16, x5, #32; \ + umull x12, w16, w0; \ + umull x16, w3, w16; \ + adds x11, x11, x15, lsl #32; \ + lsr x15, x15, #32; \ + adc x12, x12, x15; \ + adds x11, x11, x16, lsl #32; \ + lsr x16, x16, #32; \ + adc x12, x12, x16; \ + mul x13, x4, x6; \ + umulh x14, x4, x6; \ + subs x4, x4, x3; \ + cneg x4, x4, cc; \ + csetm x16, cc; \ + adds x13, x13, x12; \ + adc x14, x14, xzr; \ + subs x3, x5, x6; \ + cneg x3, x3, cc; \ + cinv x16, x16, cc; \ + mul x15, x4, x3; \ + umulh x3, x4, x3; \ + adds x12, x11, x13; \ + adcs x13, x13, x14; \ + adc x14, x14, xzr; \ + cmn x16, #0x1; \ + eor x15, x15, x16; \ + adcs x12, x15, x12; \ + eor x3, x3, x16; \ + adcs x13, x3, x13; \ + adc x14, x14, x16; \ + ldp x3, x4, [P1+16]; \ + ldp x15, x16, [P1]; \ + subs x3, x3, x15; \ + sbcs x4, x4, x16; \ + csetm x16, cc; \ + ldp x15, x0, [P2]; \ + subs x5, x15, x5; \ + sbcs x6, x0, x6; \ + csetm x0, cc; \ + eor x3, x3, x16; \ + subs x3, x3, x16; \ + eor x4, x4, x16; \ + sbc x4, x4, x16; \ + eor x5, x5, x0; \ + subs x5, x5, x0; \ + eor x6, x6, x0; \ + sbc x6, x6, x0; \ + eor x16, x0, x16; \ + adds x11, x11, x9; \ + adcs x12, x12, x10; \ + adcs x13, x13, xzr; \ + adc x14, x14, xzr; \ + mul x2, x3, x5; \ + umulh x0, x3, x5; \ + mul x15, x4, x6; \ + umulh x1, x4, x6; \ + subs x4, x4, x3; \ + cneg x4, x4, cc; \ + csetm x9, cc; \ + adds x15, x15, x0; \ + adc x1, x1, xzr; \ + subs x6, x5, x6; \ + cneg x6, x6, cc; \ + cinv x9, x9, cc; \ + mul x5, x4, x6; \ + umulh x6, x4, x6; \ + adds x0, x2, x15; \ + adcs x15, x15, x1; \ + adc x1, x1, xzr; \ + cmn x9, #0x1; \ + eor x5, x5, x9; \ + adcs x0, x5, x0; \ + eor x6, x6, x9; \ + adcs x15, x6, x15; \ + adc x1, x1, x9; \ + adds x9, x11, x7; \ + adcs x10, x12, x8; \ + adcs x11, x13, x11; \ + adcs x12, x14, x12; \ + adcs x13, x13, xzr; \ + adc x14, x14, xzr; \ + cmn x16, #0x1; \ + eor x2, x2, x16; \ + adcs x9, x2, x9; \ + eor x0, x0, x16; \ + adcs x10, x0, x10; \ + eor x15, x15, x16; \ + adcs x11, x15, x11; \ + eor x1, x1, x16; \ + adcs x12, x1, x12; \ + adcs x13, x13, x16; \ + adc x14, x14, x16; \ + mov x3, #0x26; \ + umull x4, w11, w3; \ + add x4, x4, w7, uxtw; \ + lsr x7, x7, #32; \ + lsr x11, x11, #32; \ + umaddl x11, w11, w3, x7; \ + mov x7, x4; \ + umull x4, w12, w3; \ + add x4, x4, w8, uxtw; \ + lsr x8, x8, #32; \ + lsr x12, x12, #32; \ + umaddl x12, w12, w3, x8; \ + mov x8, x4; \ + umull x4, w13, w3; \ + add x4, x4, w9, uxtw; \ + lsr x9, x9, #32; \ + lsr x13, x13, #32; \ + umaddl x13, w13, w3, x9; \ + mov x9, x4; \ + umull x4, w14, w3; \ + add x4, x4, w10, uxtw; \ + lsr x10, x10, #32; \ + lsr x14, x14, #32; \ + umaddl x14, w14, w3, x10; \ + mov x10, x4; \ + lsr x0, 
x14, #31; \ + mov x5, #0x13; \ + umaddl x5, w5, w0, x5; \ + add x7, x7, x5; \ + adds x7, x7, x11, lsl #32; \ + extr x3, x12, x11, #32; \ + adcs x8, x8, x3; \ + extr x3, x13, x12, #32; \ + adcs x9, x9, x3; \ + extr x3, x14, x13, #32; \ + lsl x5, x0, #63; \ + eor x10, x10, x5; \ + adc x10, x10, x3; \ + mov x3, #0x13; \ + tst x10, #0x8000000000000000; \ + csel x3, x3, xzr, pl; \ + subs x7, x7, x3; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + and x10, x10, #0x7fffffffffffffff; \ + stp x7, x8, [P0]; \ + stp x9, x10, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. + +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + umull x7, w3, w5; \ + lsr x0, x3, #32; \ + umull x15, w0, w5; \ + lsr x16, x5, #32; \ + umull x8, w16, w0; \ + umull x16, w3, w16; \ + adds x7, x7, x15, lsl #32; \ + lsr x15, x15, #32; \ + adc x8, x8, x15; \ + adds x7, x7, x16, lsl #32; \ + lsr x16, x16, #32; \ + adc x8, x8, x16; \ + mul x9, x4, x6; \ + umulh x10, x4, x6; \ + subs x4, x4, x3; \ + cneg x4, x4, cc; \ + csetm x16, cc; \ + adds x9, x9, x8; \ + adc x10, x10, xzr; \ + subs x3, x5, x6; \ + cneg x3, x3, cc; \ + cinv x16, x16, cc; \ + mul x15, x4, x3; \ + umulh x3, x4, x3; \ + adds x8, x7, x9; \ + adcs x9, x9, x10; \ + adc x10, x10, xzr; \ + cmn x16, #0x1; \ + eor x15, x15, x16; \ + adcs x8, x15, x8; \ + eor x3, x3, x16; \ + adcs x9, x3, x9; \ + adc x10, x10, x16; \ + ldp x3, x4, [P1+16]; \ + ldp x5, x6, [P2+16]; \ + umull x11, w3, w5; \ + lsr x0, x3, #32; \ + umull x15, w0, w5; \ + lsr x16, x5, #32; \ + umull x12, w16, w0; \ + umull x16, w3, w16; \ + adds x11, x11, x15, lsl #32; \ + lsr x15, x15, #32; \ + adc x12, x12, x15; \ + adds x11, x11, x16, lsl #32; \ + lsr x16, x16, #32; \ + adc x12, x12, x16; \ + mul x13, x4, x6; \ + umulh x14, x4, x6; \ + subs x4, x4, x3; \ + cneg x4, x4, cc; \ + csetm x16, cc; \ + adds x13, x13, x12; \ + adc x14, x14, xzr; \ + subs x3, x5, x6; \ + cneg x3, x3, cc; \ + cinv x16, x16, cc; \ + mul x15, x4, x3; \ + umulh x3, x4, x3; \ + adds x12, x11, x13; \ + adcs x13, x13, x14; \ + adc x14, x14, xzr; \ + cmn x16, #0x1; \ + eor x15, x15, x16; \ + adcs x12, x15, x12; \ + eor x3, x3, x16; \ + adcs x13, x3, x13; \ + adc x14, x14, x16; \ + ldp x3, x4, [P1+16]; \ + ldp x15, x16, [P1]; \ + subs x3, x3, x15; \ + sbcs x4, x4, x16; \ + csetm x16, cc; \ + ldp x15, x0, [P2]; \ + subs x5, x15, x5; \ + sbcs x6, x0, x6; \ + csetm x0, cc; \ + eor x3, x3, x16; \ + subs x3, x3, x16; \ + eor x4, x4, x16; \ + sbc x4, x4, x16; \ + eor x5, x5, x0; \ + subs x5, x5, x0; \ + eor x6, x6, x0; \ + sbc x6, x6, x0; \ + eor x16, x0, x16; \ + adds x11, x11, x9; \ + adcs x12, x12, x10; \ + adcs x13, x13, xzr; \ + adc x14, x14, xzr; \ + mul x2, x3, x5; \ + umulh x0, x3, x5; \ + mul x15, x4, x6; \ + umulh x1, x4, x6; \ + subs x4, x4, x3; \ + cneg x4, x4, cc; \ + csetm x9, cc; \ + adds x15, x15, x0; \ + adc x1, x1, xzr; \ + subs x6, x5, x6; \ + cneg x6, x6, cc; \ + cinv x9, x9, cc; \ + mul x5, x4, x6; \ + umulh x6, x4, x6; \ + adds x0, x2, x15; \ + adcs x15, x15, x1; \ + adc x1, x1, xzr; \ + cmn x9, #0x1; \ + eor x5, x5, x9; \ + adcs x0, x5, x0; \ + eor x6, x6, x9; \ + adcs x15, x6, x15; \ + adc x1, x1, x9; \ + adds x9, x11, x7; \ + adcs x10, x12, x8; \ + adcs x11, x13, x11; \ + adcs x12, x14, x12; \ + adcs x13, x13, xzr; \ + adc x14, x14, xzr; \ + cmn x16, #0x1; \ + eor x2, x2, x16; \ + adcs x9, x2, x9; \ + eor x0, x0, x16; \ + adcs x10, x0, x10; \ + eor x15, x15, x16; \ + 
adcs x11, x15, x11; \ + eor x1, x1, x16; \ + adcs x12, x1, x12; \ + adcs x13, x13, x16; \ + adc x14, x14, x16; \ + mov x3, #0x26; \ + umull x4, w11, w3; \ + add x4, x4, w7, uxtw; \ + lsr x7, x7, #32; \ + lsr x11, x11, #32; \ + umaddl x11, w11, w3, x7; \ + mov x7, x4; \ + umull x4, w12, w3; \ + add x4, x4, w8, uxtw; \ + lsr x8, x8, #32; \ + lsr x12, x12, #32; \ + umaddl x12, w12, w3, x8; \ + mov x8, x4; \ + umull x4, w13, w3; \ + add x4, x4, w9, uxtw; \ + lsr x9, x9, #32; \ + lsr x13, x13, #32; \ + umaddl x13, w13, w3, x9; \ + mov x9, x4; \ + umull x4, w14, w3; \ + add x4, x4, w10, uxtw; \ + lsr x10, x10, #32; \ + lsr x14, x14, #32; \ + umaddl x14, w14, w3, x10; \ + mov x10, x4; \ + lsr x0, x14, #31; \ + mov x5, #0x13; \ + umull x5, w5, w0; \ + add x7, x7, x5; \ + adds x7, x7, x11, lsl #32; \ + extr x3, x12, x11, #32; \ + adcs x8, x8, x3; \ + extr x3, x13, x12, #32; \ + adcs x9, x9, x3; \ + extr x3, x14, x13, #32; \ + lsl x5, x0, #63; \ + eor x10, x10, x5; \ + adc x10, x10, x3; \ + stp x7, x8, [P0]; \ + stp x9, x10, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. + +#define sqr_4(P0,P1) \ + ldp x10, x11, [P1]; \ + ldp x12, x13, [P1+16]; \ + umull x2, w10, w10; \ + lsr x14, x10, #32; \ + umull x3, w14, w14; \ + umull x14, w10, w14; \ + adds x2, x2, x14, lsl #33; \ + lsr x14, x14, #31; \ + adc x3, x3, x14; \ + umull x4, w11, w11; \ + lsr x14, x11, #32; \ + umull x5, w14, w14; \ + umull x14, w11, w14; \ + mul x15, x10, x11; \ + umulh x16, x10, x11; \ + adds x4, x4, x14, lsl #33; \ + lsr x14, x14, #31; \ + adc x5, x5, x14; \ + adds x15, x15, x15; \ + adcs x16, x16, x16; \ + adc x5, x5, xzr; \ + adds x3, x3, x15; \ + adcs x4, x4, x16; \ + adc x5, x5, xzr; \ + umull x6, w12, w12; \ + lsr x14, x12, #32; \ + umull x7, w14, w14; \ + umull x14, w12, w14; \ + adds x6, x6, x14, lsl #33; \ + lsr x14, x14, #31; \ + adc x7, x7, x14; \ + umull x8, w13, w13; \ + lsr x14, x13, #32; \ + umull x9, w14, w14; \ + umull x14, w13, w14; \ + mul x15, x12, x13; \ + umulh x16, x12, x13; \ + adds x8, x8, x14, lsl #33; \ + lsr x14, x14, #31; \ + adc x9, x9, x14; \ + adds x15, x15, x15; \ + adcs x16, x16, x16; \ + adc x9, x9, xzr; \ + adds x7, x7, x15; \ + adcs x8, x8, x16; \ + adc x9, x9, xzr; \ + subs x10, x10, x12; \ + sbcs x11, x11, x13; \ + csetm x16, cc; \ + eor x10, x10, x16; \ + subs x10, x10, x16; \ + eor x11, x11, x16; \ + sbc x11, x11, x16; \ + adds x6, x6, x4; \ + adcs x7, x7, x5; \ + adcs x8, x8, xzr; \ + adc x9, x9, xzr; \ + umull x12, w10, w10; \ + lsr x5, x10, #32; \ + umull x13, w5, w5; \ + umull x5, w10, w5; \ + adds x12, x12, x5, lsl #33; \ + lsr x5, x5, #31; \ + adc x13, x13, x5; \ + umull x15, w11, w11; \ + lsr x5, x11, #32; \ + umull x14, w5, w5; \ + umull x5, w11, w5; \ + mul x4, x10, x11; \ + umulh x16, x10, x11; \ + adds x15, x15, x5, lsl #33; \ + lsr x5, x5, #31; \ + adc x14, x14, x5; \ + adds x4, x4, x4; \ + adcs x16, x16, x16; \ + adc x14, x14, xzr; \ + adds x13, x13, x4; \ + adcs x15, x15, x16; \ + adc x14, x14, xzr; \ + adds x4, x2, x6; \ + adcs x5, x3, x7; \ + adcs x6, x6, x8; \ + adcs x7, x7, x9; \ + csetm x16, cc; \ + subs x4, x4, x12; \ + sbcs x5, x5, x13; \ + sbcs x6, x6, x15; \ + sbcs x7, x7, x14; \ + adcs x8, x8, x16; \ + adc x9, x9, x16; \ + mov x10, #0x26; \ + umull x12, w6, w10; \ + add x12, x12, w2, uxtw; \ + lsr x2, x2, #32; \ + lsr x6, x6, #32; \ + umaddl x6, w6, w10, x2; \ + mov x2, x12; \ + umull x12, w7, w10; \ + add x12, x12, w3, uxtw; \ + lsr x3, x3, 
#32; \ + lsr x7, x7, #32; \ + umaddl x7, w7, w10, x3; \ + mov x3, x12; \ + umull x12, w8, w10; \ + add x12, x12, w4, uxtw; \ + lsr x4, x4, #32; \ + lsr x8, x8, #32; \ + umaddl x8, w8, w10, x4; \ + mov x4, x12; \ + umull x12, w9, w10; \ + add x12, x12, w5, uxtw; \ + lsr x5, x5, #32; \ + lsr x9, x9, #32; \ + umaddl x9, w9, w10, x5; \ + mov x5, x12; \ + lsr x13, x9, #31; \ + mov x11, #0x13; \ + umull x11, w11, w13; \ + add x2, x2, x11; \ + adds x2, x2, x6, lsl #32; \ + extr x10, x7, x6, #32; \ + adcs x3, x3, x10; \ + extr x10, x8, x7, #32; \ + adcs x4, x4, x10; \ + extr x10, x9, x8, #32; \ + lsl x11, x13, #63; \ + eor x5, x5, x11; \ + adc x5, x5, x10; \ + stp x2, x3, [P0]; \ + stp x4, x5, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + mov x4, #38; \ + csel x3, x4, xzr, lo; \ + subs x5, x5, x3; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbc x8, x8, xzr; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16] + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x7, x8, [P2]; \ + adds x3, x3, x7; \ + adcs x4, x4, x8; \ + ldp x5, x6, [P1+16]; \ + ldp x7, x8, [P2+16]; \ + adcs x5, x5, x7; \ + adcs x6, x6, x8; \ + mov x9, #38; \ + csel x9, x9, xzr, cs; \ + adds x3, x3, x9; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + adc x6, x6, xzr; \ + stp x3, x4, [P0]; \ + stp x5, x6, [P0+16] + +#define double_twice4(P0,P1) \ + ldp x3, x4, [P1]; \ + adds x3, x3, x3; \ + adcs x4, x4, x4; \ + ldp x5, x6, [P1+16]; \ + adcs x5, x5, x5; \ + adcs x6, x6, x6; \ + mov x9, #38; \ + csel x9, x9, xzr, cs; \ + adds x3, x3, x9; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + adc x6, x6, xzr; \ + stp x3, x4, [P0]; \ + stp x5, x6, [P0+16] + +// Load the constant k_25519 = 2 * d_25519 using immediate operations + +#define load_k25519(P0) \ + movz x0, #0xf159; \ + movz x1, #0xb156; \ + movz x2, #0xd130; \ + movz x3, #0xfce7; \ + movk x0, #0x26b2, lsl #16; \ + movk x1, #0x8283, lsl #16; \ + movk x2, #0xeef3, lsl #16; \ + movk x3, #0x56df, lsl #16; \ + movk x0, #0x9b94, lsl #32; \ + movk x1, #0x149a, lsl #32; \ + movk x2, #0x80f2, lsl #32; \ + movk x3, #0xd9dc, lsl #32; \ + movk x0, #0xebd6, lsl #48; \ + movk x1, #0x00e0, lsl #48; \ + movk x2, #0x198e, lsl #48; \ + movk x3, #0x2406, lsl #48; \ + stp x0, x1, [P0]; \ + stp x2, x3, [P0+16] + +S2N_BN_SYMBOL(edwards25519_scalarmuldouble): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + stp x25, x30, [sp, -16]! + sub sp, sp, #NSPACE + +// Move the output pointer to a stable place + + mov res, x0 + +// Copy scalars while recoding all 4-bit nybbles except the top +// one (bits 252..255) into signed 4-bit digits. This is essentially +// done just by adding the recoding constant 0x0888..888, after +// which all digits except the first have an implicit bias of -8, +// so 0 -> -8, 1 -> -7, ... 7 -> -1, 8 -> 0, 9 -> 1, ... 15 -> 7. 
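+// For example, a recoded nybble 0xb stands for the digit 0xb - 8 = 3,
+// while 0x0 stands for -8. Carries keep the value correct: a scalar
+// nybble 0xf becomes 0xf + 0x8 = 0x17, leaving recoded nybble 7
+// (digit -1) and a carry into the next position, matching 15 = -1 + 16.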
+// (We could literally create 2s complement signed nybbles by
+// XORing with the same constant 0x0888..888 afterwards, but it
+// doesn't seem to make the end usage any simpler.)
+//
+// In order to ensure that the unrecoded top nybble (bits 252..255)
+// does not become > 8 as a result of carries lower down from the
+// recoding, we first (conceptually) subtract 8 times the group order
+// (which leaves the scalar unchanged modulo the group order) iff the
+// top digit of the scalar is > 2^63. In the implementation the
+// reduction and recoding are combined by optionally using the
+// modified recoding constant 0x0888...888 + (2^256 - 8 * group_order).
+
+ movbig(x4,#0xc7f5, #0x6fb5, #0xa0d9, #0xe920)
+ movbig(x5,#0xe190, #0xb993, #0x70cb, #0xa1d5)
+ mov x7, #0x8888888888888888
+ sub x6, x7, #1
+ bic x8, x7, #0xF000000000000000
+
+ ldp x10, x11, [x3]
+ ldp x12, x13, [x3, #16]
+ mov x3, 0x8000000000000000
+ cmp x3, x13
+ csel x14, x7, x4, cs
+ csel x15, x7, x5, cs
+ csel x16, x7, x6, cs
+ csel x17, x8, x7, cs
+ adds x10, x10, x14
+ adcs x11, x11, x15
+ adcs x12, x12, x16
+ adc x13, x13, x17
+ stp x10, x11, [bscalar]
+ stp x12, x13, [bscalar+16]
+
+ ldp x10, x11, [x1]
+ ldp x12, x13, [x1, #16]
+ mov x3, 0x8000000000000000
+ cmp x3, x13
+ csel x14, x7, x4, cs
+ csel x15, x7, x5, cs
+ csel x16, x7, x6, cs
+ csel x17, x8, x7, cs
+ adds x10, x10, x14
+ adcs x11, x11, x15
+ adcs x12, x12, x16
+ adc x13, x13, x17
+ stp x10, x11, [scalar]
+ stp x12, x13, [scalar+16]
+
+// Create table of multiples 1..8 of the general input point at "tab".
+// Reduce the input coordinates x and y modulo 2^256 - 38 first, for the
+// sake of definiteness; this is the reduction that will be maintained.
+// We could slightly optimize the additions because we know the input
+// point is affine (so Z = 1), but it doesn't seem worth the complication.
+
+ ldp x10, x11, [x2]
+ ldp x12, x13, [x2, #16]
+ adds x14, x10, #38
+ adcs x15, x11, xzr
+ adcs x16, x12, xzr
+ adcs x17, x13, xzr
+ csel x10, x14, x10, cs
+ csel x11, x15, x11, cs
+ csel x12, x16, x12, cs
+ csel x13, x17, x13, cs
+ stp x10, x11, [tab]
+ stp x12, x13, [tab+16]
+
+ ldp x10, x11, [x2, #32]
+ ldp x12, x13, [x2, #48]
+ adds x14, x10, #38
+ adcs x15, x11, xzr
+ adcs x16, x12, xzr
+ adcs x17, x13, xzr
+ csel x10, x14, x10, cs
+ csel x11, x15, x11, cs
+ csel x12, x16, x12, cs
+ csel x13, x17, x13, cs
+ stp x10, x11, [tab+32]
+ stp x12, x13, [tab+48]
+
+ mov x1, #1
+ stp x1, xzr, [tab+64]
+ stp xzr, xzr, [tab+80]
+
+ add p0, tab+96
+ add p1, tab
+ add p2, tab+32
+ mul_4(x_0,x_1,x_2)
+
+// Multiple 2
+
+ add p0, tab+1*128
+ add p1, tab
+ bl edwards25519_scalarmuldouble_epdouble
+
+// Multiple 3
+
+ add p0, tab+2*128
+ add p1, tab
+ add p2, tab+1*128
+ bl edwards25519_scalarmuldouble_epadd
+
+// Multiple 4
+
+ add p0, tab+3*128
+ add p1, tab+1*128
+ bl edwards25519_scalarmuldouble_epdouble
+
+// Multiple 5
+
+ add p0, tab+4*128
+ add p1, tab
+ add p2, tab+3*128
+ bl edwards25519_scalarmuldouble_epadd
+
+// Multiple 6
+
+ add p0, tab+5*128
+ add p1, tab+2*128
+ bl edwards25519_scalarmuldouble_epdouble
+
+// Multiple 7
+
+ add p0, tab+6*128
+ add p1, tab
+ add p2, tab+5*128
+ bl edwards25519_scalarmuldouble_epadd
+
+// Multiple 8
+
+ add p0, tab+7*128
+ add p1, tab+3*128
+ bl edwards25519_scalarmuldouble_epdouble
+
+// Handle the initialization, starting the loop counter at i = 252
+// and initializing acc to the sum of the table entries for the
+// top nybbles of the scalars (the ones with no implicit -8 bias).
+
+ mov i, #252
+
+// Index for btable entry...
+ + ldr x0, [bscalar+24] + lsr bf, x0, #60 + +// ...and constant-time indexing based on that index + + adr x14, edwards25519_scalarmuldouble_table + + mov x0, #1 + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, #1 + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + + cmp bf, #1 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #2 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #3 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #4 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #5 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #6 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #7 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + 
csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #8 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + + stp x0, x1, [btabent] + stp x2, x3, [btabent+16] + stp x4, x5, [btabent+32] + stp x6, x7, [btabent+48] + stp x8, x9, [btabent+64] + stp x10, x11, [btabent+80] + +// Index for table entry... + + ldr x0, [scalar+24] + lsr bf, x0, #60 + +// ...and constant-time indexing based on that index + + add p0, tab + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, #1 + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, #1 + mov x9, xzr + mov x10, xzr + mov x11, xzr + mov x12, xzr + mov x13, xzr + mov x14, xzr + mov x15, xzr + + cmp bf, #1 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #2 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #3 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #4 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #5 + ldp 
x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #6 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #7 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #8 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + stp x12, x13, [tabent+96] + stp x14, x15, [tabent+112] + +// Add those elements to initialize the accumulator for bit position 252 + + add p0, acc + add p1, tabent + add p2, btabent + bl edwards25519_scalarmuldouble_pepadd + +// Main loop with acc = [scalar/2^i] * point + [bscalar/2^i] * basepoint +// Start with i = 252 for bits 248..251 and go down four at a time to 3..0 + +edwards25519_scalarmuldouble_loop: + + sub i, i, #4 + +// Double to acc' = 2 * acc + + add p0, acc + add p1, acc + bl edwards25519_scalarmuldouble_pdouble + +// Get btable entry, first getting the adjusted bitfield... + + lsr x0, i, #6 + add x1, bscalar + ldr x2, [x1, x0, lsl #3] + lsr x3, x2, i + and x0, x3, #15 + subs bf, x0, #8 + cneg bf, bf, cc + csetm cf, cc + +// ... then doing constant-time lookup with the appropriate index... 
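+// Concretely, a recoded nybble v in 0..15 stands for the digit v - 8,
+// so the table index used is bf = |v - 8| in 0..8, with the mask
+// cf = -1 exactly when the digit is negative. For example v = 5 gives
+// bf = 3 with cf set, selecting the multiple 3 and negating it afterwards.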
+ + adr x14, edwards25519_scalarmuldouble_table + + mov x0, #1 + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, #1 + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + + cmp bf, #1 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #2 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #3 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #4 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #5 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #6 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #7 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #8 + ldp x12, x13, 
[x14]
+ csel x0, x0, x12, ne
+ csel x1, x1, x13, ne
+ ldp x12, x13, [x14, #16]
+ csel x2, x2, x12, ne
+ csel x3, x3, x13, ne
+ ldp x12, x13, [x14, #32]
+ csel x4, x4, x12, ne
+ csel x5, x5, x13, ne
+ ldp x12, x13, [x14, #48]
+ csel x6, x6, x12, ne
+ csel x7, x7, x13, ne
+ ldp x12, x13, [x14, #64]
+ csel x8, x8, x12, ne
+ csel x9, x9, x13, ne
+ ldp x12, x13, [x14, #80]
+ csel x10, x10, x12, ne
+ csel x11, x11, x13, ne
+
+// ... then optionally negating before storing. The table entry
+// is in precomputed form and we currently have
+//
+// [x3;x2;x1;x0] = y - x
+// [x7;x6;x5;x4] = x + y
+// [x11;x10;x9;x8] = 2 * d * x * y
+//
+// Negation for Edwards curves is -(x,y) = (-x,y), which in this modified
+// form amounts to swapping the first two fields and negating the third.
+// The negation does not always fully reduce even mod 2^256-38 in the zero
+// case, instead giving -0 = 2^256-38. But that is fine since the result is
+// always fed to a multiplication inside the "pepadd" function below that
+// handles any 256-bit input.
+
+ cmp cf, xzr
+
+ csel x12, x0, x4, eq
+ csel x4, x0, x4, ne
+ csel x13, x1, x5, eq
+ csel x5, x1, x5, ne
+ csel x14, x2, x6, eq
+ csel x6, x2, x6, ne
+ csel x15, x3, x7, eq
+ csel x7, x3, x7, ne
+
+ eor x8, x8, cf
+ eor x9, x9, cf
+ eor x10, x10, cf
+ eor x11, x11, cf
+ mov x0, #37
+ and x0, x0, cf
+ subs x8, x8, x0
+ sbcs x9, x9, xzr
+ sbcs x10, x10, xzr
+ sbc x11, x11, xzr
+
+ stp x12, x13, [btabent]
+ stp x14, x15, [btabent+16]
+ stp x4, x5, [btabent+32]
+ stp x6, x7, [btabent+48]
+ stp x8, x9, [btabent+64]
+ stp x10, x11, [btabent+80]
+
+// Get table entry, first getting the adjusted bitfield...
+
+ lsr x0, i, #6
+ ldr x1, [sp, x0, lsl #3]
+ lsr x2, x1, i
+ and x0, x2, #15
+ subs bf, x0, #8
+ cneg bf, bf, cc
+ csetm cf, cc
+
+// ...
then getting the unadjusted table entry + + add p0, tab + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, #1 + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, #1 + mov x9, xzr + mov x10, xzr + mov x11, xzr + mov x12, xzr + mov x13, xzr + mov x14, xzr + mov x15, xzr + + cmp bf, #1 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #2 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #3 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #4 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #5 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #6 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, 
#32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #7 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #8 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + +// ... then optionally negating before storing. This time the table +// entry is extended-projective, and is in registers thus: +// +// [x3;x2;x1;x0] = X +// [x7;x6;x5;x4] = Y +// [x11;x10;x9;x8] = Z +// [x15;x14;x13;x12] = W +// +// This time we just need to negate the X and the W fields. +// The crude way negation is done can result in values of X or W +// (when initially zero before negation) being exactly equal to +// 2^256-38, but the "pepadd" function handles that correctly. 
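+// As a small sketch of the arithmetic below: modulo 2^256 - 38 we have
+// -X = (2^256 - 38) - X = (2^256 - 1 - X) - 37, and 2^256 - 1 - X is the
+// bitwise complement of X. So with the mask cf (all 1s to negate, all 0s
+// otherwise) the negation is computed as (X XOR cf) - (37 AND cf).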
+ + eor x0, x0, cf + eor x1, x1, cf + eor x2, x2, cf + eor x3, x3, cf + mov x16, #37 + and x16, x16, cf + subs x0, x0, x16 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x3, x3, xzr + + eor x12, x12, cf + eor x13, x13, cf + eor x14, x14, cf + eor x15, x15, cf + subs x12, x12, x16 + sbcs x13, x13, xzr + sbcs x14, x14, xzr + sbc x15, x15, xzr + + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + stp x12, x13, [tabent+96] + stp x14, x15, [tabent+112] + +// Double to acc' = 4 * acc + + add p0, acc + add p1, acc + bl edwards25519_scalarmuldouble_pdouble + +// Add tabent := tabent + btabent + + add p0, tabent + add p1, tabent + add p2, btabent + bl edwards25519_scalarmuldouble_pepadd + +// Double to acc' = 8 * acc + + add p0, acc + add p1, acc + bl edwards25519_scalarmuldouble_pdouble + +// Double to acc' = 16 * acc + + add p0, acc + add p1, acc + bl edwards25519_scalarmuldouble_epdouble + +// Add table entry, acc := acc + tabent + + add p0, acc + add p1, acc + add p2, tabent + bl edwards25519_scalarmuldouble_epadd + +// Loop down + + cbnz i, edwards25519_scalarmuldouble_loop + +// Modular inverse setup + + add x0, tabent + add x1, acc+64 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, acc, tabent. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b edwards25519_scalarmuldouble_invmidloop +edwards25519_scalarmuldouble_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, 
x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, 
x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +edwards25519_scalarmuldouble_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst 
x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, 
ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul 
x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne edwards25519_scalarmuldouble_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 
+ and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] + +// Store result. Note that these are the only reductions mod 2^255-19 + + mov p0, res + add p1, acc + add p2, tabent + mul_p25519(x_0,x_1,x_2) + + add p0, res, #32 + add p1, acc+32 + add p2, tabent + mul_p25519(x_0,x_1,x_2) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x25, x30, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +// **************************************************************************** +// Localized versions of subroutines. +// These are close to the standalone functions "edwards25519_epdouble" etc., +// but are only maintaining reduction modulo 2^256 - 38, not 2^255 - 19. +// **************************************************************************** + +edwards25519_scalarmuldouble_epdouble: + sub sp, sp, #(5*NUMSIZE) + add_twice4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_4(y_0,t2,t4) + mul_4(z_0,t3,t2) + mul_4(w_0,t1,t4) + mul_4(x_0,t1,t3) + add sp, sp, #(5*NUMSIZE) + ret + +edwards25519_scalarmuldouble_pdouble: + sub sp, sp, #(5*NUMSIZE) + add_twice4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_4(y_0,t2,t4) + mul_4(z_0,t3,t2) + mul_4(x_0,t1,t3) + add sp, sp, #(5*NUMSIZE) + ret + +edwards25519_scalarmuldouble_epadd: + sub sp, sp, #(6*NUMSIZE) + mul_4(t0,w_1,w_2) + sub_twice4(t1,y_1,x_1) + sub_twice4(t2,y_2,x_2) + add_twice4(t3,y_1,x_1) + add_twice4(t4,y_2,x_2) + double_twice4(t5,z_2) + mul_4(t1,t1,t2) + mul_4(t3,t3,t4) + load_k25519(t2) + mul_4(t2,t2,t0) + mul_4(t4,z_1,t5) + sub_twice4(t0,t3,t1) + add_twice4(t5,t3,t1) + sub_twice4(t1,t4,t2) + add_twice4(t3,t4,t2) + mul_4(w_0,t0,t5) + mul_4(x_0,t0,t1) + mul_4(y_0,t3,t5) + mul_4(z_0,t1,t3) + add sp, sp, #(6*NUMSIZE) + ret + +edwards25519_scalarmuldouble_pepadd: + sub sp, sp, #(6*NUMSIZE) + double_twice4(t0,z_1); + sub_twice4(t1,y_1,x_1); + add_twice4(t2,y_1,x_1); + mul_4(t3,w_1,z_2); + mul_4(t1,t1,x_2); + mul_4(t2,t2,y_2); + sub_twice4(t4,t0,t3); + add_twice4(t0,t0,t3); + sub_twice4(t5,t2,t1); + add_twice4(t1,t2,t1); + mul_4(z_0,t4,t0); + mul_4(x_0,t5,t4); + mul_4(y_0,t0,t1); + mul_4(w_0,t5,t1); + add sp, sp, #(6*NUMSIZE) + ret + +// **************************************************************************** +// The precomputed data (all read-only). This is currently part of the same +// text section, which gives position-independent code with simple PC-relative +// addressing. However it could be put in a separate section via something like +// +// .section .rodata +// **************************************************************************** + +// Precomputed table of multiples of generator for edwards25519 +// all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. 
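+// That is, each multiple P = (x,y) of the basepoint is stored as the three
+// field elements y - x, x + y and 2 * d * x * y (mod 2^255 - 19), the
+// precomputed input form consumed by "pepadd" above; the implicit Z
+// coordinate is 1 and so is not stored.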
+ +edwards25519_scalarmuldouble_table: + + // 1 * G + + .quad 0x9d103905d740913e + .quad 0xfd399f05d140beb3 + .quad 0xa5c18434688f8a09 + .quad 0x44fd2f9298f81267 + .quad 0x2fbc93c6f58c3b85 + .quad 0xcf932dc6fb8c0e19 + .quad 0x270b4898643d42c2 + .quad 0x07cf9d3a33d4ba65 + .quad 0xabc91205877aaa68 + .quad 0x26d9e823ccaac49e + .quad 0x5a1b7dcbdd43598c + .quad 0x6f117b689f0c65a8 + + // 2 * G + + .quad 0x8a99a56042b4d5a8 + .quad 0x8f2b810c4e60acf6 + .quad 0xe09e236bb16e37aa + .quad 0x6bb595a669c92555 + .quad 0x9224e7fc933c71d7 + .quad 0x9f469d967a0ff5b5 + .quad 0x5aa69a65e1d60702 + .quad 0x590c063fa87d2e2e + .quad 0x43faa8b3a59b7a5f + .quad 0x36c16bdd5d9acf78 + .quad 0x500fa0840b3d6a31 + .quad 0x701af5b13ea50b73 + + // 3 * G + + .quad 0x56611fe8a4fcd265 + .quad 0x3bd353fde5c1ba7d + .quad 0x8131f31a214bd6bd + .quad 0x2ab91587555bda62 + .quad 0xaf25b0a84cee9730 + .quad 0x025a8430e8864b8a + .quad 0xc11b50029f016732 + .quad 0x7a164e1b9a80f8f4 + .quad 0x14ae933f0dd0d889 + .quad 0x589423221c35da62 + .quad 0xd170e5458cf2db4c + .quad 0x5a2826af12b9b4c6 + + // 4 * G + + .quad 0x95fe050a056818bf + .quad 0x327e89715660faa9 + .quad 0xc3e8e3cd06a05073 + .quad 0x27933f4c7445a49a + .quad 0x287351b98efc099f + .quad 0x6765c6f47dfd2538 + .quad 0xca348d3dfb0a9265 + .quad 0x680e910321e58727 + .quad 0x5a13fbe9c476ff09 + .quad 0x6e9e39457b5cc172 + .quad 0x5ddbdcf9102b4494 + .quad 0x7f9d0cbf63553e2b + + // 5 * G + + .quad 0x7f9182c3a447d6ba + .quad 0xd50014d14b2729b7 + .quad 0xe33cf11cb864a087 + .quad 0x154a7e73eb1b55f3 + .quad 0xa212bc4408a5bb33 + .quad 0x8d5048c3c75eed02 + .quad 0xdd1beb0c5abfec44 + .quad 0x2945ccf146e206eb + .quad 0xbcbbdbf1812a8285 + .quad 0x270e0807d0bdd1fc + .quad 0xb41b670b1bbda72d + .quad 0x43aabe696b3bb69a + + // 6 * G + + .quad 0x499806b67b7d8ca4 + .quad 0x575be28427d22739 + .quad 0xbb085ce7204553b9 + .quad 0x38b64c41ae417884 + .quad 0x3a0ceeeb77157131 + .quad 0x9b27158900c8af88 + .quad 0x8065b668da59a736 + .quad 0x51e57bb6a2cc38bd + .quad 0x85ac326702ea4b71 + .quad 0xbe70e00341a1bb01 + .quad 0x53e4a24b083bc144 + .quad 0x10b8e91a9f0d61e3 + + // 7 * G + + .quad 0xba6f2c9aaa3221b1 + .quad 0x6ca021533bba23a7 + .quad 0x9dea764f92192c3a + .quad 0x1d6edd5d2e5317e0 + .quad 0x6b1a5cd0944ea3bf + .quad 0x7470353ab39dc0d2 + .quad 0x71b2528228542e49 + .quad 0x461bea69283c927e + .quad 0xf1836dc801b8b3a2 + .quad 0xb3035f47053ea49a + .quad 0x529c41ba5877adf3 + .quad 0x7a9fbb1c6a0f90a7 + + // 8 * G + + .quad 0xe2a75dedf39234d9 + .quad 0x963d7680e1b558f9 + .quad 0x2c2741ac6e3c23fb + .quad 0x3a9024a1320e01c3 + .quad 0x59b7596604dd3e8f + .quad 0x6cb30377e288702c + .quad 0xb1339c665ed9c323 + .quad 0x0915e76061bce52f + .quad 0xe7c1f5d9c9a2911a + .quad 0xb8a371788bcca7d7 + .quad 0x636412190eb62a32 + .quad 0x26907c5c2ecc4e95 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/curve25519/edwards25519_scalarmuldouble_alt.S b/arm/curve25519/edwards25519_scalarmuldouble_alt.S new file mode 100644 index 0000000000..ad05eae1fb --- /dev/null +++ b/arm/curve25519/edwards25519_scalarmuldouble_alt.S @@ -0,0 +1,2941 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Double scalar multiplication for edwards25519, fresh and base point +// Input scalar[4], point[8], bscalar[4]; output res[8] +// +// extern void edwards25519_scalarmuldouble_alt +// (uint64_t res[static 8],uint64_t scalar[static 4], +// uint64_t point[static 8],uint64_t bscalar[static 4]); +// +// Given scalar = n, point = P and bscalar = m, returns in res +// the point (X,Y) = n * P + m * B where B = (...,4/5) is +// the standard basepoint for the edwards25519 (Ed25519) curve. +// +// Both 256-bit coordinates of the input point P are implicitly +// reduced modulo 2^255-19 if they are not already in reduced form, +// but the conventional usage is that they *are* already reduced. +// The scalars can be arbitrary 256-bit numbers but may also be +// considered as implicitly reduced modulo the group order. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point, X3 = bscalar +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_scalarmuldouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_scalarmuldouble_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable home for the input result argument during the whole body + +#define res x25 + +// Additional pointer variables for local subroutines + +#define p0 x22 +#define p1 x23 +#define p2 x24 + +// Other variables that are only needed prior to the modular inverse. + +#define i x19 +#define bf x20 +#define cf x21 + +// Pointer-offset pairs for result and temporaries on stack with some aliasing. + +#define resx res, #(0*NUMSIZE) +#define resy res, #(1*NUMSIZE) + +#define scalar sp, #(0*NUMSIZE) +#define bscalar sp, #(1*NUMSIZE) + +#define btabent sp, #(2*NUMSIZE) +#define acc sp, #(5*NUMSIZE) +#define acc_x sp, #(5*NUMSIZE) +#define acc_y sp, #(6*NUMSIZE) +#define acc_z sp, #(7*NUMSIZE) +#define acc_w sp, #(8*NUMSIZE) + +#define tabent sp, #(9*NUMSIZE) + +#define tab sp, #(13*NUMSIZE) + +// Total size to reserve on the stack (excluding local subroutines) + +#define NSPACE (45*NUMSIZE) + +// Sub-references used in local subroutines with local stack + +#define x_0 p0, #0 +#define y_0 p0, #NUMSIZE +#define z_0 p0, #(2*NUMSIZE) +#define w_0 p0, #(3*NUMSIZE) + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) +#define w_1 p1, #(3*NUMSIZE) + +#define x_2 p2, #0 +#define y_2 p2, #NUMSIZE +#define z_2 p2, #(2*NUMSIZE) +#define w_2 p2, #(3*NUMSIZE) + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) +#define t5 sp, #(5*NUMSIZE) + +// Load 64-bit immediate into a register + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0; \ + movk nn, n1, lsl #16; \ + movk nn, n2, lsl #32; \ + movk nn, n3, lsl #48 + +// Macro wrapping up the basic field operation bignum_mul_p25519_alt, only +// trivially different from a pure function call to that subroutine. 
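+// The final reduction in this macro and the ones below relies on the
+// identity 2^256 = 2 * p_25519 + 38: a 512-bit product h * 2^256 + l is
+// congruent to 38 * h + l (mod p_25519), which is where the multiplier
+// 0x26 = 38 used to fold the high half into the low half comes from.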
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x7, x8, [P2]; \ + mul x12, x3, x7; \ + umulh x13, x3, x7; \ + mul x11, x3, x8; \ + umulh x14, x3, x8; \ + adds x13, x13, x11; \ + ldp x9, x10, [P2+16]; \ + mul x11, x3, x9; \ + umulh x15, x3, x9; \ + adcs x14, x14, x11; \ + mul x11, x3, x10; \ + umulh x16, x3, x10; \ + adcs x15, x15, x11; \ + adc x16, x16, xzr; \ + ldp x5, x6, [P1+16]; \ + mul x11, x4, x7; \ + adds x13, x13, x11; \ + mul x11, x4, x8; \ + adcs x14, x14, x11; \ + mul x11, x4, x9; \ + adcs x15, x15, x11; \ + mul x11, x4, x10; \ + adcs x16, x16, x11; \ + umulh x3, x4, x10; \ + adc x3, x3, xzr; \ + umulh x11, x4, x7; \ + adds x14, x14, x11; \ + umulh x11, x4, x8; \ + adcs x15, x15, x11; \ + umulh x11, x4, x9; \ + adcs x16, x16, x11; \ + adc x3, x3, xzr; \ + mul x11, x5, x7; \ + adds x14, x14, x11; \ + mul x11, x5, x8; \ + adcs x15, x15, x11; \ + mul x11, x5, x9; \ + adcs x16, x16, x11; \ + mul x11, x5, x10; \ + adcs x3, x3, x11; \ + umulh x4, x5, x10; \ + adc x4, x4, xzr; \ + umulh x11, x5, x7; \ + adds x15, x15, x11; \ + umulh x11, x5, x8; \ + adcs x16, x16, x11; \ + umulh x11, x5, x9; \ + adcs x3, x3, x11; \ + adc x4, x4, xzr; \ + mul x11, x6, x7; \ + adds x15, x15, x11; \ + mul x11, x6, x8; \ + adcs x16, x16, x11; \ + mul x11, x6, x9; \ + adcs x3, x3, x11; \ + mul x11, x6, x10; \ + adcs x4, x4, x11; \ + umulh x5, x6, x10; \ + adc x5, x5, xzr; \ + umulh x11, x6, x7; \ + adds x16, x16, x11; \ + umulh x11, x6, x8; \ + adcs x3, x3, x11; \ + umulh x11, x6, x9; \ + adcs x4, x4, x11; \ + adc x5, x5, xzr; \ + mov x7, #0x26; \ + mul x11, x7, x16; \ + umulh x9, x7, x16; \ + adds x12, x12, x11; \ + mul x11, x7, x3; \ + umulh x3, x7, x3; \ + adcs x13, x13, x11; \ + mul x11, x7, x4; \ + umulh x4, x7, x4; \ + adcs x14, x14, x11; \ + mul x11, x7, x5; \ + umulh x5, x7, x5; \ + adcs x15, x15, x11; \ + cset x16, cs; \ + adds x15, x15, x4; \ + adc x16, x16, x5; \ + cmn x15, x15; \ + orr x15, x15, #0x8000000000000000; \ + adc x8, x16, x16; \ + mov x7, #0x13; \ + madd x11, x7, x8, x7; \ + adds x12, x12, x11; \ + adcs x13, x13, x9; \ + adcs x14, x14, x3; \ + adcs x15, x15, xzr; \ + csel x7, x7, xzr, cc; \ + subs x12, x12, x7; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbc x15, x15, xzr; \ + and x15, x15, #0x7fffffffffffffff; \ + stp x12, x13, [P0]; \ + stp x14, x15, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
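+//
+// Leaving results only partially reduced like this is sound for the
+// intermediate computations: a mul_4 output still fits in 4 digits and is
+// congruent to the true product modulo p_25519, and as noted near the end
+// of the code, full reduction modulo 2^255 - 19 only happens in the final
+// mul_p25519 calls that produce the result.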
+ +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x7, x8, [P2]; \ + mul x12, x3, x7; \ + umulh x13, x3, x7; \ + mul x11, x3, x8; \ + umulh x14, x3, x8; \ + adds x13, x13, x11; \ + ldp x9, x10, [P2+16]; \ + mul x11, x3, x9; \ + umulh x15, x3, x9; \ + adcs x14, x14, x11; \ + mul x11, x3, x10; \ + umulh x16, x3, x10; \ + adcs x15, x15, x11; \ + adc x16, x16, xzr; \ + ldp x5, x6, [P1+16]; \ + mul x11, x4, x7; \ + adds x13, x13, x11; \ + mul x11, x4, x8; \ + adcs x14, x14, x11; \ + mul x11, x4, x9; \ + adcs x15, x15, x11; \ + mul x11, x4, x10; \ + adcs x16, x16, x11; \ + umulh x3, x4, x10; \ + adc x3, x3, xzr; \ + umulh x11, x4, x7; \ + adds x14, x14, x11; \ + umulh x11, x4, x8; \ + adcs x15, x15, x11; \ + umulh x11, x4, x9; \ + adcs x16, x16, x11; \ + adc x3, x3, xzr; \ + mul x11, x5, x7; \ + adds x14, x14, x11; \ + mul x11, x5, x8; \ + adcs x15, x15, x11; \ + mul x11, x5, x9; \ + adcs x16, x16, x11; \ + mul x11, x5, x10; \ + adcs x3, x3, x11; \ + umulh x4, x5, x10; \ + adc x4, x4, xzr; \ + umulh x11, x5, x7; \ + adds x15, x15, x11; \ + umulh x11, x5, x8; \ + adcs x16, x16, x11; \ + umulh x11, x5, x9; \ + adcs x3, x3, x11; \ + adc x4, x4, xzr; \ + mul x11, x6, x7; \ + adds x15, x15, x11; \ + mul x11, x6, x8; \ + adcs x16, x16, x11; \ + mul x11, x6, x9; \ + adcs x3, x3, x11; \ + mul x11, x6, x10; \ + adcs x4, x4, x11; \ + umulh x5, x6, x10; \ + adc x5, x5, xzr; \ + umulh x11, x6, x7; \ + adds x16, x16, x11; \ + umulh x11, x6, x8; \ + adcs x3, x3, x11; \ + umulh x11, x6, x9; \ + adcs x4, x4, x11; \ + adc x5, x5, xzr; \ + mov x7, #0x26; \ + mul x11, x7, x16; \ + umulh x9, x7, x16; \ + adds x12, x12, x11; \ + mul x11, x7, x3; \ + umulh x3, x7, x3; \ + adcs x13, x13, x11; \ + mul x11, x7, x4; \ + umulh x4, x7, x4; \ + adcs x14, x14, x11; \ + mul x11, x7, x5; \ + umulh x5, x7, x5; \ + adcs x15, x15, x11; \ + cset x16, cs; \ + adds x15, x15, x4; \ + adc x16, x16, x5; \ + cmn x15, x15; \ + bic x15, x15, #0x8000000000000000; \ + adc x8, x16, x16; \ + mov x7, #0x13; \ + mul x11, x7, x8; \ + adds x12, x12, x11; \ + adcs x13, x13, x9; \ + adcs x14, x14, x3; \ + adc x15, x15, xzr; \ + stp x12, x13, [P0]; \ + stp x14, x15, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
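+//
+// (The cross products are computed once and doubled in the carry chain
+// before the diagonal squares are added in; the top half is then folded
+// with the same 38 and 19 multipliers as in mul_4 above.)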
+
+#define sqr_4(P0,P1) \
+        ldp x2, x3, [P1]; \
+        mul x9, x2, x3; \
+        umulh x10, x2, x3; \
+        ldp x4, x5, [P1+16]; \
+        mul x11, x2, x5; \
+        umulh x12, x2, x5; \
+        mul x7, x2, x4; \
+        umulh x6, x2, x4; \
+        adds x10, x10, x7; \
+        adcs x11, x11, x6; \
+        mul x7, x3, x4; \
+        umulh x6, x3, x4; \
+        adc x6, x6, xzr; \
+        adds x11, x11, x7; \
+        mul x13, x4, x5; \
+        umulh x14, x4, x5; \
+        adcs x12, x12, x6; \
+        mul x7, x3, x5; \
+        umulh x6, x3, x5; \
+        adc x6, x6, xzr; \
+        adds x12, x12, x7; \
+        adcs x13, x13, x6; \
+        adc x14, x14, xzr; \
+        adds x9, x9, x9; \
+        adcs x10, x10, x10; \
+        adcs x11, x11, x11; \
+        adcs x12, x12, x12; \
+        adcs x13, x13, x13; \
+        adcs x14, x14, x14; \
+        cset x6, cs; \
+        umulh x7, x2, x2; \
+        mul x8, x2, x2; \
+        adds x9, x9, x7; \
+        mul x7, x3, x3; \
+        adcs x10, x10, x7; \
+        umulh x7, x3, x3; \
+        adcs x11, x11, x7; \
+        mul x7, x4, x4; \
+        adcs x12, x12, x7; \
+        umulh x7, x4, x4; \
+        adcs x13, x13, x7; \
+        mul x7, x5, x5; \
+        adcs x14, x14, x7; \
+        umulh x7, x5, x5; \
+        adc x6, x6, x7; \
+        mov x3, #0x26; \
+        mul x7, x3, x12; \
+        umulh x4, x3, x12; \
+        adds x8, x8, x7; \
+        mul x7, x3, x13; \
+        umulh x13, x3, x13; \
+        adcs x9, x9, x7; \
+        mul x7, x3, x14; \
+        umulh x14, x3, x14; \
+        adcs x10, x10, x7; \
+        mul x7, x3, x6; \
+        umulh x6, x3, x6; \
+        adcs x11, x11, x7; \
+        cset x12, cs; \
+        adds x11, x11, x14; \
+        adc x12, x12, x6; \
+        cmn x11, x11; \
+        bic x11, x11, #0x8000000000000000; \
+        adc x2, x12, x12; \
+        mov x3, #0x13; \
+        mul x7, x3, x2; \
+        adds x8, x8, x7; \
+        adcs x9, x9, x4; \
+        adcs x10, x10, x13; \
+        adc x11, x11, xzr; \
+        stp x8, x9, [P0]; \
+        stp x10, x11, [P0+16]
+
+// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38
+
+#define sub_twice4(P0,P1,P2) \
+        ldp x5, x6, [P1]; \
+        ldp x4, x3, [P2]; \
+        subs x5, x5, x4; \
+        sbcs x6, x6, x3; \
+        ldp x7, x8, [P1+16]; \
+        ldp x4, x3, [P2+16]; \
+        sbcs x7, x7, x4; \
+        sbcs x8, x8, x3; \
+        mov x4, #38; \
+        csel x3, x4, xzr, lo; \
+        subs x5, x5, x3; \
+        sbcs x6, x6, xzr; \
+        sbcs x7, x7, xzr; \
+        sbc x8, x8, xzr; \
+        stp x5, x6, [P0]; \
+        stp x7, x8, [P0+16]
+
+// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38.
+// This only ensures that the result fits in 4 digits, not that it is reduced
+// even w.r.t. the double modulus. The result is always correct modulo
+// 2 * p_25519 provided the sum of the inputs is < 2^256 + 2^256 - 38, so in
+// particular provided at least one of them is reduced modulo the double
+// modulus.
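+// Concretely, if one input is < 2 * p_25519 = 2^256 - 38 and the other is
+// any 4-digit value, the raw sum s is < 2^257 - 38; whenever s overflows
+// 2^256, the masked addition of 38 below replaces it by s - 2^256 + 38 =
+// s - 2 * p_25519, which is congruent and fits back in 4 digits.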
+ +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x7, x8, [P2]; \ + adds x3, x3, x7; \ + adcs x4, x4, x8; \ + ldp x5, x6, [P1+16]; \ + ldp x7, x8, [P2+16]; \ + adcs x5, x5, x7; \ + adcs x6, x6, x8; \ + mov x9, #38; \ + csel x9, x9, xzr, cs; \ + adds x3, x3, x9; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + adc x6, x6, xzr; \ + stp x3, x4, [P0]; \ + stp x5, x6, [P0+16] + +#define double_twice4(P0,P1) \ + ldp x3, x4, [P1]; \ + adds x3, x3, x3; \ + adcs x4, x4, x4; \ + ldp x5, x6, [P1+16]; \ + adcs x5, x5, x5; \ + adcs x6, x6, x6; \ + mov x9, #38; \ + csel x9, x9, xzr, cs; \ + adds x3, x3, x9; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + adc x6, x6, xzr; \ + stp x3, x4, [P0]; \ + stp x5, x6, [P0+16] + +// Load the constant k_25519 = 2 * d_25519 using immediate operations + +#define load_k25519(P0) \ + movz x0, #0xf159; \ + movz x1, #0xb156; \ + movz x2, #0xd130; \ + movz x3, #0xfce7; \ + movk x0, #0x26b2, lsl #16; \ + movk x1, #0x8283, lsl #16; \ + movk x2, #0xeef3, lsl #16; \ + movk x3, #0x56df, lsl #16; \ + movk x0, #0x9b94, lsl #32; \ + movk x1, #0x149a, lsl #32; \ + movk x2, #0x80f2, lsl #32; \ + movk x3, #0xd9dc, lsl #32; \ + movk x0, #0xebd6, lsl #48; \ + movk x1, #0x00e0, lsl #48; \ + movk x2, #0x198e, lsl #48; \ + movk x3, #0x2406, lsl #48; \ + stp x0, x1, [P0]; \ + stp x2, x3, [P0+16] + +S2N_BN_SYMBOL(edwards25519_scalarmuldouble_alt): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + stp x25, x30, [sp, -16]! + sub sp, sp, #NSPACE + +// Move the output pointer to a stable place + + mov res, x0 + +// Copy scalars while recoding all 4-bit nybbles except the top +// one (bits 252..255) into signed 4-bit digits. This is essentially +// done just by adding the recoding constant 0x0888..888, after +// which all digits except the first have an implicit bias of -8, +// so 0 -> -8, 1 -> -7, ... 7 -> -1, 8 -> 0, 9 -> 1, ... 15 -> 7. +// (We could literally create 2s complement signed nybbles by +// XORing with the same constant 0x0888..888 afterwards, but it +// doesn't seem to make the end usage any simpler.) +// +// In order to ensure that the unrecoded top nybble (bits 252..255) +// does not become > 8 as a result of carries lower down from the +// recoding, we first (conceptually) subtract the group order iff +// the top digit of the scalar is > 2^63. In the implementation the +// reduction and recoding are combined by optionally using the +// modified recoding constant 0x0888...888 + (2^256 - group_order). + + movbig(x4,#0xc7f5, #0x6fb5, #0xa0d9, #0xe920) + movbig(x5,#0xe190, #0xb993, #0x70cb, #0xa1d5) + mov x7, #0x8888888888888888 + sub x6, x7, #1 + bic x8, x7, #0xF000000000000000 + + ldp x10, x11, [x3] + ldp x12, x13, [x3, #16] + mov x3, 0x8000000000000000 + cmp x3, x13 + csel x14, x7, x4, cs + csel x15, x7, x5, cs + csel x16, x7, x6, cs + csel x17, x8, x7, cs + adds x10, x10, x14 + adcs x11, x11, x15 + adcs x12, x12, x16 + adc x13, x13, x17 + stp x10, x11, [bscalar] + stp x12, x13, [bscalar+16] + + ldp x10, x11, [x1] + ldp x12, x13, [x1, #16] + mov x3, 0x8000000000000000 + cmp x3, x13 + csel x14, x7, x4, cs + csel x15, x7, x5, cs + csel x16, x7, x6, cs + csel x17, x8, x7, cs + adds x10, x10, x14 + adcs x11, x11, x15 + adcs x12, x12, x16 + adc x13, x13, x17 + stp x10, x11, [scalar] + stp x12, x13, [scalar+16] + +// Create table of multiples 1..8 of the general input point at "tab". 
+// Reduce the input coordinates x and y modulo 2^256 - 38 first, for the +// sake of definiteness; this is the reduction that will be maintained. +// We could slightly optimize the additions because we know the input +// point is affine (so Z = 1), but it doesn't seem worth the complication. + + ldp x10, x11, [x2] + ldp x12, x13, [x2, #16] + adds x14, x10, #38 + adcs x15, x11, xzr + adcs x16, x12, xzr + adcs x17, x13, xzr + csel x10, x14, x10, cs + csel x11, x15, x11, cs + csel x12, x16, x12, cs + csel x13, x17, x13, cs + stp x10, x11, [tab] + stp x12, x13, [tab+16] + + ldp x10, x11, [x2, #32] + ldp x12, x13, [x2, #48] + adds x14, x10, #38 + adcs x15, x11, xzr + adcs x16, x12, xzr + adcs x17, x13, xzr + csel x10, x14, x10, cs + csel x11, x15, x11, cs + csel x12, x16, x12, cs + csel x13, x17, x13, cs + stp x10, x11, [tab+32] + stp x12, x13, [tab+48] + + mov x1, #1 + stp x1, xzr, [tab+64] + stp xzr, xzr, [tab+80] + + add p0, tab+96 + add p1, tab + add p2, tab+32 + mul_4(x_0,x_1,x_2) + +// Multiple 2 + + add p0, tab+1*128 + add p1, tab + bl edwards25519_scalarmuldouble_alt_epdouble + +// Multiple 3 + + add p0, tab+2*128 + add p1, tab + add p2, tab+1*128 + bl edwards25519_scalarmuldouble_alt_epadd + +// Multiple 4 + + add p0, tab+3*128 + add p1, tab+1*128 + bl edwards25519_scalarmuldouble_alt_epdouble + +// Multiple 5 + + add p0, tab+4*128 + add p1, tab + add p2, tab+3*128 + bl edwards25519_scalarmuldouble_alt_epadd + +// Multiple 6 + + add p0, tab+5*128 + add p1, tab+2*128 + bl edwards25519_scalarmuldouble_alt_epdouble + +// Multiple 7 + + add p0, tab+6*128 + add p1, tab + add p2, tab+5*128 + bl edwards25519_scalarmuldouble_alt_epadd + +// Multiple 8 + + add p0, tab+7*128 + add p1, tab+3*128 + bl edwards25519_scalarmuldouble_alt_epdouble + +// Handle the initialization, starting the loop counter at i = 252 +// and initializing acc to the sum of the table entries for the +// top nybbles of the scalars (the ones with no implicit -8 bias). + + mov i, #252 + +// Index for btable entry... 
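+// (this is the top nybble, bits 252..255, of the recoded bscalar, the one
+// nybble that carries no implicit -8 bias)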
+ + ldr x0, [bscalar+24] + lsr bf, x0, #60 + +// ...and constant-time indexing based on that index + + adr x14, edwards25519_scalarmuldouble_alt_table + + mov x0, #1 + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, #1 + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + + cmp bf, #1 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #2 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #3 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #4 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #5 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #6 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #7 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] 
+ csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #8 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + + stp x0, x1, [btabent] + stp x2, x3, [btabent+16] + stp x4, x5, [btabent+32] + stp x6, x7, [btabent+48] + stp x8, x9, [btabent+64] + stp x10, x11, [btabent+80] + +// Index for table entry... + + ldr x0, [scalar+24] + lsr bf, x0, #60 + +// ...and constant-time indexing based on that index + + add p0, tab + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, #1 + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, #1 + mov x9, xzr + mov x10, xzr + mov x11, xzr + mov x12, xzr + mov x13, xzr + mov x14, xzr + mov x15, xzr + + cmp bf, #1 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #2 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #3 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #4 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #5 + ldp 
x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #6 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #7 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #8 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + stp x12, x13, [tabent+96] + stp x14, x15, [tabent+112] + +// Add those elements to initialize the accumulator for bit position 252 + + add p0, acc + add p1, tabent + add p2, btabent + bl edwards25519_scalarmuldouble_alt_pepadd + +// Main loop with acc = [scalar/2^i] * point + [bscalar/2^i] * basepoint +// Start with i = 252 for bits 248..251 and go down four at a time to 3..0 + +edwards25519_scalarmuldouble_alt_loop: + + sub i, i, #4 + +// Double to acc' = 2 * acc + + add p0, acc + add p1, acc + bl edwards25519_scalarmuldouble_alt_pdouble + +// Get btable entry, first getting the adjusted bitfield... + + lsr x0, i, #6 + add x1, bscalar + ldr x2, [x1, x0, lsl #3] + lsr x3, x2, i + and x0, x3, #15 + subs bf, x0, #8 + cneg bf, bf, cc + csetm cf, cc + +// ... then doing constant-time lookup with the appropriate index... 
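+// As an illustrative C-level sketch of the selection pattern (hypothetical
+// names, with tbl standing for the table below viewed as uint64_t[8][12];
+// note that a C compiler, unlike the csel-based code here, is not obliged
+// to keep the comparisons branch-free):
+//
+//      uint64_t e[12];
+//      for (int j = 0; j < 12; j++)          // start from the identity,
+//          e[j] = (j == 0 || j == 4);        // encoded as (1,1,0)
+//      for (uint64_t t = 1; t <= 8; t++) {
+//          uint64_t m = -(uint64_t)(t == bf);    // all-ones iff t == bf
+//          for (int j = 0; j < 12; j++)
+//              e[j] = (e[j] & ~m) | (tbl[t-1][j] & m);
+//      }
+//
+// Every table entry is loaded whatever bf is, so the memory access pattern
+// is independent of the secret scalar nybble.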
+ + adr x14, edwards25519_scalarmuldouble_alt_table + + mov x0, #1 + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, #1 + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + + cmp bf, #1 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #2 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #3 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #4 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #5 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #6 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #7 + ldp x12, x13, [x14] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x12, x13, [x14, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x12, x13, [x14, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x12, x13, [x14, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x12, x13, [x14, #64] + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x12, x13, [x14, #80] + csel x10, x10, x12, ne + csel x11, x11, x13, ne + add x14, x14, #96 + + cmp bf, #8 + ldp x12, 
x13, [x14]
+        csel x0, x0, x12, ne
+        csel x1, x1, x13, ne
+        ldp x12, x13, [x14, #16]
+        csel x2, x2, x12, ne
+        csel x3, x3, x13, ne
+        ldp x12, x13, [x14, #32]
+        csel x4, x4, x12, ne
+        csel x5, x5, x13, ne
+        ldp x12, x13, [x14, #48]
+        csel x6, x6, x12, ne
+        csel x7, x7, x13, ne
+        ldp x12, x13, [x14, #64]
+        csel x8, x8, x12, ne
+        csel x9, x9, x13, ne
+        ldp x12, x13, [x14, #80]
+        csel x10, x10, x12, ne
+        csel x11, x11, x13, ne
+
+// ... then optionally negating before storing. The table entry
+// is in precomputed form and we currently have
+//
+// [x3;x2;x1;x0] = y - x
+// [x7;x6;x5;x4] = x + y
+// [x11;x10;x9;x8] = 2 * d * x * y
+//
+// Negation for Edwards curves is -(x,y) = (-x,y), which in this modified
+// form amounts to swapping the first two fields and negating the third.
+// The negation does not always fully reduce even mod 2^256-38 in the zero
+// case, instead giving -0 = 2^256-38. But that is fine since the result is
+// always fed to a multiplication inside the "pepadd" function below that
+// handles any 256-bit input.
+
+        cmp cf, xzr
+
+        csel x12, x0, x4, eq
+        csel x4, x0, x4, ne
+        csel x13, x1, x5, eq
+        csel x5, x1, x5, ne
+        csel x14, x2, x6, eq
+        csel x6, x2, x6, ne
+        csel x15, x3, x7, eq
+        csel x7, x3, x7, ne
+
+        eor x8, x8, cf
+        eor x9, x9, cf
+        eor x10, x10, cf
+        eor x11, x11, cf
+        mov x0, #37
+        and x0, x0, cf
+        subs x8, x8, x0
+        sbcs x9, x9, xzr
+        sbcs x10, x10, xzr
+        sbc x11, x11, xzr
+
+        stp x12, x13, [btabent]
+        stp x14, x15, [btabent+16]
+        stp x4, x5, [btabent+32]
+        stp x6, x7, [btabent+48]
+        stp x8, x9, [btabent+64]
+        stp x10, x11, [btabent+80]
+
+// Get table entry, first getting the adjusted bitfield...
+
+        lsr x0, i, #6
+        ldr x1, [sp, x0, lsl #3]
+        lsr x2, x1, i
+        and x0, x2, #15
+        subs bf, x0, #8
+        cneg bf, bf, cc
+        csetm cf, cc
+
+// ...
then getting the unadjusted table entry + + add p0, tab + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, #1 + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, #1 + mov x9, xzr + mov x10, xzr + mov x11, xzr + mov x12, xzr + mov x13, xzr + mov x14, xzr + mov x15, xzr + + cmp bf, #1 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #2 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #3 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #4 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #5 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #6 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, 
#32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #7 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + add p0, p0, #128 + + cmp bf, #8 + ldp x16, x17, [p0] + csel x0, x0, x16, ne + csel x1, x1, x17, ne + ldp x16, x17, [p0, #16] + csel x2, x2, x16, ne + csel x3, x3, x17, ne + ldp x16, x17, [p0, #32] + csel x4, x4, x16, ne + csel x5, x5, x17, ne + ldp x16, x17, [p0, #48] + csel x6, x6, x16, ne + csel x7, x7, x17, ne + ldp x16, x17, [p0, #64] + csel x8, x8, x16, ne + csel x9, x9, x17, ne + ldp x16, x17, [p0, #80] + csel x10, x10, x16, ne + csel x11, x11, x17, ne + ldp x16, x17, [p0, #96] + csel x12, x12, x16, ne + csel x13, x13, x17, ne + ldp x16, x17, [p0, #112] + csel x14, x14, x16, ne + csel x15, x15, x17, ne + +// ... then optionally negating before storing. This time the table +// entry is extended-projective, and is in registers thus: +// +// [x3;x2;x1;x0] = X +// [x7;x6;x5;x4] = Y +// [x11;x10;x9;x8] = Z +// [x15;x14;x13;x12] = W +// +// This time we just need to negate the X and the W fields. +// The crude way negation is done can result in values of X or W +// (when initially zero before negation) being exactly equal to +// 2^256-38, but the "pepadd" function handles that correctly. 
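+//
+// The arithmetic of the masked negation below: when cf is all-ones, the
+// EOR maps a word vector v to 2^256 - 1 - v, and subtracting the masked
+// constant (37 & cf) = 37 then gives 2^256 - 38 - v = 2 * p_25519 - v,
+// which is == -v (mod p_25519); when cf is zero both steps are no-ops.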
+ + eor x0, x0, cf + eor x1, x1, cf + eor x2, x2, cf + eor x3, x3, cf + mov x16, #37 + and x16, x16, cf + subs x0, x0, x16 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x3, x3, xzr + + eor x12, x12, cf + eor x13, x13, cf + eor x14, x14, cf + eor x15, x15, cf + subs x12, x12, x16 + sbcs x13, x13, xzr + sbcs x14, x14, xzr + sbc x15, x15, xzr + + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + stp x12, x13, [tabent+96] + stp x14, x15, [tabent+112] + +// Double to acc' = 4 * acc + + add p0, acc + add p1, acc + bl edwards25519_scalarmuldouble_alt_pdouble + +// Add tabent := tabent + btabent + + add p0, tabent + add p1, tabent + add p2, btabent + bl edwards25519_scalarmuldouble_alt_pepadd + +// Double to acc' = 8 * acc + + add p0, acc + add p1, acc + bl edwards25519_scalarmuldouble_alt_pdouble + +// Double to acc' = 16 * acc + + add p0, acc + add p1, acc + bl edwards25519_scalarmuldouble_alt_epdouble + +// Add table entry, acc := acc + tabent + + add p0, acc + add p1, acc + add p2, tabent + bl edwards25519_scalarmuldouble_alt_epadd + +// Loop down + + cbnz i, edwards25519_scalarmuldouble_alt_loop + +// Modular inverse setup + + add x0, tabent + add x1, acc+64 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, acc, tabent. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b edwards25519_scalarmuldouble_alt_invmidloop +edwards25519_scalarmuldouble_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, 
x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, 
#0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +edwards25519_scalarmuldouble_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + 
add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, 
#0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, 
#0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne edwards25519_scalarmuldouble_alt_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc 
x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] + +// Store result. Note that these are the only reductions mod 2^255-19 + + mov p0, res + add p1, acc + add p2, tabent + mul_p25519(x_0,x_1,x_2) + + add p0, res, #32 + add p1, acc+32 + add p2, tabent + mul_p25519(x_0,x_1,x_2) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x25, x30, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +// **************************************************************************** +// Localized versions of subroutines. +// These are close to the standalone functions "edwards25519_epdouble" etc., +// but are only maintaining reduction modulo 2^256 - 38, not 2^255 - 19. +// **************************************************************************** + +edwards25519_scalarmuldouble_alt_epdouble: + sub sp, sp, #(5*NUMSIZE) + add_twice4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_4(y_0,t2,t4) + mul_4(z_0,t3,t2) + mul_4(w_0,t1,t4) + mul_4(x_0,t1,t3) + add sp, sp, #(5*NUMSIZE) + ret + +edwards25519_scalarmuldouble_alt_pdouble: + sub sp, sp, #(5*NUMSIZE) + add_twice4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_4(y_0,t2,t4) + mul_4(z_0,t3,t2) + mul_4(x_0,t1,t3) + add sp, sp, #(5*NUMSIZE) + ret + +edwards25519_scalarmuldouble_alt_epadd: + sub sp, sp, #(6*NUMSIZE) + mul_4(t0,w_1,w_2) + sub_twice4(t1,y_1,x_1) + sub_twice4(t2,y_2,x_2) + add_twice4(t3,y_1,x_1) + add_twice4(t4,y_2,x_2) + double_twice4(t5,z_2) + mul_4(t1,t1,t2) + mul_4(t3,t3,t4) + load_k25519(t2) + mul_4(t2,t2,t0) + mul_4(t4,z_1,t5) + sub_twice4(t0,t3,t1) + add_twice4(t5,t3,t1) + sub_twice4(t1,t4,t2) + add_twice4(t3,t4,t2) + mul_4(w_0,t0,t5) + mul_4(x_0,t0,t1) + mul_4(y_0,t3,t5) + mul_4(z_0,t1,t3) + add sp, sp, #(6*NUMSIZE) + ret + +edwards25519_scalarmuldouble_alt_pepadd: + sub sp, sp, #(6*NUMSIZE) + double_twice4(t0,z_1); + sub_twice4(t1,y_1,x_1); + add_twice4(t2,y_1,x_1); + mul_4(t3,w_1,z_2); + mul_4(t1,t1,x_2); + mul_4(t2,t2,y_2); + sub_twice4(t4,t0,t3); + add_twice4(t0,t0,t3); + sub_twice4(t5,t2,t1); + add_twice4(t1,t2,t1); + mul_4(z_0,t4,t0); + mul_4(x_0,t5,t4); + mul_4(y_0,t0,t1); + mul_4(w_0,t5,t1); + add sp, sp, #(6*NUMSIZE) + ret + +// **************************************************************************** +// The precomputed data (all read-only). This is currently part of the same +// text section, which gives position-independent code with simple PC-relative +// addressing. 
However it could be put in a separate section via something like +// +// .section .rodata +// **************************************************************************** + +// Precomputed table of multiples of generator for edwards25519 +// all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. + +edwards25519_scalarmuldouble_alt_table: + + // 1 * G + + .quad 0x9d103905d740913e + .quad 0xfd399f05d140beb3 + .quad 0xa5c18434688f8a09 + .quad 0x44fd2f9298f81267 + .quad 0x2fbc93c6f58c3b85 + .quad 0xcf932dc6fb8c0e19 + .quad 0x270b4898643d42c2 + .quad 0x07cf9d3a33d4ba65 + .quad 0xabc91205877aaa68 + .quad 0x26d9e823ccaac49e + .quad 0x5a1b7dcbdd43598c + .quad 0x6f117b689f0c65a8 + + // 2 * G + + .quad 0x8a99a56042b4d5a8 + .quad 0x8f2b810c4e60acf6 + .quad 0xe09e236bb16e37aa + .quad 0x6bb595a669c92555 + .quad 0x9224e7fc933c71d7 + .quad 0x9f469d967a0ff5b5 + .quad 0x5aa69a65e1d60702 + .quad 0x590c063fa87d2e2e + .quad 0x43faa8b3a59b7a5f + .quad 0x36c16bdd5d9acf78 + .quad 0x500fa0840b3d6a31 + .quad 0x701af5b13ea50b73 + + // 3 * G + + .quad 0x56611fe8a4fcd265 + .quad 0x3bd353fde5c1ba7d + .quad 0x8131f31a214bd6bd + .quad 0x2ab91587555bda62 + .quad 0xaf25b0a84cee9730 + .quad 0x025a8430e8864b8a + .quad 0xc11b50029f016732 + .quad 0x7a164e1b9a80f8f4 + .quad 0x14ae933f0dd0d889 + .quad 0x589423221c35da62 + .quad 0xd170e5458cf2db4c + .quad 0x5a2826af12b9b4c6 + + // 4 * G + + .quad 0x95fe050a056818bf + .quad 0x327e89715660faa9 + .quad 0xc3e8e3cd06a05073 + .quad 0x27933f4c7445a49a + .quad 0x287351b98efc099f + .quad 0x6765c6f47dfd2538 + .quad 0xca348d3dfb0a9265 + .quad 0x680e910321e58727 + .quad 0x5a13fbe9c476ff09 + .quad 0x6e9e39457b5cc172 + .quad 0x5ddbdcf9102b4494 + .quad 0x7f9d0cbf63553e2b + + // 5 * G + + .quad 0x7f9182c3a447d6ba + .quad 0xd50014d14b2729b7 + .quad 0xe33cf11cb864a087 + .quad 0x154a7e73eb1b55f3 + .quad 0xa212bc4408a5bb33 + .quad 0x8d5048c3c75eed02 + .quad 0xdd1beb0c5abfec44 + .quad 0x2945ccf146e206eb + .quad 0xbcbbdbf1812a8285 + .quad 0x270e0807d0bdd1fc + .quad 0xb41b670b1bbda72d + .quad 0x43aabe696b3bb69a + + // 6 * G + + .quad 0x499806b67b7d8ca4 + .quad 0x575be28427d22739 + .quad 0xbb085ce7204553b9 + .quad 0x38b64c41ae417884 + .quad 0x3a0ceeeb77157131 + .quad 0x9b27158900c8af88 + .quad 0x8065b668da59a736 + .quad 0x51e57bb6a2cc38bd + .quad 0x85ac326702ea4b71 + .quad 0xbe70e00341a1bb01 + .quad 0x53e4a24b083bc144 + .quad 0x10b8e91a9f0d61e3 + + // 7 * G + + .quad 0xba6f2c9aaa3221b1 + .quad 0x6ca021533bba23a7 + .quad 0x9dea764f92192c3a + .quad 0x1d6edd5d2e5317e0 + .quad 0x6b1a5cd0944ea3bf + .quad 0x7470353ab39dc0d2 + .quad 0x71b2528228542e49 + .quad 0x461bea69283c927e + .quad 0xf1836dc801b8b3a2 + .quad 0xb3035f47053ea49a + .quad 0x529c41ba5877adf3 + .quad 0x7a9fbb1c6a0f90a7 + + // 8 * G + + .quad 0xe2a75dedf39234d9 + .quad 0x963d7680e1b558f9 + .quad 0x2c2741ac6e3c23fb + .quad 0x3a9024a1320e01c3 + .quad 0x59b7596604dd3e8f + .quad 0x6cb30377e288702c + .quad 0xb1339c665ed9c323 + .quad 0x0915e76061bce52f + .quad 0xe7c1f5d9c9a2911a + .quad 0xb8a371788bcca7d7 + .quad 0x636412190eb62a32 + .quad 0x26907c5c2ecc4e95 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/curve25519/bignum_mod_n25519.S b/x86_att/curve25519/bignum_mod_n25519.S new file mode 100644 index 0000000000..c45d99b541 --- /dev/null +++ b/x86_att/curve25519/bignum_mod_n25519.S @@ -0,0 +1,228 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Reduce modulo basepoint order, z := x mod n_25519 +// Input x[k]; output z[4] +// +// extern void bignum_mod_n25519 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the order of the curve25519/edwards25519 basepoint, +// which is n_25519 = 2^252 + 27742317777372353535851937790883648493 +// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n25519) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define q %rbx + +S2N_BN_SYMBOL(bignum_mod_n25519): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %rbp + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 +// This [m3;m2;m1;m0] is the initial x where we begin reduction. + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies + + movq %rdx, x + +// Get the quotient estimate q = floor(x/2^252). +// Also delete it from m3, in effect doing x' = x - q * 2^252 + + movq m3, q + shrq $60, q + + shlq $4, m3 + shrq $4, m3 + +// Let [%rdx;d;%rbp] = q * (n_25519 - 2^252) + + movq $0x5812631a5cf5d3ed, %rax + mulq q + movq %rax, %rbp + movq %rdx, d + + movq $0x14def9dea2f79cd6, %rax + mulq q + addq %rax, d + adcq $0, %rdx + +// Subtract to get x' - q * (n_25519 - 2^252) = x - q * n_25519 + + subq %rbp, m0 + sbbq d, m1 + sbbq %rdx, m2 + sbbq $0, m3 + +// Get a bitmask for the borrow and create a masked version of +// non-trivial digits of [%rbx;0;%rdx;%rax] = n_25519, then add it. +// The masked n3 digit exploits the fact that bit 60 of n0 is set. + + sbbq %rbx, %rbx + + movq $0x5812631a5cf5d3ed, %rax + andq %rbx, %rax + movq $0x14def9dea2f79cd6, %rdx + andq %rbx, %rdx + movq $0x1000000000000000, %rbx + andq %rax, %rbx + + addq %rax, m0 + adcq %rdx, m1 + adcq $0, m2 + adcq %rbx, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction. Each one +// is similar to the sequence above except for the more refined quotient +// estimation process. + + testq k, k + jz writeback + +loop: + +// Assume that the new 5-digit x is 2^64 * previous_x + next_digit. +// Get the quotient estimate q = min (floor(x/2^252)) (2^64 - 1) +// and first compute x' = x - 2^252 * q. + + movq m3, q + shldq $4, m2, q + shrq $60, m3 + subq m3, q + shlq $4, m2 + shrdq $4, m3, m2 + +// Let [%rdx;m3;%rbp] = q * (n_25519 - 2^252) + + movq $0x5812631a5cf5d3ed, %rax + mulq q + movq %rax, %rbp + movq %rdx, m3 + + movq $0x14def9dea2f79cd6, %rax + mulq q + addq %rax, m3 + adcq $0, %rdx + +// Load the next digit + + movq -8(x,k,8), d + +// Subtract to get x' - q * (n_25519 - 2^252) = x - q * n_25519 + + subq %rbp, d + sbbq m3, m0 + sbbq %rdx, m1 + sbbq $0, m2 + +// Get a bitmask for the borrow and create a masked version of +// non-trivial digits of [%rbx;0;%rdx;%rax] = n_25519, then add it.
+// The masked n3 digit exploits the fact that bit 60 of n0 is set. + + sbbq %rbx, %rbx + + movq $0x5812631a5cf5d3ed, %rax + andq %rbx, %rax + movq $0x14def9dea2f79cd6, %rdx + andq %rbx, %rdx + movq $0x1000000000000000, %rbx + andq %rax, %rbx + + addq %rax, d + adcq %rdx, m0 + adcq $0, m1 + adcq %rbx, m2 + +// Now shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz loop + +// Write back + +writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbp + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz writeback + movq (%rdx), m0 + decq k + jz writeback + movq 8(%rdx), m1 + decq k + jz writeback + movq 16(%rdx), m2 + jmp writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/x86_att/curve25519/curve25519_x25519.S b/x86_att/curve25519/curve25519_x25519.S index 9914fdd01c..b46c522b36 100644 --- a/x86_att/curve25519/curve25519_x25519.S +++ b/x86_att/curve25519/curve25519_x25519.S @@ -66,12 +66,12 @@ #define sn (4*NUMSIZE)(%rsp) -#define zn (5*NUMSIZE)(%rsp) #define dn (5*NUMSIZE)(%rsp) #define e (5*NUMSIZE)(%rsp) #define dmsn (6*NUMSIZE)(%rsp) #define p (6*NUMSIZE)(%rsp) +#define zn (7*NUMSIZE)(%rsp) #define xm (8*NUMSIZE)(%rsp) #define dnsm (8*NUMSIZE)(%rsp) @@ -791,430 +791,1372 @@ curve25519_x25519_scalarloop: mul_p25519(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// First set up the constant sn = 2^255 - 19 for the modular inverse. - - movq $-19, %rax - movq $-1, %rcx - movq $0x7fffffffffffffff, %rdx - movq %rax, 128(%rsp) - movq %rcx, 136(%rsp) - movq %rcx, 144(%rsp) - movq %rdx, 152(%rsp) - -// Prepare to call the modular inverse function to get zm = 1/zn - - movq $4, %rdi - leaq 96(%rsp), %rsi - leaq 160(%rsp), %rdx - leaq 128(%rsp), %rcx - leaq 192(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites pointx, scalar and dm, which are no longer needed. 
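As a reading aid (not part of the patch): the new bignum_mod_n25519 files above reduce with a clamped one-word quotient estimate followed by a single conditional add-back. A minimal arbitrary-precision Python model of that flow, assuming a little-endian list of 64-bit digits (the function name and list handling are illustrative only; the assembly works limb-by-limb with carry flags):

```python
# Reference model of bignum_mod_n25519: z := x mod n_25519, x given as
# k little-endian 64-bit digits. Illustrative only, not part of the patch.
N25519 = (1 << 252) + 27742317777372353535851937790883648493

def mod_n25519(words):
    if len(words) <= 3:                    # short case: x < 2^192 < n_25519
        return sum(w << (64 * i) for i, w in enumerate(words))
    m = sum(w << (64 * i) for i, w in enumerate(words[-4:]))  # top 4 digits
    q = m >> 252                           # here q = floor(x/2^252) < 2^4
    m -= q * N25519
    if m < 0:                              # borrow => add back n_25519 once
        m += N25519
    for d in reversed(words[:-4]):         # (k-4) iterations of 5->4 reduction
        x = (m << 64) | d                  # new x is 2^64 * previous_x + digit
        q = min(x >> 252, (1 << 64) - 1)   # clamped quotient estimate
        x -= q * N25519
        if x < 0:                          # again at most one add-back needed
            x += N25519
        m = x
    return m

import random
ws = [random.getrandbits(64) for _ in range(9)]
x = sum(w << (64 * i) for i, w in enumerate(ws))
assert mod_n25519(ws) == x % N25519        # agrees with plain big-int reduction
```

One conditional add-back suffices because the clamped q is either the true quotient or one too large, so the subtraction leaves a remainder in (-n_25519, n_25519).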
- - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp +// Prepare to call the modular inverse function to get xm = 1/zn + + leaq 256(%rsp), %rdi + leaq 224(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, xm and zn. + + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -curve25519_x25519_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -curve25519_x25519_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -curve25519_x25519_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq 
$0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519_midloop +curve25519_x25519_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne curve25519_x25519_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_congloop - shldq $0x6, %r10, 
%r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519_wmontend -curve25519_x25519_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne curve25519_x25519_wmontloop -curve25519_x25519_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_wcorrloop + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq 
(%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519_zmontend -curve25519_x25519_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne curve25519_x25519_zmontloop -curve25519_x25519_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -curve25519_x25519_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -curve25519_x25519_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - 
movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -curve25519_x25519_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne curve25519_x25519_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -curve25519_x25519_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_fliploop - subq $0x3a, 0x20(%rsp) - ja curve25519_x25519_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq 
%rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq 
%r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 
+ cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, 
%rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + 
sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // Since we eventually want to return 0 when the result is the point at // infinity, we force xn = 0 whenever zn = 0. This avoids building in a // dependency on the behavior of modular inverse in out-of-scope cases. - movq 160(%rsp), %rax - orq 168(%rsp), %rax - orq 176(%rsp), %rax - orq 184(%rsp), %rax + movq 224(%rsp), %rax + orq 232(%rsp), %rax + orq 240(%rsp), %rax + orq 248(%rsp), %rax movq 320(%rsp), %rcx cmovzq %rax, %rcx movq %rcx, 320(%rsp) @@ -1231,7 +2173,7 @@ curve25519_x25519_fliploop: // Now the result is xn * (1/zn), fully reduced modulo p. movq res, %rbp - mul_p25519(resx,xn,zm) + mul_p25519(resx,xn,xm) // Restore stack and registers diff --git a/x86_att/curve25519/curve25519_x25519_alt.S b/x86_att/curve25519/curve25519_x25519_alt.S index ca92a9206a..dd644dbba9 100644 --- a/x86_att/curve25519/curve25519_x25519_alt.S +++ b/x86_att/curve25519/curve25519_x25519_alt.S @@ -66,12 +66,12 @@ #define sn (4*NUMSIZE)(%rsp) -#define zn (5*NUMSIZE)(%rsp) #define dn (5*NUMSIZE)(%rsp) #define e (5*NUMSIZE)(%rsp) #define dmsn (6*NUMSIZE)(%rsp) #define p (6*NUMSIZE)(%rsp) +#define zn (7*NUMSIZE)(%rsp) #define xm (8*NUMSIZE)(%rsp) #define dnsm (8*NUMSIZE)(%rsp) @@ -952,430 +952,1372 @@ curve25519_x25519_alt_scalarloop: mul_p25519(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// First set up the constant sn = 2^255 - 19 for the modular inverse. 
- - movq $-19, %rax - movq $-1, %rcx - movq $0x7fffffffffffffff, %rdx - movq %rax, 128(%rsp) - movq %rcx, 136(%rsp) - movq %rcx, 144(%rsp) - movq %rdx, 152(%rsp) - -// Prepare to call the modular inverse function to get zm = 1/zn - - movq $4, %rdi - leaq 96(%rsp), %rsi - leaq 160(%rsp), %rdx - leaq 128(%rsp), %rcx - leaq 192(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites pointx, scalar and dm, which are no longer needed. - - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519_alt_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_alt_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp +// Prepare to call the modular inverse function to get xm = 1/zn + + leaq 256(%rsp), %rdi + leaq 224(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, xm and zn. 
+ + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -curve25519_x25519_alt_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -curve25519_x25519_alt_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519_alt_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -curve25519_x25519_alt_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519_alt_midloop +curve25519_x25519_alt_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne curve25519_x25519_alt_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 
0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519_alt_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_alt_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519_alt_wmontend -curve25519_x25519_alt_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - 
movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne curve25519_x25519_alt_wmontloop -curve25519_x25519_alt_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519_alt_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519_alt_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519_alt_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_alt_wcorrloop + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519_alt_zmontend -curve25519_x25519_alt_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne curve25519_x25519_alt_zmontloop -curve25519_x25519_alt_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519_alt_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519_alt_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519_alt_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, 
%r9 - jb curve25519_x25519_alt_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -curve25519_x25519_alt_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519_alt_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519_alt_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -curve25519_x25519_alt_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519_alt_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -curve25519_x25519_alt_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne curve25519_x25519_alt_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -curve25519_x25519_alt_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_alt_fliploop - subq $0x3a, 0x20(%rsp) - ja curve25519_x25519_alt_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq 
%rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi 
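+// Each repetition of this pattern is one constant-time division step on
+// the packed low-word approximations: when the counter %rsi is nonnegative
+// and g (%rcx) is odd, f (%rbx) and g are exchanged and g becomes g - f,
+// with %rsi mapped to 2 - %rsi; when g is odd without an exchange, g
+// becomes g + f and %rsi becomes %rsi + 2; when g is even only the counter
+// advances. The halving of g is deferred to the sarq at the start of the
+// following repetition.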
+ btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi 
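+// The leaq below folds the masked copy of f held in %rdi (0, f or -f
+// according to the case analysis above) into g; the subq just before it
+// is the counter update, subtracting the constant -2 kept in %rax.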
+ leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + 
orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq 
(%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519_alt_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, 
%rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // Since we eventually want to return 0 when the result is the point at // infinity, we force xn = 0 whenever zn = 0. This avoids building in a // dependency on the behavior of modular inverse in out-of-scope cases. - movq 160(%rsp), %rax - orq 168(%rsp), %rax - orq 176(%rsp), %rax - orq 184(%rsp), %rax + movq 224(%rsp), %rax + orq 232(%rsp), %rax + orq 240(%rsp), %rax + orq 248(%rsp), %rax movq 320(%rsp), %rcx cmovzq %rax, %rcx movq %rcx, 320(%rsp) @@ -1392,7 +2334,7 @@ curve25519_x25519_alt_fliploop: // Now the result is xn * (1/zn), fully reduced modulo p. movq res, %rbp - mul_p25519(resx,xn,zm) + mul_p25519(resx,xn,xm) // Restore stack and registers diff --git a/x86_att/curve25519/curve25519_x25519base.S b/x86_att/curve25519/curve25519_x25519base.S index 12a5cddd18..e450656861 100644 --- a/x86_att/curve25519/curve25519_x25519base.S +++ b/x86_att/curve25519/curve25519_x25519base.S @@ -874,416 +874,1368 @@ curve25519_x25519base_scalarloop: // // First the addition and subtraction: - add_twice4(y_3,x_3,w_3) - sub_twice4(z_3,x_3,w_3) + add_twice4(t1,x_3,w_3) + sub_twice4(t2,x_3,w_3) -// Prepare to call the modular inverse function to get x_3 = 1/z_3 +// Prepare to call the modular inverse function to get t0 = 1/t2 // Note that this works for the weakly normalized z_3 equally well. // The non-coprime case z_3 == 0 (mod p_25519) cannot arise anyway. - movq $4, %rdi - leaq 128(%rsp), %rsi - leaq 192(%rsp), %rdx - leaq curve25519_x25519base_p_25519(%rip), %rcx - leaq 256(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. - - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519base_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + leaq 256(%rsp), %rdi + leaq 320(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. 
For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, t0, t1, t2. + + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -curve25519_x25519base_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -curve25519_x25519base_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519base_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -curve25519_x25519base_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519base_midloop +curve25519_x25519base_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - 
addq %rdx, %rdx - decq %r9 - jne curve25519_x25519base_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519base_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519base_wmontend -curve25519_x25519base_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, 
%rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne curve25519_x25519base_wmontloop -curve25519_x25519base_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519base_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519base_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519base_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_wcorrloop + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519base_zmontend -curve25519_x25519base_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne curve25519_x25519base_zmontloop -curve25519_x25519base_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519base_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519base_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519base_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + 
adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -curve25519_x25519base_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519base_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519base_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -curve25519_x25519base_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519base_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -curve25519_x25519base_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne curve25519_x25519base_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -curve25519_x25519base_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_fliploop - subq $0x3a, 0x20(%rsp) - ja 
curve25519_x25519base_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, 
%rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, 
%r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + 
imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + 
subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519base_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + 
movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // The final result is (X + T) / (X - T) // This is the only operation in the whole computation that @@ -1291,7 +2243,7 @@ curve25519_x25519base_fliploop: // answer as output. movq res, %rbp - mul_p25519(resx,y_3,x_3) + mul_p25519(resx,t1,t0) // Restore stack and registers @@ -1313,14 +2265,6 @@ curve25519_x25519base_fliploop: // .section .rodata // **************************************************************************** -// The modulus, for the modular inverse - -curve25519_x25519base_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 2^254 * G and (2^254 + 8) * G in extended-projective coordinates // but with z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/x86_att/curve25519/curve25519_x25519base_alt.S b/x86_att/curve25519/curve25519_x25519base_alt.S index 8a89b1f597..b1275e2084 100644 --- a/x86_att/curve25519/curve25519_x25519base_alt.S +++ b/x86_att/curve25519/curve25519_x25519base_alt.S @@ -950,414 +950,1368 @@ curve25519_x25519base_alt_scalarloop: // // First the addition and subtraction: - add_twice4(y_3,x_3,w_3) - sub_twice4(z_3,x_3,w_3) - -// Prepare to call the modular inverse function to get x_3 = 1/z_3 - - movq $4, %rdi - leaq 128(%rsp), %rsi - leaq 192(%rsp), %rdx - leaq curve25519_x25519base_alt_p_25519(%rip), %rcx - leaq 256(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. 
- - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519base_alt_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_alt_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + add_twice4(t1,x_3,w_3) + sub_twice4(t2,x_3,w_3) + +// Prepare to call the modular inverse function to get t0 = 1/t2 +// Note that this works for the weakly normalized z_3 equally well. +// The non-coprime case z_3 == 0 (mod p_25519) cannot arise anyway. + + leaq 256(%rsp), %rdi + leaq 320(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, t0, t1, t2. + + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -curve25519_x25519base_alt_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -curve25519_x25519base_alt_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519base_alt_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -curve25519_x25519base_alt_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 
0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519base_alt_midloop +curve25519_x25519base_alt_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne curve25519_x25519base_alt_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519base_alt_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + 
sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_alt_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519base_alt_wmontend -curve25519_x25519base_alt_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne curve25519_x25519base_alt_wmontloop -curve25519_x25519base_alt_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519base_alt_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519base_alt_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519base_alt_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_alt_wcorrloop + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + 
mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519base_alt_zmontend -curve25519_x25519base_alt_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne curve25519_x25519base_alt_zmontloop -curve25519_x25519base_alt_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519base_alt_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519base_alt_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519base_alt_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_alt_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -curve25519_x25519base_alt_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519base_alt_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) 
- movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519base_alt_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -curve25519_x25519base_alt_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519base_alt_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -curve25519_x25519base_alt_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne curve25519_x25519base_alt_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -curve25519_x25519base_alt_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_alt_fliploop - subq $0x3a, 0x20(%rsp) - ja curve25519_x25519base_alt_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + 
cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + 
btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq 
%rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq 
$0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, 
%rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519base_alt_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // The final result is (X + T) / (X - T) // This is the only operation in the whole computation that @@ -1365,7 +2319,7 @@ curve25519_x25519base_alt_fliploop: // answer as output. movq res, %rbp - mul_p25519(resx,y_3,x_3) + mul_p25519(resx,t1,t0) // Restore stack and registers @@ -1387,14 +2341,6 @@ curve25519_x25519base_alt_fliploop: // .section .rodata // **************************************************************************** -// The modulus, for the modular inverse - -curve25519_x25519base_alt_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 2^254 * G and (2^254 + 8) * G in extended-projective coordinates // but with z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/x86_att/curve25519/edwards25519_decode.S b/x86_att/curve25519/edwards25519_decode.S new file mode 100644 index 0000000000..05681925a3 --- /dev/null +++ b/x86_att/curve25519/edwards25519_decode.S @@ -0,0 +1,670 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Decode compressed 256-bit form of edwards25519 point +// Input c[32] (bytes); output function return and z[8] +// +// extern uint64_t edwards25519_decode(uint64_t z[static 8],uint8_t c[static 32]); +// +// This interprets the input byte string as a little-endian number +// representing a point (x,y) on the edwards25519 curve, encoded as +// 2^255 * x_0 + y where x_0 is the least significant bit of x. It +// returns the full pair of coordinates x (at z) and y (at z+4). The +// return code is 0 for success and 1 for failure, which means that +// the input does not correspond to the encoding of any edwards25519 +// point. This can happen for three reasons, where y = the lowest +// 255 bits of the input: +// +// * y >= p_25519 +// Input y coordinate is not reduced +// * (y^2 - 1) * (1 + d_25519 * y^2) has no modular square root +// There is no x such that (x,y) is on the curve +// * y^2 = 1 and top bit of input is set +// Cannot be the canonical encoding of (0,1) or (0,-1) +// +// Standard x86-64 ABI: RDI = z, RSI = c +// Microsoft x64 ABI: RCX = z, RDX = c +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_decode) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_decode) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define y 0(%rsp) +#define s (4*N)(%rsp) +#define t (8*N)(%rsp) +#define u (12*N)(%rsp) +#define v (16*N)(%rsp) +#define w (20*N)(%rsp) +#define q (24*N)(%rsp) +#define res (28*N)(%rsp) +#define sgnbit (29*N)(%rsp) +#define badun (30*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (32*N) + +// Corrupted versions when stack is down 8 more + +#define q8 (25*N)(%rsp) + +// Syntactic variants to make x86_att version simpler to generate + +#define Y 0 +#define S (4*N) +#define T (8*N) +#define U (12*N) +#define V (16*N) +#define W (20*N) +#define Q8 (25*N) + +S2N_BN_SYMBOL(edwards25519_decode): + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from subroutine offsets + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + callq edwards25519_decode_standard + popq %rsi + popq %rdi + ret + +edwards25519_decode_standard: +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Load the inputs, which can be done word-wise since x86 is little-endian. +// Let y be the lowest 255 bits of the input and sgnbit the desired parity. +// If y >= p_25519 then already flag the input as invalid (badun = 1). 
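The word-wise load and the y >= p_25519 check that follow are terse in assembly; as orientation, here is a minimal Python model of just this step (the helper name is hypothetical, not part of the source):

    p = 2**255 - 19

    def load_and_flag(c: bytes):
        # Interpret the 32 input bytes little-endian, split off the sign
        # bit, and flag a non-reduced y, mirroring y/sgnbit/badun above.
        n = int.from_bytes(c, 'little')
        y = n & ((1 << 255) - 1)        # low 255 bits
        sgnbit = n >> 255               # desired parity of x
        badun = 1 if y >= p else 0      # y must already be reduced mod p
        return y, sgnbit, badun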
+ + movq (%rsi), %rax + movq %rax, Y(%rsp) + movq 8(%rsi), %rbx + movq %rbx, Y+8(%rsp) + xorl %ebp, %ebp + movq 16(%rsi), %rcx + movq %rcx, Y+16(%rsp) + movq 24(%rsi), %rdx + btr $63, %rdx + movq %rdx, Y+24(%rsp) + adcq %rbp, %rbp + movq %rbp, sgnbit + + addq $19, %rax + adcq $0, %rbx + adcq $0, %rcx + adcq $0, %rdx + shrq $63, %rdx + movq %rdx, badun + +// u = y^2 - 1 (actually y + 2^255-20, not reduced modulo) +// v = 1 + d * y^2 (not reduced modulo from the +1) +// w = u * v + + leaq V(%rsp), %rdi + movq $1, %rsi + leaq Y(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + movq V(%rsp), %rax + subq $20, %rax + movq V+8(%rsp), %rbx + sbbq $0, %rbx + movq V+16(%rsp), %rcx + sbbq $0, %rcx + movq V+24(%rsp), %rdx + sbbq $0, %rdx + btc $63, %rdx + movq %rax, U(%rsp) + movq %rbx, U+8(%rsp) + movq %rcx, U+16(%rsp) + movq %rdx, U+24(%rsp) + + movq $0x75eb4dca135978a3, %rax + movq %rax, W(%rsp) + movq $0x00700a4d4141d8ab, %rax + movq %rax, W+8(%rsp) + movq $0x8cc740797779e898, %rax + movq %rax, W+16(%rsp) + movq $0x52036cee2b6ffe73, %rax + movq %rax, W+24(%rsp) + leaq V(%rsp), %rdi + leaq W(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + movq V(%rsp), %rax + addq $1, %rax + movq V+8(%rsp), %rbx + adcq $0, %rbx + movq V+16(%rsp), %rcx + adcq $0, %rcx + movq V+24(%rsp), %rdx + adcq $0, %rdx + movq %rax, V(%rsp) + movq %rbx, V+8(%rsp) + movq %rcx, V+16(%rsp) + movq %rdx, V+24(%rsp) + + leaq W(%rsp), %rdi + leaq U(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + +// Get s = w^{252-3} as a candidate inverse square root 1/sqrt(w). +// This power tower computation is the same as bignum_invsqrt_p25519 + + leaq T(%rsp), %rdi + movq $1, %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq T(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $10, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $50, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $125, %rsi + 
leaq V(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq S(%rsp), %rdi + leaq S(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + +// Compute v' = s^2 * w to discriminate whether the square root sqrt(u/v) +// exists, in which case we should get 0, 1 or -1. + + leaq V(%rsp), %rdi + movq $1, %rsi + leaq S(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq V(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + +// Get the two candidates for sqrt(u / v), one being s = u * w^{252-3} +// and the other being t = s * j_25519 where j_25519 = sqrt(-1). + + leaq S(%rsp), %rdi + leaq U(%rsp), %rsi + leaq S(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + movq $0xc4ee1b274a0ea0b0, %rax + movq %rax, T(%rsp) + movq $0x2f431806ad2fe478, %rax + movq %rax, T+8(%rsp) + movq $0x2b4d00993dfbd7a7, %rax + movq %rax, T+16(%rsp) + movq $0x2b8324804fc1df0b, %rax + movq %rax, T+24(%rsp) + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + +// %rax = 0 <=> s^2 * w = 0 or 1 + + movq V(%rsp), %r8 + movq V+8(%rsp), %r9 + movq V+16(%rsp), %r10 + movq V+24(%rsp), %r11 + movl $1, %eax + notq %rax + andq %r8, %rax + orq %r9, %rax + orq %r10, %rax + orq %r11, %rax + +// %r8 = 0 <=> s^2 * w = -1 (mod p_25519, i.e. s^2 * w = 2^255 - 20) + + addq $20, %r8 + notq %r9 + notq %r10 + bts $63, %r11 + addq $1, %r11 + orq %r9, %r8 + orq %r11, %r10 + orq %r10, %r8 + +// If s^2 * w is not 0 or 1 then replace s by t + + testq %rax, %rax + + movq S(%rsp), %r12 + movq T(%rsp), %rbx + cmovnzq %rbx, %r12 + movq S+8(%rsp), %r13 + movq T+8(%rsp), %rbx + cmovnzq %rbx, %r13 + movq S+16(%rsp), %r14 + movq T+16(%rsp), %rbx + cmovnzq %rbx, %r14 + movq S+24(%rsp), %r15 + movq T+24(%rsp), %rbx + cmovnzq %rbx, %r15 + movq %r12, S(%rsp) + movq %r13, S+8(%rsp) + movq %r14, S+16(%rsp) + movq %r15, S+24(%rsp) + +// Check invalidity, occurring if s^2 * w is not in {0,1,-1} + + cmovzq %rax, %r8 + negq %r8 + sbbq %r8, %r8 + negq %r8 + orq %r8, badun + +// Let [%r11;%r10;%r9;%r8] = s and [%r15;%r14;%r13;%r12] = p_25519 - s + + movq S(%rsp), %r8 + movq $-19, %r12 + subq %r8, %r12 + movq S+8(%rsp), %r9 + movq $-1, %r13 + sbbq %r9, %r13 + movq S+16(%rsp), %r10 + movq $-1, %r14 + sbbq %r10, %r14 + movq S+24(%rsp), %r11 + movq $0x7FFFFFFFFFFFFFFF, %r15 + sbbq %r11, %r15 + +// Decide whether a flip is apparently indicated, s_0 <=> sgnbit +// Decide also if s = 0 by OR-ing its digits. 
Now if a flip is indicated: +// - if s = 0 then mark as invalid +// - if s <> 0 then indeed flip + + movl $1, %ecx + andq %r8, %rcx + xorq sgnbit, %rcx + movq badun, %rdx + movq %rdx, %rsi + orq %rcx, %rdx + xorl %ebp, %ebp + movq %r8, %rax + movq %r9, %rbx + orq %r10, %rax + orq %r11, %rbx + orq %rbx, %rax + cmovzq %rbp, %rcx + cmovnzq %rsi, %rdx + +// Actual selection of x as s or -s, copying of y and return of validity + + testq %rcx, %rcx + + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq res, %rdi + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq Y(%rsp), %rcx + movq %rcx, 32(%rdi) + movq Y+8(%rsp), %rcx + movq %rcx, 40(%rdi) + movq Y+16(%rsp), %rcx + movq %rcx, 48(%rdi) + movq Y+24(%rsp), %rcx + movq %rcx, 56(%rdi) + + movq %rdx, %rax + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +edwards25519_decode_mul_p25519: + movq %rdx, %rcx + xorl %ebp, %ebp + movq (%rcx), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rax, %r10 + addq %rax, %r9 + mulxq 0x10(%rsi), %rax, %r11 + adcq %rax, %r10 + mulxq 0x18(%rsi), %rax, %r12 + adcq %rax, %r11 + adcq %rbp, %r12 + xorl %ebp, %ebp + movq 0x8(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + xorl %ebp, %ebp + movq 0x10(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + movq 0x18(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rcx, %r15 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movl $0x26, %edx + mulxq %r15, %rax, %rbx + adcxq %rcx, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + addq %r11, %rax + adcq %rbp, %rbx + btq $0x3f, %rax + adcq %rbx, %rbx + leaq 0x1(%rbx), %rcx + imulq $0x13, %rcx, %rcx + xorl %ebp, %ebp + adoxq %rcx, %r8 + mulxq %r12, %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq %r13, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r14, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq %r15, %rax, %rbx + adcq %rax, %r11 + shlq $0x3f, %rcx + cmpq %rcx, %r11 + movl $0x13, %eax + cmovns %rbp, %rax + subq %rax, %r8 + sbbq %rbp, %r9 + sbbq %rbp, %r10 + sbbq %rbp, %r11 + btr $0x3f, %r11 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +edwards25519_decode_nsqr_p25519: + +// Copy input argument into q + + movq (%rdx), %rax + movq 8(%rdx), %rbx + movq 16(%rdx), %rcx + movq 24(%rdx), %rdx + movq %rax, Q8(%rsp) + movq %rbx, Q8+8(%rsp) + movq %rcx, Q8+16(%rsp) + movq %rdx, Q8+24(%rsp) + +// Main squaring loop, accumulating in u consistently and 
+// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +edwards25519_decode_loop: + movq Q8(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq Q8+0x8(%rsp), %r9, %r10 + mulxq Q8+0x18(%rsp), %r11, %r12 + movq Q8+0x10(%rsp), %rdx + mulxq Q8+0x18(%rsp), %r13, %r14 + xorl %ebx, %ebx + mulxq Q8(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq Q8+0x8(%rsp), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + movq Q8+0x18(%rsp), %rdx + mulxq Q8+0x8(%rsp), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rbx, %r13 + adoxq %rbx, %r14 + adcq %rbx, %r14 + xorl %ebx, %ebx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq Q8+0x8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq Q8+0x10(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq Q8+0x18(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbx, %r15 + adoxq %rbx, %r15 + movl $0x26, %edx + xorl %ebx, %ebx + mulxq %r12, %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq %r13, %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq %r14, %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq %r15, %rax, %r12 + adcxq %rax, %r11 + adoxq %rbx, %r12 + adcxq %rbx, %r12 + shldq $0x1, %r11, %r12 + btr $0x3f, %r11 + movl $0x13, %edx + imulq %r12, %rdx + addq %rdx, %r8 + adcq %rbx, %r9 + adcq %rbx, %r10 + adcq %rbx, %r11 + movq %r8, Q8(%rsp) + movq %r9, Q8+0x8(%rsp) + movq %r10, Q8+0x10(%rsp) + movq %r11, Q8+0x18(%rsp) + +// Loop as applicable + + decq %rsi + jnz edwards25519_decode_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "ns" condition. We just use the results where +// they were in registers [%r11;%r10;%r9;%r8] instead of re-loading them. + + movl $19, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovns %r8, %rax + cmovns %r9, %rbx + cmovns %r10, %rcx + cmovns %r11, %rdx + btr $63, %rdx + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %rcx, 16(%rdi) + movq %rdx, 24(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/curve25519/edwards25519_decode_alt.S b/x86_att/curve25519/edwards25519_decode_alt.S new file mode 100644 index 0000000000..570b2f9081 --- /dev/null +++ b/x86_att/curve25519/edwards25519_decode_alt.S @@ -0,0 +1,751 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Decode compressed 256-bit form of edwards25519 point +// Input c[32] (bytes); output function return and z[8] +// +// extern uint64_t edwards25519_decode_alt(uint64_t z[static 8],uint8_t c[static 32]); +// +// This interprets the input byte string as a little-endian number +// representing a point (x,y) on the edwards25519 curve, encoded as +// 2^255 * x_0 + y where x_0 is the least significant bit of x. It +// returns the full pair of coordinates x (at z) and y (at z+4). The +// return code is 0 for success and 1 for failure, which means that +// the input does not correspond to the encoding of any edwards25519 +// point. 
This can happen for three reasons, where y = the lowest +// 255 bits of the input: +// +// * y >= p_25519 +// Input y coordinate is not reduced +// * (y^2 - 1) * (1 + d_25519 * y^2) has no modular square root +// There is no x such that (x,y) is on the curve +// * y^2 = 1 and top bit of input is set +// Cannot be the canonical encoding of (0,1) or (0,-1) +// +// Standard x86-64 ABI: RDI = z, RSI = c +// Microsoft x64 ABI: RCX = z, RDX = c +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_decode_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_decode_alt) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define y 0(%rsp) +#define s (4*N)(%rsp) +#define t (8*N)(%rsp) +#define u (12*N)(%rsp) +#define v (16*N)(%rsp) +#define w (20*N)(%rsp) +#define q (24*N)(%rsp) +#define res (28*N)(%rsp) +#define sgnbit (29*N)(%rsp) +#define badun (30*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (32*N) + +// Corrupted versions when stack is down 8 more + +#define q8 (25*N)(%rsp) + +// Syntactic variants to make x86_att version simpler to generate + +#define Y 0 +#define S (4*N) +#define T (8*N) +#define U (12*N) +#define V (16*N) +#define W (20*N) +#define Q8 (25*N) + +S2N_BN_SYMBOL(edwards25519_decode_alt): + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from subroutine offsets + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + callq edwards25519_decode_alt_standard + popq %rsi + popq %rdi + ret + +edwards25519_decode_alt_standard: +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Load the inputs, which can be done word-wise since x86 is little-endian. +// Let y be the lowest 255 bits of the input and sgnbit the desired parity. +// If y >= p_25519 then already flag the input as invalid (badun = 1). 
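The three failure cases above condense into a short functional model of the whole decode (Python; d and sqrt(-1) are computed here rather than taken as the hex constants used further down, and either square root of -1 works since the final sign is normalized by sgnbit; this is a sketch of the mathematics, not of the register-level flow):

    p = 2**255 - 19
    d = -121665 * pow(121666, p - 2, p) % p    # edwards25519 d constant
    sqrt_m1 = pow(2, (p - 1) // 4, p)          # a square root of -1 mod p

    def recover_x(y, sgnbit):
        # Return x with (x, y) on the curve and parity(x) == sgnbit,
        # or None in the invalid cases listed above.
        if y >= p:
            return None                        # y coordinate not reduced
        u = (y * y - 1) % p
        v = (d * y * y + 1) % p
        s = u * pow(u * v, 2**252 - 3, p) % p  # candidate sqrt(u/v)
        if (v * s * s - u) % p:
            s = s * sqrt_m1 % p                # try the other candidate
        if (v * s * s - u) % p:
            return None                        # no square root exists
        if s == 0 and sgnbit:
            return None                        # (0, +/-1) must encode sign 0
        return p - s if (s & 1) != sgnbit else s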
+ + movq (%rsi), %rax + movq %rax, Y(%rsp) + movq 8(%rsi), %rbx + movq %rbx, Y+8(%rsp) + xorl %ebp, %ebp + movq 16(%rsi), %rcx + movq %rcx, Y+16(%rsp) + movq 24(%rsi), %rdx + btr $63, %rdx + movq %rdx, Y+24(%rsp) + adcq %rbp, %rbp + movq %rbp, sgnbit + + addq $19, %rax + adcq $0, %rbx + adcq $0, %rcx + adcq $0, %rdx + shrq $63, %rdx + movq %rdx, badun + +// u = y^2 - 1 (actually y + 2^255-20, not reduced modulo) +// v = 1 + d * y^2 (not reduced modulo from the +1) +// w = u * v + + leaq V(%rsp), %rdi + movq $1, %rsi + leaq Y(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + movq V(%rsp), %rax + subq $20, %rax + movq V+8(%rsp), %rbx + sbbq $0, %rbx + movq V+16(%rsp), %rcx + sbbq $0, %rcx + movq V+24(%rsp), %rdx + sbbq $0, %rdx + btc $63, %rdx + movq %rax, U(%rsp) + movq %rbx, U+8(%rsp) + movq %rcx, U+16(%rsp) + movq %rdx, U+24(%rsp) + + movq $0x75eb4dca135978a3, %rax + movq %rax, W(%rsp) + movq $0x00700a4d4141d8ab, %rax + movq %rax, W+8(%rsp) + movq $0x8cc740797779e898, %rax + movq %rax, W+16(%rsp) + movq $0x52036cee2b6ffe73, %rax + movq %rax, W+24(%rsp) + leaq V(%rsp), %rdi + leaq W(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + movq V(%rsp), %rax + addq $1, %rax + movq V+8(%rsp), %rbx + adcq $0, %rbx + movq V+16(%rsp), %rcx + adcq $0, %rcx + movq V+24(%rsp), %rdx + adcq $0, %rdx + movq %rax, V(%rsp) + movq %rbx, V+8(%rsp) + movq %rcx, V+16(%rsp) + movq %rdx, V+24(%rsp) + + leaq W(%rsp), %rdi + leaq U(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + +// Get s = w^{252-3} as a candidate inverse square root 1/sqrt(w). +// This power tower computation is the same as bignum_invsqrt_p25519 + + leaq T(%rsp), %rdi + movq $1, %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq T(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $10, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $50, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + 
callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $125, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq S(%rsp), %rdi + leaq S(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + +// Compute v' = s^2 * w to discriminate whether the square root sqrt(u/v) +// exists, in which case we should get 0, 1 or -1. + + leaq V(%rsp), %rdi + movq $1, %rsi + leaq S(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq V(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + +// Get the two candidates for sqrt(u / v), one being s = u * w^{252-3} +// and the other being t = s * j_25519 where j_25519 = sqrt(-1). + + leaq S(%rsp), %rdi + leaq U(%rsp), %rsi + leaq S(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + movq $0xc4ee1b274a0ea0b0, %rax + movq %rax, T(%rsp) + movq $0x2f431806ad2fe478, %rax + movq %rax, T+8(%rsp) + movq $0x2b4d00993dfbd7a7, %rax + movq %rax, T+16(%rsp) + movq $0x2b8324804fc1df0b, %rax + movq %rax, T+24(%rsp) + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + +// %rax = 0 <=> s^2 * w = 0 or 1 + + movq V(%rsp), %r8 + movq V+8(%rsp), %r9 + movq V+16(%rsp), %r10 + movq V+24(%rsp), %r11 + movl $1, %eax + notq %rax + andq %r8, %rax + orq %r9, %rax + orq %r10, %rax + orq %r11, %rax + +// %r8 = 0 <=> s^2 * w = -1 (mod p_25519, i.e. s^2 * w = 2^255 - 20) + + addq $20, %r8 + notq %r9 + notq %r10 + bts $63, %r11 + addq $1, %r11 + orq %r9, %r8 + orq %r11, %r10 + orq %r10, %r8 + +// If s^2 * w is not 0 or 1 then replace s by t + + testq %rax, %rax + + movq S(%rsp), %r12 + movq T(%rsp), %rbx + cmovnzq %rbx, %r12 + movq S+8(%rsp), %r13 + movq T+8(%rsp), %rbx + cmovnzq %rbx, %r13 + movq S+16(%rsp), %r14 + movq T+16(%rsp), %rbx + cmovnzq %rbx, %r14 + movq S+24(%rsp), %r15 + movq T+24(%rsp), %rbx + cmovnzq %rbx, %r15 + movq %r12, S(%rsp) + movq %r13, S+8(%rsp) + movq %r14, S+16(%rsp) + movq %r15, S+24(%rsp) + +// Check invalidity, occurring if s^2 * w is not in {0,1,-1} + + cmovzq %rax, %r8 + negq %r8 + sbbq %r8, %r8 + negq %r8 + orq %r8, badun + +// Let [%r11;%r10;%r9;%r8] = s and [%r15;%r14;%r13;%r12] = p_25519 - s + + movq S(%rsp), %r8 + movq $-19, %r12 + subq %r8, %r12 + movq S+8(%rsp), %r9 + movq $-1, %r13 + sbbq %r9, %r13 + movq S+16(%rsp), %r10 + movq $-1, %r14 + sbbq %r10, %r14 + movq S+24(%rsp), %r11 + movq $0x7FFFFFFFFFFFFFFF, %r15 + sbbq %r11, %r15 + +// Decide whether a flip is apparently indicated, s_0 <=> sgnbit +// Decide also if s = 0 by OR-ing its digits. 
Now if a flip is indicated: +// - if s = 0 then mark as invalid +// - if s <> 0 then indeed flip + + movl $1, %ecx + andq %r8, %rcx + xorq sgnbit, %rcx + movq badun, %rdx + movq %rdx, %rsi + orq %rcx, %rdx + xorl %ebp, %ebp + movq %r8, %rax + movq %r9, %rbx + orq %r10, %rax + orq %r11, %rbx + orq %rbx, %rax + cmovzq %rbp, %rcx + cmovnzq %rsi, %rdx + +// Actual selection of x as s or -s, copying of y and return of validity + + testq %rcx, %rcx + + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq res, %rdi + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq Y(%rsp), %rcx + movq %rcx, 32(%rdi) + movq Y+8(%rsp), %rcx + movq %rcx, 40(%rdi) + movq Y+16(%rsp), %rcx + movq %rcx, 48(%rdi) + movq Y+24(%rsp), %rcx + movq %rcx, 56(%rdi) + + movq %rdx, %rax + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +edwards25519_decode_alt_mul_p25519: + movq %rdx, %rcx + movq (%rsi), %rax + mulq (%rcx) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x8(%rsi), %rax + mulq (%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + xorq %r12, %r12 + movq (%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x8(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x10(%rsi), %rax + mulq (%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq (%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x8(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x10(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x18(%rsi), %rax + mulq (%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x8(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x18(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x10(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x18(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + movl $0x26, %esi + movq %r12, %rax + mulq %rsi + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rcx, %rcx + movq %r13, %rax + mulq %rsi + subq %rcx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r14, %rax + mulq %rsi + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq %r15, %rax + mulq %rsi + subq %rcx, %rdx + xorq %rcx, %rcx + addq %rax, %r11 + movq %rdx, %r12 + adcq %rcx, %r12 + shldq $0x1, %r11, %r12 + leaq 0x1(%r12), %rax + movl $0x13, %esi + bts $0x3f, %r11 + imulq %rsi, %rax + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rcx, %r11 + sbbq %rax, %rax + notq %rax + andq %rsi, %rax + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rcx, %r11 + btr $0x3f, %r11 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + 
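+// The reduction tail above is the usual p_25519 pattern: fold the top
+// words back in by 2^256 == 38 (mod p_25519) to get a roughly 257-bit h,
+// then correct using the quotient estimate q = floor(h/2^255) + 1.
+// Roughly, in C-like terms (illustrative only):
+//
+//   t = (h & (2^255 - 1)) + 19 * q;   // h - q * p_25519, offset by 2^255
+//   z = (t >> 255) ? t - 2^255        // estimate was exact
+//                  : t - 19;          // one too big, so add p_25519 back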
ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +edwards25519_decode_alt_nsqr_p25519: + +// Copy input argument into q + + movq (%rdx), %rax + movq 8(%rdx), %rbx + movq 16(%rdx), %rcx + movq 24(%rdx), %rdx + movq %rax, Q8(%rsp) + movq %rbx, Q8+8(%rsp) + movq %rcx, Q8+16(%rsp) + movq %rdx, Q8+24(%rsp) + +// Main squaring loop, accumulating in u consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +edwards25519_decode_alt_loop: + movq Q8(%rsp), %rax + mulq %rax + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq Q8(%rsp), %rax + mulq Q8+0x8(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + xorq %r12, %r12 + movq Q8+0x8(%rsp), %rax + mulq %rax + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq Q8(%rsp), %rax + mulq Q8+0x10(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq Q8(%rsp), %rax + mulq Q8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq Q8+0x8(%rsp), %rax + mulq Q8+0x10(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq Q8+0x8(%rsp), %rax + mulq Q8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq Q8+0x10(%rsp), %rax + mulq %rax + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq Q8+0x10(%rsp), %rax + mulq Q8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq Q8+0x18(%rsp), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + movl $0x26, %ebx + movq %r12, %rax + mulq %rbx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rcx, %rcx + movq %r13, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r14, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq %r15, %rax + mulq %rbx + subq %rcx, %rdx + xorq %rcx, %rcx + addq %rax, %r11 + movq %rdx, %r12 + adcq %rcx, %r12 + shldq $0x1, %r11, %r12 + btr $0x3f, %r11 + movl $0x13, %edx + imulq %r12, %rdx + addq %rdx, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rcx, %r11 + movq %r8, Q8(%rsp) + movq %r9, Q8+0x8(%rsp) + movq %r10, Q8+0x10(%rsp) + movq %r11, Q8+0x18(%rsp) + +// Loop as applicable + + decq %rsi + jnz edwards25519_decode_alt_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "ns" condition. We just use the results where +// they were in registers [%r11;%r10;%r9;%r8] instead of re-loading them. 
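+// A C-like sketch of this final conditional correction (illustrative
+// only):
+//
+//   t = x + 19;                  // four-word add; x < 2^256 - 38
+//   if ((int64_t) t[3] >= 0)     // "ns": x + 19 < 2^255, so x < p_25519
+//       z = x;                   // already strictly reduced
+//   else
+//       z = t - 2^255;           // = x - p_25519, done by clearing bit 63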
+ + movl $19, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovns %r8, %rax + cmovns %r9, %rbx + cmovns %r10, %rcx + cmovns %r11, %rdx + btr $63, %rdx + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %rcx, 16(%rdi) + movq %rdx, 24(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/curve25519/edwards25519_encode.S b/x86_att/curve25519/edwards25519_encode.S new file mode 100644 index 0000000000..bdbaa47232 --- /dev/null +++ b/x86_att/curve25519/edwards25519_encode.S @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Encode edwards25519 point into compressed form as 256-bit number +// Input p[8]; output z[32] (bytes) +// +// extern void edwards25519_encode +// (uint8_t z[static 32], uint64_t p[static 8]); +// +// This assumes that the input buffer p points to a pair of 256-bit +// numbers x (at p) and y (at p+4) representing a point (x,y) on the +// edwards25519 curve. It is assumed that both x and y are < p_25519 +// but there is no checking of this, nor of the fact that (x,y) is +// in fact on the curve. +// +// The output in z is a little-endian array of bytes corresponding to +// the standard compressed encoding of a point as 2^255 * x_0 + y +// where x_0 is the least significant bit of x. +// See "https://datatracker.ietf.org/doc/html/rfc8032#section-5.1.2" +// In this implementation, y is simply truncated to 255 bits, but if +// it is reduced mod p_25519 as expected this does not affect values. +// +// Standard x86-64 ABI: RDI = z, RSI = p +// Microsoft x64 ABI: RCX = z, RDX = p +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_encode) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_encode) + .text + +#define z %rdi +#define p %rsi +#define y0 %rax +#define y1 %rcx +#define y2 %rdx +#define y3 %r8 +#define xb %r9 + +S2N_BN_SYMBOL(edwards25519_encode): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load lowest word of x coordinate in xb and full y as [y3;y2;y1;y0]. 
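+// A C-level sketch of the whole encoding (illustrative only; assumes x
+// and y are reduced as stated in the header comment):
+//
+//   y[3] = (y[3] & ~(1ULL << 63)) | ((x[0] & 1ULL) << 63); // 2^255*x_0 + y
+//   memcpy(z, y, 32);                                      // little-endian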
+ + movq (p), xb + movq 32(p), y0 + movq 40(p), y1 + movq 48(p), y2 + movq 56(p), y3 + +// Compute the encoded form, making the LSB of x the MSB of the encoding + + btr $63, y3 + shlq $63, xb + orq xb, y3 + +// Store back (by the word, since x86 is little-endian anyway) + + movq y0, (z) + movq y1, 8(z) + movq y2, 16(z) + movq y3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/x86_att/curve25519/edwards25519_scalarmulbase.S b/x86_att/curve25519/edwards25519_scalarmulbase.S index a024c9daa4..c44e31724c 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase.S @@ -38,23 +38,22 @@ #define xpy_2 (2*NUMSIZE)(%rsp) #define kxy_2 (3*NUMSIZE)(%rsp) -#define acc (4*NUMSIZE)(%rsp) -#define x_1 (4*NUMSIZE)(%rsp) -#define y_1 (5*NUMSIZE)(%rsp) -#define z_1 (6*NUMSIZE)(%rsp) -#define w_1 (7*NUMSIZE)(%rsp) -#define x_3 (4*NUMSIZE)(%rsp) -#define y_3 (5*NUMSIZE)(%rsp) -#define z_3 (6*NUMSIZE)(%rsp) -#define w_3 (7*NUMSIZE)(%rsp) - -#define tmpspace (8*NUMSIZE)(%rsp) -#define t0 (8*NUMSIZE)(%rsp) -#define t1 (9*NUMSIZE)(%rsp) -#define t2 (10*NUMSIZE)(%rsp) -#define t3 (11*NUMSIZE)(%rsp) -#define t4 (12*NUMSIZE)(%rsp) -#define t5 (13*NUMSIZE)(%rsp) +#define t0 (4*NUMSIZE)(%rsp) +#define t1 (5*NUMSIZE)(%rsp) +#define t2 (6*NUMSIZE)(%rsp) +#define t3 (7*NUMSIZE)(%rsp) +#define t4 (8*NUMSIZE)(%rsp) +#define t5 (9*NUMSIZE)(%rsp) + +#define acc (10*NUMSIZE)(%rsp) +#define x_1 (10*NUMSIZE)(%rsp) +#define y_1 (11*NUMSIZE)(%rsp) +#define z_1 (12*NUMSIZE)(%rsp) +#define w_1 (13*NUMSIZE)(%rsp) +#define x_3 (10*NUMSIZE)(%rsp) +#define y_3 (11*NUMSIZE)(%rsp) +#define z_3 (12*NUMSIZE)(%rsp) +#define w_3 (13*NUMSIZE)(%rsp) // Stable homes for the input result pointer, and other variables @@ -73,6 +72,15 @@ #define NSPACE (15*NUMSIZE+8) +// Syntactic variants to make x86_att version simpler to generate + +#define SCALAR 0 +#define TABENT (1*NUMSIZE) +#define ACC (10*NUMSIZE) +#define X3 (10*NUMSIZE) +#define Z3 (12*NUMSIZE) +#define W3 (13*NUMSIZE) + // Macro wrapping up the basic field multiplication, only trivially // different from a pure function call to bignum_mul_p25519. @@ -337,12 +345,12 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase): pushq %rsi movq %rcx, %rdi movq %rdx, %rsi - callq edwards25519_scalarmulbase_curve25519_x25519base_standard + callq edwards25519_scalarmulbase_standard popq %rsi popq %rdi ret -edwards25519_scalarmulbase_curve25519_x25519base_standard: +edwards25519_scalarmulbase_standard: #endif // Save registers, make room for temps, preserve input arguments. @@ -413,11 +421,11 @@ edwards25519_scalarmulbase_curve25519_x25519base_standard: // And before we store the scalar, test and reset bit 251 to // initialize the main loop just below. - movq %r8, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) + movq %r8, SCALAR(%rsp) + movq %r9, SCALAR+8(%rsp) + movq %r10, SCALAR+16(%rsp) btr $59, %r11 - movq %r11, 24(%rsp) + movq %r11, SCALAR+24(%rsp) // The main part of the computation is in extended-projective coordinates // (X,Y,Z,T), representing an affine point on the edwards25519 curve @@ -428,75 +436,75 @@ edwards25519_scalarmulbase_curve25519_x25519base_standard: // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. 
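+// A hedged sketch of the selection that follows (illustrative only): the
+// tables hold (X,Y,T) with Z = 1 implicit, and CF = bit 251 of the
+// reduced scalar from the btr above, so word by word this is
+//
+//   for (i = 0; i < 8; i++)  acc[i]    = cf ? g251[i]   : g0[i];    // X,Y
+//   acc[8] = 1;  acc[9] = acc[10] = acc[11] = 0;                    // Z = 1
+//   for (i = 0; i < 4; i++)  acc[12+i] = cf ? g251[8+i] : g0[8+i];  // T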
- leaq edwards25519_scalarmulbase_edwards25519_0g(%rip), %r10 - leaq edwards25519_scalarmulbase_edwards25519_251g(%rip), %r11 + leaq edwards25519_scalarmulbase_0g(%rip), %r10 + leaq edwards25519_scalarmulbase_251g(%rip), %r11 movq (%r10), %rax movq (%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*16(%rsp) + movq %rax, ACC(%rsp) movq 8*1(%r10), %rax movq 8*1(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*17(%rsp) + movq %rax, ACC+8(%rsp) movq 8*2(%r10), %rax movq 8*2(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*18(%rsp) + movq %rax, ACC+16(%rsp) movq 8*3(%r10), %rax movq 8*3(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*19(%rsp) + movq %rax, ACC+24(%rsp) movq 8*4(%r10), %rax movq 8*4(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*20(%rsp) + movq %rax, ACC+32(%rsp) movq 8*5(%r10), %rax movq 8*5(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*21(%rsp) + movq %rax, ACC+40(%rsp) movq 8*6(%r10), %rax movq 8*6(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*22(%rsp) + movq %rax, ACC+48(%rsp) movq 8*7(%r10), %rax movq 8*7(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*23(%rsp) + movq %rax, ACC+56(%rsp) movl $1, %eax - movq %rax, 8*24(%rsp) + movq %rax, ACC+64(%rsp) movl $0, %eax - movq %rax, 8*25(%rsp) - movq %rax, 8*26(%rsp) - movq %rax, 8*27(%rsp) + movq %rax, ACC+72(%rsp) + movq %rax, ACC+80(%rsp) + movq %rax, ACC+88(%rsp) movq 8*8(%r10), %rax movq 8*8(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*28(%rsp) + movq %rax, ACC+96(%rsp) movq 8*9(%r10), %rax movq 8*9(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*29(%rsp) + movq %rax, ACC+104(%rsp) movq 8*10(%r10), %rax movq 8*10(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*30(%rsp) + movq %rax, ACC+112(%rsp) movq 8*11(%r10), %rax movq 8*11(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*31(%rsp) + movq %rax, ACC+120(%rsp) // The counter "i" tracks the bit position for which the scalar has // already been absorbed, starting at 0 and going up in chunks of 4. @@ -512,7 +520,7 @@ edwards25519_scalarmulbase_curve25519_x25519base_standard: // end because we made sure bit 251 is clear in the reduced scalar. movq $0, i - leaq edwards25519_scalarmulbase_edwards25519_gtable(%rip), %rax + leaq edwards25519_scalarmulbase_gtable(%rip), %rax movq %rax, tab movq $0, bias @@ -804,26 +812,26 @@ edwards25519_scalarmulbase_scalarloop: movq %rax, %rsi cmovnzq %r8, %rsi cmovnzq %rax, %r8 - movq %rsi, 32(%rsp) - movq %r8, 64(%rsp) + movq %rsi, TABENT(%rsp) + movq %r8, TABENT+32(%rsp) movq %rbx, %rsi cmovnzq %r9, %rsi cmovnzq %rbx, %r9 - movq %rsi, 40(%rsp) - movq %r9, 72(%rsp) + movq %rsi, TABENT+8(%rsp) + movq %r9, TABENT+40(%rsp) movq %rcx, %rsi cmovnzq %r10, %rsi cmovnzq %rcx, %r10 - movq %rsi, 48(%rsp) - movq %r10, 80(%rsp) + movq %rsi, TABENT+16(%rsp) + movq %r10, TABENT+48(%rsp) movq %rdx, %rsi cmovnzq %r11, %rsi cmovnzq %rdx, %r11 - movq %rsi, 56(%rsp) - movq %r11, 88(%rsp) + movq %rsi, TABENT+24(%rsp) + movq %r11, TABENT+56(%rsp) movq $-19, %rax movq $-1, %rbx @@ -844,10 +852,10 @@ edwards25519_scalarmulbase_scalarloop: cmovzq %r13, %rbx cmovzq %r14, %rcx cmovzq %r15, %rdx - movq %rax, 96(%rsp) - movq %rbx, 104(%rsp) - movq %rcx, 112(%rsp) - movq %rdx, 120(%rsp) + movq %rax, TABENT+64(%rsp) + movq %rbx, TABENT+72(%rsp) + movq %rcx, TABENT+80(%rsp) + movq %rdx, TABENT+88(%rsp) // Extended-projective and precomputed mixed addition. // This is effectively the same as calling the standalone @@ -884,10 +892,10 @@ edwards25519_scalarmulbase_scalarloop: // point on we don't need any normalization of the coordinates // except for making sure that they fit in 4 digits. 
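+// What the fixup below does, as a short sketch (illustrative only): if
+// bit 63 of the stored top scalar word is set, X is replaced by its
+// negation modulo p_25519, which still fits in four digits:
+//
+//   if (scalar[3] >> 63)  X3 = (2^256 - 38) - X3;  // 2*p_25519 - X3 = -X3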
- movq 128(%rsp), %r8 - movq 136(%rsp), %r9 - movq 144(%rsp), %r10 - movq 152(%rsp), %r11 + movq X3(%rsp), %r8 + movq X3+8(%rsp), %r9 + movq X3+16(%rsp), %r10 + movq X3+24(%rsp), %r11 movq $0xffffffffffffffda, %r12 subq %r8, %r12 movq $0xffffffffffffffff, %r13 @@ -896,424 +904,1377 @@ edwards25519_scalarmulbase_scalarloop: sbbq %r10, %r14 movq $0xffffffffffffffff, %r15 sbbq %r11, %r15 - movq 24(%rsp), %rax + movq SCALAR+24(%rsp), %rax btq $63, %rax cmovcq %r12, %r8 cmovcq %r13, %r9 cmovcq %r14, %r10 cmovcq %r15, %r11 - movq %r8, 128(%rsp) - movq %r9, 136(%rsp) - movq %r10, 144(%rsp) - movq %r11, 152(%rsp) + movq %r8, X3(%rsp) + movq %r9, X3+8(%rsp) + movq %r10, X3+16(%rsp) + movq %r11, X3+24(%rsp) // Now we need to map out of the extended-projective representation // (X,Y,Z,W) back to the affine form (x,y) = (X/Z,Y/Z). This means // first calling the modular inverse to get w_3 = 1/z_3. - movq $4, %rdi - leaq 224(%rsp), %rsi - leaq 192(%rsp), %rdx - leaq edwards25519_scalarmulbase_p_25519(%rip), %rcx - leaq 256(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. - - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + leaq W3(%rsp), %rdi + leaq Z3(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, x_3, y_3, +// z_3 and w_3. 
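+// In effect the inlined block below is just the following call (prototype
+// shown for orientation only, matching the output/input argument order
+// used across s2n-bignum):
+//
+//   bignum_inv_p25519(w_3, z_3);    // w_3 = z_3^{-1} (mod p_25519)
+//
+// after which x = X * w_3 and y = Y * w_3 give the affine point.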
+ + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -edwards25519_scalarmulbase_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -edwards25519_scalarmulbase_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -edwards25519_scalarmulbase_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp edwards25519_scalarmulbase_midloop +edwards25519_scalarmulbase_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne edwards25519_scalarmulbase_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - 
movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmulbase_wmontend -edwards25519_scalarmulbase_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq 
(%r15,%r9,8), %rax + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_wmontloop -edwards25519_scalarmulbase_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_wcorrloop + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmulbase_zmontend -edwards25519_scalarmulbase_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_zmontloop -edwards25519_scalarmulbase_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, 
%rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -edwards25519_scalarmulbase_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +edwards25519_scalarmulbase_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -edwards25519_scalarmulbase_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -edwards25519_scalarmulbase_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne edwards25519_scalarmulbase_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_fliploop - subq $0x3a, 0x20(%rsp) - ja edwards25519_scalarmulbase_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi 
+ testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi 
+ leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq 
%rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 
+ movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, 
%rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne edwards25519_scalarmulbase_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq 
%rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // The final result is x = X * inv(Z), y = Y * inv(Z). // These are the only operations in the whole computation that @@ -1344,18 +2305,10 @@ edwards25519_scalarmulbase_fliploop: // .section .rodata // **************************************************************************** -// The modulus, for the modular inverse - -edwards25519_scalarmulbase_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 0 * B = 0 and 2^251 * B in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. -edwards25519_scalarmulbase_edwards25519_0g: +edwards25519_scalarmulbase_0g: .quad 0x0000000000000000 .quad 0x0000000000000000 @@ -1372,7 +2325,7 @@ edwards25519_scalarmulbase_edwards25519_0g: .quad 0x0000000000000000 .quad 0x0000000000000000 -edwards25519_scalarmulbase_edwards25519_251g: +edwards25519_scalarmulbase_251g: .quad 0x525f946d7c7220e7 .quad 0x4636b0b2f1e35444 @@ -1390,7 +2343,7 @@ edwards25519_scalarmulbase_edwards25519_251g: // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. 
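+// For orientation, the standard mixed-addition identity these triples
+// support (a known identity, not additional code): with the accumulator
+// in extended-projective form (X1,Y1,Z1,T1) and a table entry
+// (y-x, x+y, 2*d*x*y),
+//
+//   A = (Y1+X1)*(x+y),  B = (Y1-X1)*(y-x),  C = T1*(2*d*x*y),  D = 2*Z1,
+//   E = A-B,  F = D-C,  G = D+C,  H = A+B,
+//   X3 = E*F,  Y3 = G*H,  Z3 = F*G,  T3 = E*H.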
-edwards25519_scalarmulbase_edwards25519_gtable: +edwards25519_scalarmulbase_gtable: // 2^0 * 1 * G diff --git a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S index e66492083f..00b91fe1aa 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S @@ -38,23 +38,22 @@ #define xpy_2 (2*NUMSIZE)(%rsp) #define kxy_2 (3*NUMSIZE)(%rsp) -#define acc (4*NUMSIZE)(%rsp) -#define x_1 (4*NUMSIZE)(%rsp) -#define y_1 (5*NUMSIZE)(%rsp) -#define z_1 (6*NUMSIZE)(%rsp) -#define w_1 (7*NUMSIZE)(%rsp) -#define x_3 (4*NUMSIZE)(%rsp) -#define y_3 (5*NUMSIZE)(%rsp) -#define z_3 (6*NUMSIZE)(%rsp) -#define w_3 (7*NUMSIZE)(%rsp) - -#define tmpspace (8*NUMSIZE)(%rsp) -#define t0 (8*NUMSIZE)(%rsp) -#define t1 (9*NUMSIZE)(%rsp) -#define t2 (10*NUMSIZE)(%rsp) -#define t3 (11*NUMSIZE)(%rsp) -#define t4 (12*NUMSIZE)(%rsp) -#define t5 (13*NUMSIZE)(%rsp) +#define t0 (4*NUMSIZE)(%rsp) +#define t1 (5*NUMSIZE)(%rsp) +#define t2 (6*NUMSIZE)(%rsp) +#define t3 (7*NUMSIZE)(%rsp) +#define t4 (8*NUMSIZE)(%rsp) +#define t5 (9*NUMSIZE)(%rsp) + +#define acc (10*NUMSIZE)(%rsp) +#define x_1 (10*NUMSIZE)(%rsp) +#define y_1 (11*NUMSIZE)(%rsp) +#define z_1 (12*NUMSIZE)(%rsp) +#define w_1 (13*NUMSIZE)(%rsp) +#define x_3 (10*NUMSIZE)(%rsp) +#define y_3 (11*NUMSIZE)(%rsp) +#define z_3 (12*NUMSIZE)(%rsp) +#define w_3 (13*NUMSIZE)(%rsp) // Stable homes for the input result pointer, and other variables @@ -73,6 +72,15 @@ #define NSPACE (15*NUMSIZE+8) +// Syntactic variants to make x86_att version simpler to generate + +#define SCALAR 0 +#define TABENT (1*NUMSIZE) +#define ACC (10*NUMSIZE) +#define X3 (10*NUMSIZE) +#define Z3 (12*NUMSIZE) +#define W3 (13*NUMSIZE) + // Macro wrapping up the basic field multiplication, only trivially // different from a pure function call to bignum_mul_p25519_alt. @@ -413,12 +421,12 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase_alt): pushq %rsi movq %rcx, %rdi movq %rdx, %rsi - callq edwards25519_scalarmulbase_alt_curve25519_x25519base_standard + callq edwards25519_scalarmulbase_alt_standard popq %rsi popq %rdi ret -edwards25519_scalarmulbase_alt_curve25519_x25519base_standard: +edwards25519_scalarmulbase_alt_standard: #endif // Save registers, make room for temps, preserve input arguments. @@ -489,11 +497,11 @@ edwards25519_scalarmulbase_alt_curve25519_x25519base_standard: // And before we store the scalar, test and reset bit 251 to // initialize the main loop just below. - movq %r8, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) + movq %r8, SCALAR(%rsp) + movq %r9, SCALAR+8(%rsp) + movq %r10, SCALAR+16(%rsp) btr $59, %r11 - movq %r11, 24(%rsp) + movq %r11, SCALAR+24(%rsp) // The main part of the computation is in extended-projective coordinates // (X,Y,Z,T), representing an affine point on the edwards25519 curve @@ -504,75 +512,75 @@ edwards25519_scalarmulbase_alt_curve25519_x25519base_standard: // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. 
- leaq edwards25519_scalarmulbase_alt_edwards25519_0g(%rip), %r10 - leaq edwards25519_scalarmulbase_alt_edwards25519_251g(%rip), %r11 + leaq edwards25519_scalarmulbase_alt_0g(%rip), %r10 + leaq edwards25519_scalarmulbase_alt_251g(%rip), %r11 movq (%r10), %rax movq (%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*16(%rsp) + movq %rax, ACC(%rsp) movq 8*1(%r10), %rax movq 8*1(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*17(%rsp) + movq %rax, ACC+8(%rsp) movq 8*2(%r10), %rax movq 8*2(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*18(%rsp) + movq %rax, ACC+16(%rsp) movq 8*3(%r10), %rax movq 8*3(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*19(%rsp) + movq %rax, ACC+24(%rsp) movq 8*4(%r10), %rax movq 8*4(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*20(%rsp) + movq %rax, ACC+32(%rsp) movq 8*5(%r10), %rax movq 8*5(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*21(%rsp) + movq %rax, ACC+40(%rsp) movq 8*6(%r10), %rax movq 8*6(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*22(%rsp) + movq %rax, ACC+48(%rsp) movq 8*7(%r10), %rax movq 8*7(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*23(%rsp) + movq %rax, ACC+56(%rsp) movl $1, %eax - movq %rax, 8*24(%rsp) + movq %rax, ACC+64(%rsp) movl $0, %eax - movq %rax, 8*25(%rsp) - movq %rax, 8*26(%rsp) - movq %rax, 8*27(%rsp) + movq %rax, ACC+72(%rsp) + movq %rax, ACC+80(%rsp) + movq %rax, ACC+88(%rsp) movq 8*8(%r10), %rax movq 8*8(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*28(%rsp) + movq %rax, ACC+96(%rsp) movq 8*9(%r10), %rax movq 8*9(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*29(%rsp) + movq %rax, ACC+104(%rsp) movq 8*10(%r10), %rax movq 8*10(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*30(%rsp) + movq %rax, ACC+112(%rsp) movq 8*11(%r10), %rax movq 8*11(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*31(%rsp) + movq %rax, ACC+120(%rsp) // The counter "i" tracks the bit position for which the scalar has // already been absorbed, starting at 0 and going up in chunks of 4. @@ -588,7 +596,7 @@ edwards25519_scalarmulbase_alt_curve25519_x25519base_standard: // end because we made sure bit 251 is clear in the reduced scalar. movq $0, i - leaq edwards25519_scalarmulbase_alt_edwards25519_gtable(%rip), %rax + leaq edwards25519_scalarmulbase_alt_gtable(%rip), %rax movq %rax, tab movq $0, bias @@ -880,26 +888,26 @@ edwards25519_scalarmulbase_alt_scalarloop: movq %rax, %rsi cmovnzq %r8, %rsi cmovnzq %rax, %r8 - movq %rsi, 32(%rsp) - movq %r8, 64(%rsp) + movq %rsi, TABENT(%rsp) + movq %r8, TABENT+32(%rsp) movq %rbx, %rsi cmovnzq %r9, %rsi cmovnzq %rbx, %r9 - movq %rsi, 40(%rsp) - movq %r9, 72(%rsp) + movq %rsi, TABENT+8(%rsp) + movq %r9, TABENT+40(%rsp) movq %rcx, %rsi cmovnzq %r10, %rsi cmovnzq %rcx, %r10 - movq %rsi, 48(%rsp) - movq %r10, 80(%rsp) + movq %rsi, TABENT+16(%rsp) + movq %r10, TABENT+48(%rsp) movq %rdx, %rsi cmovnzq %r11, %rsi cmovnzq %rdx, %r11 - movq %rsi, 56(%rsp) - movq %r11, 88(%rsp) + movq %rsi, TABENT+24(%rsp) + movq %r11, TABENT+56(%rsp) movq $-19, %rax movq $-1, %rbx @@ -920,10 +928,10 @@ edwards25519_scalarmulbase_alt_scalarloop: cmovzq %r13, %rbx cmovzq %r14, %rcx cmovzq %r15, %rdx - movq %rax, 96(%rsp) - movq %rbx, 104(%rsp) - movq %rcx, 112(%rsp) - movq %rdx, 120(%rsp) + movq %rax, TABENT+64(%rsp) + movq %rbx, TABENT+72(%rsp) + movq %rcx, TABENT+80(%rsp) + movq %rdx, TABENT+88(%rsp) // Extended-projective and precomputed mixed addition. 
// This is effectively the same as calling the standalone @@ -960,10 +968,10 @@ edwards25519_scalarmulbase_alt_scalarloop: // point on we don't need any normalization of the coordinates // except for making sure that they fit in 4 digits. - movq 128(%rsp), %r8 - movq 136(%rsp), %r9 - movq 144(%rsp), %r10 - movq 152(%rsp), %r11 + movq X3(%rsp), %r8 + movq X3+8(%rsp), %r9 + movq X3+16(%rsp), %r10 + movq X3+24(%rsp), %r11 movq $0xffffffffffffffda, %r12 subq %r8, %r12 movq $0xffffffffffffffff, %r13 @@ -972,424 +980,1377 @@ edwards25519_scalarmulbase_alt_scalarloop: sbbq %r10, %r14 movq $0xffffffffffffffff, %r15 sbbq %r11, %r15 - movq 24(%rsp), %rax + movq SCALAR+24(%rsp), %rax btq $63, %rax cmovcq %r12, %r8 cmovcq %r13, %r9 cmovcq %r14, %r10 cmovcq %r15, %r11 - movq %r8, 128(%rsp) - movq %r9, 136(%rsp) - movq %r10, 144(%rsp) - movq %r11, 152(%rsp) + movq %r8, X3(%rsp) + movq %r9, X3+8(%rsp) + movq %r10, X3+16(%rsp) + movq %r11, X3+24(%rsp) // Now we need to map out of the extended-projective representation // (X,Y,Z,W) back to the affine form (x,y) = (X/Z,Y/Z). This means // first calling the modular inverse to get w_3 = 1/z_3. - movq $4, %rdi - leaq 224(%rsp), %rsi - leaq 192(%rsp), %rdx - leaq edwards25519_scalarmulbase_alt_p_25519(%rip), %rcx - leaq 256(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. - - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_alt_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + leaq W3(%rsp), %rdi + leaq Z3(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, x_3, y_3, +// z_3 and w_3. 
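+//
+// (Editorial sketch, not in the original: whichever algorithm is used, the
+// contract here is just w_3 * z_3 == 1 (mod p_25519). A single-word toy
+// reference model via Fermat's little theorem, with a hypothetical mulmod
+// helper, would be
+//
+//   uint64_t r = 1, b = z % p, e = p - 2;     // p prime, z nonzero mod p
+//   while (e) {                               // square-and-multiply
+//       if (e & 1) r = mulmod(r, b, p);
+//       b = mulmod(b, b, p);
+//       e >>= 1;
+//   }                                         // now r == 1/z mod p
+//
+// whereas the code inlined below uses the divstep-based bignum_inv_p25519,
+// which is much faster than exponentiation for this field.)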
+ + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -edwards25519_scalarmulbase_alt_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_alt_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -edwards25519_scalarmulbase_alt_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp edwards25519_scalarmulbase_alt_midloop +edwards25519_scalarmulbase_alt_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne edwards25519_scalarmulbase_alt_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 
0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_alt_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmulbase_alt_wmontend -edwards25519_scalarmulbase_alt_wmontloop: - adcq 
(%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_alt_wmontloop -edwards25519_scalarmulbase_alt_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_alt_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_alt_wcorrloop + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmulbase_alt_zmontend -edwards25519_scalarmulbase_alt_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_alt_zmontloop -edwards25519_scalarmulbase_alt_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_alt_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + 
addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_alt_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -edwards25519_scalarmulbase_alt_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +edwards25519_scalarmulbase_alt_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_alt_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -edwards25519_scalarmulbase_alt_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_alt_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -edwards25519_scalarmulbase_alt_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne edwards25519_scalarmulbase_alt_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb 
edwards25519_scalarmulbase_alt_fliploop - subq $0x3a, 0x20(%rsp) - ja edwards25519_scalarmulbase_alt_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq 
%rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq 
%r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, 
%rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq 
%r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne edwards25519_scalarmulbase_alt_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + 
movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // The final result is x = X * inv(Z), y = Y * inv(Z). // These are the only operations in the whole computation that @@ -1420,18 +2381,10 @@ edwards25519_scalarmulbase_alt_fliploop: // .section .rodata // **************************************************************************** -// The modulus, for the modular inverse - -edwards25519_scalarmulbase_alt_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 0 * B = 0 and 2^251 * B in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. -edwards25519_scalarmulbase_alt_edwards25519_0g: +edwards25519_scalarmulbase_alt_0g: .quad 0x0000000000000000 .quad 0x0000000000000000 @@ -1448,7 +2401,7 @@ edwards25519_scalarmulbase_alt_edwards25519_0g: .quad 0x0000000000000000 .quad 0x0000000000000000 -edwards25519_scalarmulbase_alt_edwards25519_251g: +edwards25519_scalarmulbase_alt_251g: .quad 0x525f946d7c7220e7 .quad 0x4636b0b2f1e35444 @@ -1466,7 +2419,7 @@ edwards25519_scalarmulbase_alt_edwards25519_251g: // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. -edwards25519_scalarmulbase_alt_edwards25519_gtable: +edwards25519_scalarmulbase_alt_gtable: // 2^0 * 1 * G diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble.S b/x86_att/curve25519/edwards25519_scalarmuldouble.S new file mode 100644 index 0000000000..35fd7f4ffc --- /dev/null +++ b/x86_att/curve25519/edwards25519_scalarmuldouble.S @@ -0,0 +1,3619 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Double scalar multiplication for edwards25519, fresh and base point +// Input scalar[4], point[8], bscalar[4]; output res[8] +// +// extern void edwards25519_scalarmuldouble +// (uint64_t res[static 8],uint64_t scalar[static 4], +// uint64_t point[static 8],uint64_t bscalar[static 4]); +// +// Given scalar = n, point = P and bscalar = m, returns in res +// the point (X,Y) = n * P + m * B where B = (...,4/5) is +// the standard basepoint for the edwards25519 (Ed25519) curve. +// +// Both 256-bit coordinates of the input point P are implicitly +// reduced modulo 2^255-19 if they are not already in reduced form, +// but the conventional usage is that they *are* already reduced. +// The scalars can be arbitrary 256-bit numbers but may also be +// considered as implicitly reduced modulo the group order. 
+// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point, RCX = bscalar +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point, R9 = bscalar +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_scalarmuldouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_scalarmuldouble) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for result and temporaries on stack with some aliasing. +// Both "resx" and "resy" assume the "res" pointer has been preloaded into %rbp. + +#define resx (0*NUMSIZE)(%rbp) +#define resy (1*NUMSIZE)(%rbp) + +#define scalar (0*NUMSIZE)(%rsp) +#define bscalar (1*NUMSIZE)(%rsp) + +#define tabent (2*NUMSIZE)(%rsp) +#define btabent (6*NUMSIZE)(%rsp) + +#define acc (9*NUMSIZE)(%rsp) + +#define tab (13*NUMSIZE)(%rsp) + +// Additional variables kept on the stack + +#define bf 45*NUMSIZE(%rsp) +#define cf 45*NUMSIZE+8(%rsp) +#define i 45*NUMSIZE+16(%rsp) +#define res 45*NUMSIZE+24(%rsp) + +// Total size to reserve on the stack (excluding local subroutines) + +#define NSPACE (46*NUMSIZE) + +// Syntactic variants to make x86_att forms easier to generate + +#define SCALAR (0*NUMSIZE) +#define BSCALAR (1*NUMSIZE) +#define TABENT (2*NUMSIZE) +#define BTABENT (6*NUMSIZE) +#define ACC (9*NUMSIZE) +#define TAB (13*NUMSIZE) + +// Sub-references used in local subroutines with local stack + +#define x_0 0(%rdi) +#define y_0 NUMSIZE(%rdi) +#define z_0 (2*NUMSIZE)(%rdi) +#define w_0 (3*NUMSIZE)(%rdi) + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) +#define w_1 (3*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) +#define w_2 (3*NUMSIZE)(%rbp) + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) +#define t5 (5*NUMSIZE)(%rsp) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519. 
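+//
+// (Editorial note, not in the original: the reductions in this macro and the
+// ones below all rest on 2^256 == 38 (mod p_25519). Writing the 512-bit
+// product as h*2^256 + l with 256-bit halves h and l gives
+//
+//   h*2^256 + l == l + 38*h (mod p_25519)
+//
+// so a single multiply-by-38 folding pass shrinks the value to well under
+// 2^262, after which a quotient estimate from the top bits and a
+// multiply-by-19 correction (2^255 == 19 mod p_25519) finish the job.)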
+ +#define mul_p25519(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + incq %r12; \ + bts $63, %r11 ; \ + mulxq %r12, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
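+//
+// (Editorial note: intermediate values merely bounded by 2 * p_25519 are
+// fine because all later operations are again performed modulo p_25519;
+// only values leaving the computation, via mul_p25519, need the canonical
+// form. Any inputs x, y < 2^256 give a product < 2^512, which the
+// fold-by-38 path handles whether or not they were fully reduced.)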
+ +#define mul_4(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
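+//
+// (Editorial note: squaring needs only the 10 distinct limb products
+// a_i * a_j with i <= j, since
+//
+//   (a0 + a1*2^64 + a2*2^128 + a3*2^192)^2
+//     = sum_i a_i^2 * 2^(128*i)  +  2 * sum_{i<j} a_i*a_j * 2^(64*(i+j))
+//
+// and the doubling of the off-diagonal terms is what the paired adcx/adox
+// passes in the middle of the macro implement.)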
+ +#define sqr_4(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. 
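+//
+// (Editorial sketch, hypothetical names: add_twice4 and double_twice4 below
+// both follow the pattern
+//
+//   r = x + y;           // 256-bit addition with carry out c
+//   r += c ? 38 : 0;     // fold 2^256 == 38 (mod 2 * p_25519) back in, cmov
+//
+// and the second addition cannot carry again provided at least one input
+// was below 2^256 - 38 to start with.)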
+ +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define double_twice4(P0,P1) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq %r8, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq %r9, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq %r10, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq %r11, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Load the constant k_25519 = 2 * d_25519 using immediate operations + +#define load_k25519(P0) \ + movq $0xebd69b9426b2f159, %rax ; \ + movq %rax, P0 ; \ + movq $0x00e0149a8283b156, %rax ; \ + movq %rax, 8+P0 ; \ + movq $0x198e80f2eef3d130, %rax ; \ + movq %rax, 16+P0 ; \ + movq $0x2406d9dc56dffce7, %rax ; \ + movq %rax, 24+P0 + +S2N_BN_SYMBOL(edwards25519_scalarmuldouble): + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from keeping code and data together. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + callq edwards25519_scalarmuldouble_standard + popq %rsi + popq %rdi + ret + +edwards25519_scalarmuldouble_standard: +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $NSPACE, %rsp + +// Move the output pointer to a stable place + + movq %rdi, res + +// Copy scalars while recoding all 4-bit nybbles except the top +// one (bits 252..255) into signed 4-bit digits. This is essentially +// done just by adding the recoding constant 0x0888..888, after +// which all digits except the first have an implicit bias of -8, +// so 0 -> -8, 1 -> -7, ... 7 -> -1, 8 -> 0, 9 -> 1, ... 15 -> 7. +// (We could literally create 2s complement signed nybbles by +// XORing with the same constant 0x0888..888 afterwards, but it +// doesn't seem to make the end usage any simpler.) +// +// In order to ensure that the unrecoded top nybble (bits 252..255) +// does not become > 8 as a result of carries lower down from the +// recoding, we first (conceptually) subtract the group order iff +// the top digit of the scalar is > 2^63. In the implementation the +// reduction and recoding are combined by optionally using the +// modified recoding constant 0x0888...888 + (2^256 - group_order). 
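+//
+// (Editorial sketch, not in the original: after this recoding, a
+// hypothetical accessor for a stored scalar s[4] would read
+//
+//   int digit(const uint64_t s[4], int i) {      // 0 <= i <= 63
+//       int d = (s[i >> 4] >> (4 * (i & 15))) & 0xf;
+//       return (i == 63) ? d : d - 8;            // top nybble is unbiased
+//   }
+//
+// so every nybble below the top one denotes a value in -8..7, which the
+// bf/cf bookkeeping in the main loop accounts for.)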
+ + movq (%rcx), %r8 + movq 8(%rcx), %r9 + movq 16(%rcx), %r10 + movq 24(%rcx), %r11 + movq $0xc7f56fb5a0d9e920, %r12 + movq $0xe190b99370cba1d5, %r13 + movq $0x8888888888888887, %r14 + movq $0x8888888888888888, %r15 + movq $0x8000000000000000, %rax + movq $0x0888888888888888, %rbx + cmpq %r11, %rax + cmovncq %r15, %r12 + cmovncq %r15, %r13 + cmovncq %r15, %r14 + cmovncq %rbx, %r15 + addq %r12, %r8 + adcq %r13, %r9 + adcq %r14, %r10 + adcq %r15, %r11 + movq %r8, BSCALAR(%rsp) + movq %r9, BSCALAR+8(%rsp) + movq %r10, BSCALAR+16(%rsp) + movq %r11, BSCALAR+24(%rsp) + + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + movq $0xc7f56fb5a0d9e920, %r12 + movq $0xe190b99370cba1d5, %r13 + movq $0x8888888888888887, %r14 + movq $0x8888888888888888, %r15 + movq $0x8000000000000000, %rax + movq $0x0888888888888888, %rbx + cmpq %r11, %rax + cmovncq %r15, %r12 + cmovncq %r15, %r13 + cmovncq %r15, %r14 + cmovncq %rbx, %r15 + addq %r12, %r8 + adcq %r13, %r9 + adcq %r14, %r10 + adcq %r15, %r11 + movq %r8, SCALAR(%rsp) + movq %r9, SCALAR+8(%rsp) + movq %r10, SCALAR+16(%rsp) + movq %r11, SCALAR+24(%rsp) + +// Create table of multiples 1..8 of the general input point at "tab". +// Reduce the input coordinates x and y modulo 2^256 - 38 first, for the +// sake of definiteness; this is the reduction that will be maintained. +// We could slightly optimize the additions because we know the input +// point is affine (so Z = 1), but it doesn't seem worth the complication. + + movl $38, %eax + movq (%rdx), %r8 + xorl %ebx, %ebx + movq 8(%rdx), %r9 + xorl %ecx, %ecx + movq 16(%rdx), %r10 + xorl %esi, %esi + movq 24(%rdx), %r11 + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rsi + cmovncq %r8, %rax + movq %rax, TAB(%rsp) + cmovncq %r9, %rbx + movq %rbx, TAB+8(%rsp) + cmovncq %r10, %rcx + movq %rcx, TAB+16(%rsp) + cmovncq %r11, %rsi + movq %rsi, TAB+24(%rsp) + + movl $38, %eax + movq 32(%rdx), %r8 + xorl %ebx, %ebx + movq 40(%rdx), %r9 + xorl %ecx, %ecx + movq 48(%rdx), %r10 + xorl %esi, %esi + movq 56(%rdx), %r11 + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rsi + cmovncq %r8, %rax + movq %rax, TAB+32(%rsp) + cmovncq %r9, %rbx + movq %rbx, TAB+40(%rsp) + cmovncq %r10, %rcx + movq %rcx, TAB+48(%rsp) + cmovncq %r11, %rsi + movq %rsi, TAB+56(%rsp) + + movl $1, %eax + movq %rax, TAB+64(%rsp) + xorl %eax, %eax + movq %rax, TAB+72(%rsp) + movq %rax, TAB+80(%rsp) + movq %rax, TAB+88(%rsp) + + leaq TAB+96(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+32(%rsp), %rbp + mul_4(x_0,x_1,x_2) + +// Multiple 2 + + leaq TAB+1*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq edwards25519_scalarmuldouble_epdouble + +// Multiple 3 + + leaq TAB+2*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+1*128(%rsp), %rbp + callq edwards25519_scalarmuldouble_epadd + +// Multiple 4 + + leaq TAB+3*128(%rsp), %rdi + leaq TAB+1*128(%rsp), %rsi + callq edwards25519_scalarmuldouble_epdouble + +// Multiple 5 + + leaq TAB+4*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+3*128(%rsp), %rbp + callq edwards25519_scalarmuldouble_epadd + +// Multiple 6 + + leaq TAB+5*128(%rsp), %rdi + leaq TAB+2*128(%rsp), %rsi + callq edwards25519_scalarmuldouble_epdouble + +// Multiple 7 + + leaq TAB+6*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+5*128(%rsp), %rbp + callq edwards25519_scalarmuldouble_epadd + +// Multiple 8 + + leaq TAB+7*128(%rsp), %rdi + leaq TAB+3*128(%rsp), %rsi + callq edwards25519_scalarmuldouble_epdouble + +// Handle the initialization, starting the loop counter at i = 252 +// 
and initializing acc to the sum of the table entries for the +// top nybbles of the scalars (the ones with no implicit -8 bias). + + movq $252, %rax + movq %rax, i + +// Index for btable entry... + + movq BSCALAR+24(%rsp), %rax + shrq $60, %rax + movq %rax, bf + +// ...and constant-time indexing based on that index + + movl $1, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + movl $1, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + leaq edwards25519_scalarmuldouble_table(%rip), %rbp + + cmpq $1, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $2, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $3, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $4, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $5, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $6, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq 
%rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $7, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $8, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + + movq %rax, BTABENT(%rsp) + movq %rbx, BTABENT+8(%rsp) + movq %rcx, BTABENT+16(%rsp) + movq %rdx, BTABENT+24(%rsp) + movq %r8, BTABENT+32(%rsp) + movq %r9, BTABENT+40(%rsp) + movq %r10, BTABENT+48(%rsp) + movq %r11, BTABENT+56(%rsp) + movq %r12, BTABENT+64(%rsp) + movq %r13, BTABENT+72(%rsp) + movq %r14, BTABENT+80(%rsp) + movq %r15, BTABENT+88(%rsp) + +// Index for table entry... + + movq SCALAR+24(%rsp), %rax + shrq $60, %rax + movq %rax, bf + +// ...and constant-time indexing based on that index. +// Do the Y and Z fields first, to save on registers... 
+ + movl $1, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + movl $1, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + + leaq TAB+32(%rsp), %rbp + + cmpq $1, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $2, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $3, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $4, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $5, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $6, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $7, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $8, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + + movq %rax, TABENT+32(%rsp) + movq %rbx, TABENT+40(%rsp) + movq %rcx, TABENT+48(%rsp) + movq %rdx, TABENT+56(%rsp) + movq %r8, TABENT+64(%rsp) + movq %r9, TABENT+72(%rsp) + movq %r10, TABENT+80(%rsp) + movq %r11, TABENT+88(%rsp) + +// ...followed by the X and W fields + + leaq TAB(%rsp), %rbp + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + + cmpq $1, bf + movq (%rbp), 
%rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $2, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $3, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $4, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $5, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $6, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $7, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $8, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + movq %r8, TABENT+96(%rsp) + movq %r9, TABENT+104(%rsp) + movq %r10, TABENT+112(%rsp) + movq %r11, TABENT+120(%rsp) + +// Add those elements to initialize the accumulator for bit position 252 + + leaq ACC(%rsp), %rdi + leaq TABENT(%rsp), %rsi + leaq BTABENT(%rsp), %rbp + callq edwards25519_scalarmuldouble_pepadd + +// Main loop with acc = [scalar/2^i] * point + [bscalar/2^i] * basepoint +// Start with i = 252 for bits 248..251 and go down four at a time to 3..0 + +edwards25519_scalarmuldouble_loop: + + movq i, %rax + subq $4, %rax + movq 
%rax, i + +// Double to acc' = 2 * acc + + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + callq edwards25519_scalarmuldouble_pdouble + +// Get btable entry, first getting the adjusted bitfield... + + movq i, %rax + movq %rax, %rcx + shrq $6, %rax + movq 32(%rsp,%rax,8), %rax + shrq %cl, %rax + andq $15, %rax + + subq $8, %rax + sbbq %rcx, %rcx + xorq %rcx, %rax + subq %rcx, %rax + movq %rcx, cf + movq %rax, bf + +// ... then doing constant-time lookup with the appropriate index... + + movl $1, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + movl $1, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + leaq edwards25519_scalarmuldouble_table(%rip), %rbp + + cmpq $1, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $2, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $3, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $4, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $5, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $6, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq 
%rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $7, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $8, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + +// ... then optionally negating before storing. The table entry +// is in precomputed form and we currently have +// +// [%rdx;%rcx;%rbx;%rax] = y - x +// [%r11;%r10;%r9;%r8] = x + y +// [%r15;%r14;%r13;%r12] = 2 * d * x * y +// +// Negation for Edwards curves is -(x,y) = (-x,y), which in this modified +// form amounts to swapping the first two fields and negating the third. +// The negation does not always fully reduce even mod 2^256-38 in the zero +// case, instead giving -0 = 2^256-38. But that is fine since the result is +// always fed to a multiplication inside the "pepadd" function below that +// handles any 256-bit input. + + movq cf, %rdi + testq %rdi, %rdi + + movq %rax, %rsi + cmovnzq %r8, %rsi + cmovnzq %rax, %r8 + movq %rsi, BTABENT(%rsp) + movq %r8, BTABENT+32(%rsp) + + movq %rbx, %rsi + cmovnzq %r9, %rsi + cmovnzq %rbx, %r9 + movq %rsi, BTABENT+8(%rsp) + movq %r9, BTABENT+40(%rsp) + + movq %rcx, %rsi + cmovnzq %r10, %rsi + cmovnzq %rcx, %r10 + movq %rsi, BTABENT+16(%rsp) + movq %r10, BTABENT+48(%rsp) + + movq %rdx, %rsi + cmovnzq %r11, %rsi + cmovnzq %rdx, %r11 + movq %rsi, BTABENT+24(%rsp) + movq %r11, BTABENT+56(%rsp) + + xorq %rdi, %r12 + xorq %rdi, %r13 + xorq %rdi, %r14 + xorq %rdi, %r15 + andq $37, %rdi + subq %rdi, %r12 + sbbq $0, %r13 + sbbq $0, %r14 + sbbq $0, %r15 + movq %r12, BTABENT+64(%rsp) + movq %r13, BTABENT+72(%rsp) + movq %r14, BTABENT+80(%rsp) + movq %r15, BTABENT+88(%rsp) + +// Get table entry, first getting the adjusted bitfield...
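+//
+// The extraction below and the sub/sbb/xor/sub idiom that follows
+// (the same one already used for the btable digit above) split the
+// biased nybble into a sign mask and a magnitude. In C terms, with w
+// the selected scalar word and i the bit index:
+//
+//   int64_t d = (int64_t)((w >> (i % 64)) & 15) - 8; // signed digit
+//   uint64_t cf = (uint64_t)(d >> 63);               // all-ones iff d < 0
+//   uint64_t bf = ((uint64_t)d ^ cf) - cf;           // |d|, in 0..8
+//
+// so cf records the sign and bf the magnitude used for the lookup.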
+ + movq i, %rax + movq %rax, %rcx + shrq $6, %rax + movq (%rsp,%rax,8), %rax + shrq %cl, %rax + andq $15, %rax + + subq $8, %rax + sbbq %rcx, %rcx + xorq %rcx, %rax + subq %rcx, %rax + movq %rcx, cf + movq %rax, bf + +// ...and constant-time indexing based on that index +// Do the Y and Z fields first, to save on registers +// and store them back (they don't need any modification) + + movl $1, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + movl $1, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + + leaq TAB+32(%rsp), %rbp + + cmpq $1, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $2, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $3, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $4, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $5, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $6, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $7, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $8, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + + movq %rax, TABENT+32(%rsp) + movq %rbx, TABENT+40(%rsp) + movq %rcx, TABENT+48(%rsp) + 
movq %rdx, TABENT+56(%rsp) + movq %r8, TABENT+64(%rsp) + movq %r9, TABENT+72(%rsp) + movq %r10, TABENT+80(%rsp) + movq %r11, TABENT+88(%rsp) + +// Now do the X and W fields... + + leaq TAB(%rsp), %rbp + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + + cmpq $1, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $2, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $3, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $4, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $5, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $6, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $7, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $8, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + +// ... then optionally negate before storing the X and W fields. 
This +// time the table entry is extended-projective, and is here: +// +// [%rdx;%rcx;%rbx;%rax] = X +// [tabent+32] = Y +// [tabent+64] = Z +// [%r11;%r10;%r9;%r8] = W +// +// This time we just need to negate the X and the W fields. +// The crude way negation is done can result in values of X or W +// (when initially zero before negation) being exactly equal to +// 2^256-38, but the "pepadd" function handles that correctly. + + movq cf, %rdi + + xorq %rdi, %rax + xorq %rdi, %rbx + xorq %rdi, %rcx + xorq %rdi, %rdx + + xorq %rdi, %r8 + xorq %rdi, %r9 + xorq %rdi, %r10 + xorq %rdi, %r11 + + andq $37, %rdi + + subq %rdi, %rax + sbbq $0, %rbx + sbbq $0, %rcx + sbbq $0, %rdx + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + subq %rdi, %r8 + sbbq $0, %r9 + sbbq $0, %r10 + sbbq $0, %r11 + + movq %r8, TABENT+96(%rsp) + movq %r9, TABENT+104(%rsp) + movq %r10, TABENT+112(%rsp) + movq %r11, TABENT+120(%rsp) + +// Double to acc' = 4 * acc + + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + callq edwards25519_scalarmuldouble_pdouble + +// Add tabent := tabent + btabent + + leaq TABENT(%rsp), %rdi + leaq TABENT(%rsp), %rsi + leaq BTABENT(%rsp), %rbp + callq edwards25519_scalarmuldouble_pepadd + +// Double to acc' = 8 * acc + + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + callq edwards25519_scalarmuldouble_pdouble + +// Double to acc' = 16 * acc + + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + callq edwards25519_scalarmuldouble_epdouble + +// Add table entry, acc := acc + tabent + + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + leaq TABENT(%rsp), %rbp + callq edwards25519_scalarmuldouble_epadd + +// Loop down + + movq i, %rax + testq %rax, %rax + jnz edwards25519_scalarmuldouble_loop + +// Prepare to call the modular inverse function to get tab = 1/z + + leaq TAB(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, tab and acc. 
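+//
+// Functionally, all this inlined block computes is tab = 1/z mod
+// p_25519. Purely as a model (Fermat's little theorem, not the
+// algorithm actually inlined here), the same value could be obtained
+// with GMP; "inverse_model" is a hypothetical name:
+//
+//   #include <gmp.h>
+//
+//   void inverse_model(mpz_t out, const mpz_t z) {
+//     mpz_t p, e;
+//     mpz_inits(p, e, NULL);
+//     mpz_ui_pow_ui(p, 2, 255);    // p = 2^255
+//     mpz_sub_ui(p, p, 19);        // p = 2^255 - 19
+//     mpz_sub_ui(e, p, 2);         // e = p - 2
+//     mpz_powm(out, z, e, p);      // out = z^(p-2) = 1/z mod p
+//     mpz_clears(p, e, NULL);
+//   }
+//
+// The inverse is then used in the "Store result" step to convert the
+// extended-projective accumulator to affine form via x = X/Z, y = Y/Z.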
+ + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 + movl $0x1, %eax + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) + xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519_midloop +curve25519_x25519_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) + xorl %ebx, %ebx + movq (%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx + movq 0x8(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) + movq 0x18(%rsp), %rax + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + 
addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx + movq 0x40(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, 
%rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq 
$0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq 
(%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq 
%r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq 
(%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq 
%r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + +// Store result + + movq res, %rdi + leaq ACC(%rsp), %rsi + leaq TAB(%rsp), %rbp + mul_p25519(x_0,x_1,x_2) + + movq res, %rdi + addq $32, %rdi + leaq ACC+32(%rsp), %rsi + leaq TAB(%rsp), %rbp + mul_p25519(x_0,x_1,x_2) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// **************************************************************************** +// Localized versions of subroutines. +// These are close to the standalone functions "edwards25519_epdouble" etc., +// but are only maintaining reduction modulo 2^256 - 38, not 2^255 - 19. +// **************************************************************************** + +edwards25519_scalarmuldouble_epdouble: + sub $(5*NUMSIZE), %rsp + add_twice4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_4(y_0,t2,t4) + mul_4(z_0,t3,t2) + mul_4(w_0,t1,t4) + mul_4(x_0,t1,t3) + add $(5*NUMSIZE), %rsp + ret + +edwards25519_scalarmuldouble_pdouble: + sub $(5*NUMSIZE), %rsp + add_twice4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_4(y_0,t2,t4) + mul_4(z_0,t3,t2) + mul_4(x_0,t1,t3) + add $(5*NUMSIZE), %rsp + ret + +edwards25519_scalarmuldouble_epadd: + sub $(6*NUMSIZE), %rsp + mul_4(t0,w_1,w_2) + sub_twice4(t1,y_1,x_1) + sub_twice4(t2,y_2,x_2) + add_twice4(t3,y_1,x_1) + add_twice4(t4,y_2,x_2) + double_twice4(t5,z_2) + mul_4(t1,t1,t2) + mul_4(t3,t3,t4) + load_k25519(t2) + mul_4(t2,t2,t0) + mul_4(t4,z_1,t5) + sub_twice4(t0,t3,t1) + add_twice4(t5,t3,t1) + sub_twice4(t1,t4,t2) + add_twice4(t3,t4,t2) + mul_4(w_0,t0,t5) + mul_4(x_0,t0,t1) + mul_4(y_0,t3,t5) + mul_4(z_0,t1,t3) + add $(6*NUMSIZE), %rsp + ret + +edwards25519_scalarmuldouble_pepadd: + sub $(6*NUMSIZE), %rsp + double_twice4(t0,z_1); + sub_twice4(t1,y_1,x_1); + add_twice4(t2,y_1,x_1); + mul_4(t3,w_1,z_2); + mul_4(t1,t1,x_2); + mul_4(t2,t2,y_2); + sub_twice4(t4,t0,t3); + add_twice4(t0,t0,t3); + sub_twice4(t5,t2,t1); + add_twice4(t1,t2,t1); + mul_4(z_0,t4,t0); + mul_4(x_0,t5,t4); + mul_4(y_0,t0,t1); + mul_4(w_0,t5,t1); + add $(6*NUMSIZE), %rsp + ret + +// **************************************************************************** +// The precomputed data (all read-only). This is currently part of the same +// text section, which gives position-independent code with simple PC-relative +// addressing. 
However it could be put in a separate section via something like +// +// .section .rodata +// **************************************************************************** + +// Precomputed table of multiples of generator for edwards25519 +// all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. + +edwards25519_scalarmuldouble_table: + + // 1 * G + + .quad 0x9d103905d740913e + .quad 0xfd399f05d140beb3 + .quad 0xa5c18434688f8a09 + .quad 0x44fd2f9298f81267 + .quad 0x2fbc93c6f58c3b85 + .quad 0xcf932dc6fb8c0e19 + .quad 0x270b4898643d42c2 + .quad 0x07cf9d3a33d4ba65 + .quad 0xabc91205877aaa68 + .quad 0x26d9e823ccaac49e + .quad 0x5a1b7dcbdd43598c + .quad 0x6f117b689f0c65a8 + + // 2 * G + + .quad 0x8a99a56042b4d5a8 + .quad 0x8f2b810c4e60acf6 + .quad 0xe09e236bb16e37aa + .quad 0x6bb595a669c92555 + .quad 0x9224e7fc933c71d7 + .quad 0x9f469d967a0ff5b5 + .quad 0x5aa69a65e1d60702 + .quad 0x590c063fa87d2e2e + .quad 0x43faa8b3a59b7a5f + .quad 0x36c16bdd5d9acf78 + .quad 0x500fa0840b3d6a31 + .quad 0x701af5b13ea50b73 + + // 3 * G + + .quad 0x56611fe8a4fcd265 + .quad 0x3bd353fde5c1ba7d + .quad 0x8131f31a214bd6bd + .quad 0x2ab91587555bda62 + .quad 0xaf25b0a84cee9730 + .quad 0x025a8430e8864b8a + .quad 0xc11b50029f016732 + .quad 0x7a164e1b9a80f8f4 + .quad 0x14ae933f0dd0d889 + .quad 0x589423221c35da62 + .quad 0xd170e5458cf2db4c + .quad 0x5a2826af12b9b4c6 + + // 4 * G + + .quad 0x95fe050a056818bf + .quad 0x327e89715660faa9 + .quad 0xc3e8e3cd06a05073 + .quad 0x27933f4c7445a49a + .quad 0x287351b98efc099f + .quad 0x6765c6f47dfd2538 + .quad 0xca348d3dfb0a9265 + .quad 0x680e910321e58727 + .quad 0x5a13fbe9c476ff09 + .quad 0x6e9e39457b5cc172 + .quad 0x5ddbdcf9102b4494 + .quad 0x7f9d0cbf63553e2b + + // 5 * G + + .quad 0x7f9182c3a447d6ba + .quad 0xd50014d14b2729b7 + .quad 0xe33cf11cb864a087 + .quad 0x154a7e73eb1b55f3 + .quad 0xa212bc4408a5bb33 + .quad 0x8d5048c3c75eed02 + .quad 0xdd1beb0c5abfec44 + .quad 0x2945ccf146e206eb + .quad 0xbcbbdbf1812a8285 + .quad 0x270e0807d0bdd1fc + .quad 0xb41b670b1bbda72d + .quad 0x43aabe696b3bb69a + + // 6 * G + + .quad 0x499806b67b7d8ca4 + .quad 0x575be28427d22739 + .quad 0xbb085ce7204553b9 + .quad 0x38b64c41ae417884 + .quad 0x3a0ceeeb77157131 + .quad 0x9b27158900c8af88 + .quad 0x8065b668da59a736 + .quad 0x51e57bb6a2cc38bd + .quad 0x85ac326702ea4b71 + .quad 0xbe70e00341a1bb01 + .quad 0x53e4a24b083bc144 + .quad 0x10b8e91a9f0d61e3 + + // 7 * G + + .quad 0xba6f2c9aaa3221b1 + .quad 0x6ca021533bba23a7 + .quad 0x9dea764f92192c3a + .quad 0x1d6edd5d2e5317e0 + .quad 0x6b1a5cd0944ea3bf + .quad 0x7470353ab39dc0d2 + .quad 0x71b2528228542e49 + .quad 0x461bea69283c927e + .quad 0xf1836dc801b8b3a2 + .quad 0xb3035f47053ea49a + .quad 0x529c41ba5877adf3 + .quad 0x7a9fbb1c6a0f90a7 + + // 8 * G + + .quad 0xe2a75dedf39234d9 + .quad 0x963d7680e1b558f9 + .quad 0x2c2741ac6e3c23fb + .quad 0x3a9024a1320e01c3 + .quad 0x59b7596604dd3e8f + .quad 0x6cb30377e288702c + .quad 0xb1339c665ed9c323 + .quad 0x0915e76061bce52f + .quad 0xe7c1f5d9c9a2911a + .quad 0xb8a371788bcca7d7 + .quad 0x636412190eb62a32 + .quad 0x26907c5c2ecc4e95 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S new file mode 100644 index 0000000000..e17d10b47a --- /dev/null +++ b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S @@ -0,0 +1,3736 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Double scalar multiplication for edwards25519, fresh and base point +// Input scalar[4], point[8], bscalar[4]; output res[8] +// +// extern void edwards25519_scalarmuldouble_alt +// (uint64_t res[static 8],uint64_t scalar[static 4], +// uint64_t point[static 8],uint64_t bscalar[static 4]); +// +// Given scalar = n, point = P and bscalar = m, returns in res +// the point (X,Y) = n * P + m * B where B = (...,4/5) is +// the standard basepoint for the edwards25519 (Ed25519) curve. +// +// Both 256-bit coordinates of the input point P are implicitly +// reduced modulo 2^255-19 if they are not already in reduced form, +// but the conventional usage is that they *are* already reduced. +// The scalars can be arbitrary 256-bit numbers but may also be +// considered as implicitly reduced modulo the group order. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point, RCX = bscalar +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point, R9 = bscalar +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_scalarmuldouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_scalarmuldouble_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for result and temporaries on stack with some aliasing. +// Both "resx" and "resy" assume the "res" pointer has been preloaded into %rbp. + +#define resx (0*NUMSIZE)(%rbp) +#define resy (1*NUMSIZE)(%rbp) + +#define scalar (0*NUMSIZE)(%rsp) +#define bscalar (1*NUMSIZE)(%rsp) + +#define tabent (2*NUMSIZE)(%rsp) +#define btabent (6*NUMSIZE)(%rsp) + +#define acc (9*NUMSIZE)(%rsp) + +#define tab (13*NUMSIZE)(%rsp) + +// Additional variables kept on the stack + +#define bf 45*NUMSIZE(%rsp) +#define cf 45*NUMSIZE+8(%rsp) +#define i 45*NUMSIZE+16(%rsp) +#define res 45*NUMSIZE+24(%rsp) + +// Total size to reserve on the stack (excluding local subroutines) + +#define NSPACE (46*NUMSIZE) + +// Syntactic variants to make x86_att forms easier to generate + +#define SCALAR (0*NUMSIZE) +#define BSCALAR (1*NUMSIZE) +#define TABENT (2*NUMSIZE) +#define BTABENT (6*NUMSIZE) +#define ACC (9*NUMSIZE) +#define TAB (13*NUMSIZE) + +// Sub-references used in local subroutines with local stack + +#define x_0 0(%rdi) +#define y_0 NUMSIZE(%rdi) +#define z_0 (2*NUMSIZE)(%rdi) +#define w_0 (3*NUMSIZE)(%rdi) + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) +#define w_1 (3*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) +#define w_2 (3*NUMSIZE)(%rbp) + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) +#define t5 (5*NUMSIZE)(%rsp) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519_alt. 
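+//
+// As an illustrative aside (not part of this file's code), the reduction
+// at the end of the macro can be modelled in C roughly as follows, using
+// the common unsigned __int128 extension and assuming the 512-bit product
+// is given as eight 64-bit limbs z[0..7]; the helper name reduce_p25519
+// is made up for the sketch:
+//
+//     #include <stdint.h>
+//     typedef unsigned __int128 u128;
+//
+//     static void reduce_p25519(uint64_t r[4], const uint64_t z[8]) {
+//         uint64_t t[4]; u128 c = 0;
+//         for (int i = 0; i < 4; i++) {        // fold 2^256 == 38 (mod p)
+//             c += (u128)z[i] + (u128)38 * z[i + 4];
+//             t[i] = (uint64_t)c; c >>= 64;
+//         }
+//         uint64_t top = ((uint64_t)c << 1) | (t[3] >> 63);
+//         t[3] &= 0x7fffffffffffffffULL;       // fold 2^255 == 19 (mod p)
+//         c = (u128)19 * top;
+//         for (int i = 0; i < 4; i++) { c += t[i]; t[i] = (uint64_t)c; c >>= 64; }
+//         const uint64_t p[4] = {0xffffffffffffffedULL, ~0ULL, ~0ULL,
+//                                0x7fffffffffffffffULL};
+//         uint64_t s[4], b = 0;                // one conditional subtract of p
+//         for (int i = 0; i < 4; i++) {
+//             u128 d = (u128)t[i] - p[i] - b;
+//             s[i] = (uint64_t)d; b = (uint64_t)(d >> 127);
+//         }
+//         uint64_t keep = 0 - b;               // all-ones iff t < p already
+//         for (int i = 0; i < 4; i++) r[i] = (t[i] & keep) | (s[i] & ~keep);
+//     }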
+ +#define mul_p25519(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
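+//
+// Continuing the C sketch above (illustrative only): this amounts to
+// stopping after the two folds, i.e. using q = floor(x/2^255) rather than
+// q + 1 and dropping the conditional subtraction, which leaves a result
+// below 2^255 + 19*77 and hence comfortably < 2 * p_25519:
+//
+//     // after the 38-fold, t[0..3] holds the low 256 bits and e the carry
+//     uint64_t top = (e << 1) | (t[3] >> 63);  // q = floor(x / 2^255)
+//     t[3] &= 0x7fffffffffffffffULL;
+//     c = (u128)19 * top;                      // fold 2^255 == 19, no +1
+//     for (int i = 0; i < 4; i++) { c += t[i]; t[i] = (uint64_t)c; c >>= 64; }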
+ +#define mul_4(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %ebx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
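+//
+// The other difference from mul_4 is that squaring needs only the ten
+// distinct limb products x[i]*x[j] with i <= j: each cross product with
+// i < j occurs twice in the expansion, so it is computed once and doubled
+// in place (the addq %rax,%rax / adcq %rdx,%rdx pairs below). In
+// miniature, for two 64-bit limbs x0, x1 (illustrative C, u128 as above):
+//
+//     // (x1*2^64 + x0)^2 = x1^2*2^128 + 2*(x0*x1)*2^64 + x0^2
+//     u128 lo = (u128)x0 * x0, cross = (u128)x0 * x1, hi = (u128)x1 * x1;
+//     cross <<= 1;                   // double the single cross term
+//     // then accumulate lo, cross and hi at limb offsets 0, 1 and 2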
+ +#define sqr_4(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %ebx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. 
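+//
+// The trick, in C terms (illustrative sketch, u128 as above): since
+// 2^256 == 38 modulo 2^256 - 38, a carry out of the 256-bit addition is
+// folded back in by adding 38; under the stated precondition the fold
+// itself cannot carry again, so one pass suffices:
+//
+//     static void add_mod_2p(uint64_t r[4], const uint64_t x[4],
+//                            const uint64_t y[4]) {
+//         u128 c = 0;
+//         for (int i = 0; i < 4; i++) {    // 256-bit add, carry out in c
+//             c += (u128)x[i] + y[i]; r[i] = (uint64_t)c; c >>= 64;
+//         }
+//         c *= 38;                         // carry bit stands for 2^256
+//         for (int i = 0; i < 4; i++) { c += r[i]; r[i] = (uint64_t)c; c >>= 64; }
+//     }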
+ +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define double_twice4(P0,P1) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq %r8, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq %r9, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq %r10, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq %r11, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Load the constant k_25519 = 2 * d_25519 using immediate operations + +#define load_k25519(P0) \ + movq $0xebd69b9426b2f159, %rax ; \ + movq %rax, P0 ; \ + movq $0x00e0149a8283b156, %rax ; \ + movq %rax, 8+P0 ; \ + movq $0x198e80f2eef3d130, %rax ; \ + movq %rax, 16+P0 ; \ + movq $0x2406d9dc56dffce7, %rax ; \ + movq %rax, 24+P0 + +S2N_BN_SYMBOL(edwards25519_scalarmuldouble_alt): + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from keeping code and data together. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + callq edwards25519_scalarmuldouble_alt_standard + popq %rsi + popq %rdi + ret + +edwards25519_scalarmuldouble_alt_standard: +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $NSPACE, %rsp + +// Move the output pointer to a stable place + + movq %rdi, res + +// Copy scalars while recoding all 4-bit nybbles except the top +// one (bits 252..255) into signed 4-bit digits. This is essentially +// done just by adding the recoding constant 0x0888..888, after +// which all digits except the first have an implicit bias of -8, +// so 0 -> -8, 1 -> -7, ... 7 -> -1, 8 -> 0, 9 -> 1, ... 15 -> 7. +// (We could literally create 2s complement signed nybbles by +// XORing with the same constant 0x0888..888 afterwards, but it +// doesn't seem to make the end usage any simpler.) +// +// In order to ensure that the unrecoded top nybble (bits 252..255) +// does not become > 8 as a result of carries lower down from the +// recoding, we first (conceptually) subtract the group order iff +// the top digit of the scalar is > 2^63. In the implementation the +// reduction and recoding are combined by optionally using the +// modified recoding constant 0x0888...888 + (2^256 - group_order). 
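+//
+// Read back abstractly (illustrative C, not code from this file): after
+// the addition below, 4-bit digit i of the stored scalar s denotes the
+// signed value digit - 8, except for the top digit (bits 252..255) which
+// is unbiased:
+//
+//     static int read_digit(const uint64_t s[4], int i) {  // 0 <= i <= 63
+//         int d = (int)((s[i / 16] >> (4 * (i % 16))) & 15);
+//         return (i == 63) ? d : d - 8;
+//     }
+//
+// so that sum_i read_digit(s,i) * 16^i recovers the original scalar
+// modulo the group order.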
+ + movq (%rcx), %r8 + movq 8(%rcx), %r9 + movq 16(%rcx), %r10 + movq 24(%rcx), %r11 + movq $0xc7f56fb5a0d9e920, %r12 + movq $0xe190b99370cba1d5, %r13 + movq $0x8888888888888887, %r14 + movq $0x8888888888888888, %r15 + movq $0x8000000000000000, %rax + movq $0x0888888888888888, %rbx + cmpq %r11, %rax + cmovncq %r15, %r12 + cmovncq %r15, %r13 + cmovncq %r15, %r14 + cmovncq %rbx, %r15 + addq %r12, %r8 + adcq %r13, %r9 + adcq %r14, %r10 + adcq %r15, %r11 + movq %r8, BSCALAR(%rsp) + movq %r9, BSCALAR+8(%rsp) + movq %r10, BSCALAR+16(%rsp) + movq %r11, BSCALAR+24(%rsp) + + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + movq $0xc7f56fb5a0d9e920, %r12 + movq $0xe190b99370cba1d5, %r13 + movq $0x8888888888888887, %r14 + movq $0x8888888888888888, %r15 + movq $0x8000000000000000, %rax + movq $0x0888888888888888, %rbx + cmpq %r11, %rax + cmovncq %r15, %r12 + cmovncq %r15, %r13 + cmovncq %r15, %r14 + cmovncq %rbx, %r15 + addq %r12, %r8 + adcq %r13, %r9 + adcq %r14, %r10 + adcq %r15, %r11 + movq %r8, SCALAR(%rsp) + movq %r9, SCALAR+8(%rsp) + movq %r10, SCALAR+16(%rsp) + movq %r11, SCALAR+24(%rsp) + +// Create table of multiples 1..8 of the general input point at "tab". +// Reduce the input coordinates x and y modulo 2^256 - 38 first, for the +// sake of definiteness; this is the reduction that will be maintained. +// We could slightly optimize the additions because we know the input +// point is affine (so Z = 1), but it doesn't seem worth the complication. + + movl $38, %eax + movq (%rdx), %r8 + xorl %ebx, %ebx + movq 8(%rdx), %r9 + xorl %ecx, %ecx + movq 16(%rdx), %r10 + xorl %esi, %esi + movq 24(%rdx), %r11 + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rsi + cmovncq %r8, %rax + movq %rax, TAB(%rsp) + cmovncq %r9, %rbx + movq %rbx, TAB+8(%rsp) + cmovncq %r10, %rcx + movq %rcx, TAB+16(%rsp) + cmovncq %r11, %rsi + movq %rsi, TAB+24(%rsp) + + movl $38, %eax + movq 32(%rdx), %r8 + xorl %ebx, %ebx + movq 40(%rdx), %r9 + xorl %ecx, %ecx + movq 48(%rdx), %r10 + xorl %esi, %esi + movq 56(%rdx), %r11 + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rsi + cmovncq %r8, %rax + movq %rax, TAB+32(%rsp) + cmovncq %r9, %rbx + movq %rbx, TAB+40(%rsp) + cmovncq %r10, %rcx + movq %rcx, TAB+48(%rsp) + cmovncq %r11, %rsi + movq %rsi, TAB+56(%rsp) + + movl $1, %eax + movq %rax, TAB+64(%rsp) + xorl %eax, %eax + movq %rax, TAB+72(%rsp) + movq %rax, TAB+80(%rsp) + movq %rax, TAB+88(%rsp) + + leaq TAB+96(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+32(%rsp), %rbp + mul_4(x_0,x_1,x_2) + +// Multiple 2 + + leaq TAB+1*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq edwards25519_scalarmuldouble_alt_epdouble + +// Multiple 3 + + leaq TAB+2*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+1*128(%rsp), %rbp + callq edwards25519_scalarmuldouble_alt_epadd + +// Multiple 4 + + leaq TAB+3*128(%rsp), %rdi + leaq TAB+1*128(%rsp), %rsi + callq edwards25519_scalarmuldouble_alt_epdouble + +// Multiple 5 + + leaq TAB+4*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+3*128(%rsp), %rbp + callq edwards25519_scalarmuldouble_alt_epadd + +// Multiple 6 + + leaq TAB+5*128(%rsp), %rdi + leaq TAB+2*128(%rsp), %rsi + callq edwards25519_scalarmuldouble_alt_epdouble + +// Multiple 7 + + leaq TAB+6*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+5*128(%rsp), %rbp + callq edwards25519_scalarmuldouble_alt_epadd + +// Multiple 8 + + leaq TAB+7*128(%rsp), %rdi + leaq TAB+3*128(%rsp), %rsi + callq edwards25519_scalarmuldouble_alt_epdouble + +// Handle the initialization, starting the 
loop counter at i = 252 +// and initializing acc to the sum of the table entries for the +// top nybbles of the scalars (the ones with no implicit -8 bias). + + movq $252, %rax + movq %rax, i + +// Index for btable entry... + + movq BSCALAR+24(%rsp), %rax + shrq $60, %rax + movq %rax, bf + +// ...and constant-time indexing based on that index + + movl $1, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + movl $1, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + leaq edwards25519_scalarmuldouble_alt_table(%rip), %rbp + + cmpq $1, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $2, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $3, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $4, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $5, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $6, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 
+ movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $7, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $8, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + + movq %rax, BTABENT(%rsp) + movq %rbx, BTABENT+8(%rsp) + movq %rcx, BTABENT+16(%rsp) + movq %rdx, BTABENT+24(%rsp) + movq %r8, BTABENT+32(%rsp) + movq %r9, BTABENT+40(%rsp) + movq %r10, BTABENT+48(%rsp) + movq %r11, BTABENT+56(%rsp) + movq %r12, BTABENT+64(%rsp) + movq %r13, BTABENT+72(%rsp) + movq %r14, BTABENT+80(%rsp) + movq %r15, BTABENT+88(%rsp) + +// Index for table entry... + + movq SCALAR+24(%rsp), %rax + shrq $60, %rax + movq %rax, bf + +// ...and constant-time indexing based on that index. +// Do the Y and Z fields first, to save on registers... 
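+//
+// The lookup pattern, as plain C (illustrative; ct_select and the flat
+// layout are made up for the sketch): every entry is read and a
+// cmov-style select keeps the wanted one, so the memory access sequence
+// is independent of the secret index, with index 0 mapping to the
+// neutral element (here Y = Z = 1):
+//
+//     static void ct_select(uint64_t out[8], const uint64_t tab[8][8],
+//                           uint64_t idx) {          // idx in 0..8
+//         for (int j = 0; j < 8; j++) out[j] = (j % 4 == 0);  // Y = Z = 1
+//         for (uint64_t k = 1; k <= 8; k++) {
+//             uint64_t m = 0 - (uint64_t)(k == idx); // all-ones iff k == idx
+//             for (int j = 0; j < 8; j++)
+//                 out[j] ^= (out[j] ^ tab[k - 1][j]) & m;
+//         }
+//     }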
+ + movl $1, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + movl $1, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + + leaq TAB+32(%rsp), %rbp + + cmpq $1, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $2, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $3, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $4, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $5, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $6, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $7, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $8, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + + movq %rax, TABENT+32(%rsp) + movq %rbx, TABENT+40(%rsp) + movq %rcx, TABENT+48(%rsp) + movq %rdx, TABENT+56(%rsp) + movq %r8, TABENT+64(%rsp) + movq %r9, TABENT+72(%rsp) + movq %r10, TABENT+80(%rsp) + movq %r11, TABENT+88(%rsp) + +// ...followed by the X and W fields + + leaq TAB(%rsp), %rbp + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + + cmpq $1, bf + movq (%rbp), 
%rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $2, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $3, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $4, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $5, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $6, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $7, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $8, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + movq %r8, TABENT+96(%rsp) + movq %r9, TABENT+104(%rsp) + movq %r10, TABENT+112(%rsp) + movq %r11, TABENT+120(%rsp) + +// Add those elements to initialize the accumulator for bit position 252 + + leaq ACC(%rsp), %rdi + leaq TABENT(%rsp), %rsi + leaq BTABENT(%rsp), %rbp + callq edwards25519_scalarmuldouble_alt_pepadd + +// Main loop with acc = [scalar/2^i] * point + [bscalar/2^i] * basepoint +// Start with i = 252 for bits 248..251 and go down four at a time to 3..0 + +edwards25519_scalarmuldouble_alt_loop: + + movq i, %rax + subq $4, %rax 
+ movq %rax, i + +// Double to acc' = 2 * acc + + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + callq edwards25519_scalarmuldouble_alt_pdouble + +// Get btable entry, first getting the adjusted bitfield... + + movq i, %rax + movq %rax, %rcx + shrq $6, %rax + movq 32(%rsp,%rax,8), %rax + shrq %cl, %rax + andq $15, %rax + + subq $8, %rax + sbbq %rcx, %rcx + xorq %rcx, %rax + subq %rcx, %rax + movq %rcx, cf + movq %rax, bf + +// ... then doing constant-time lookup with the appropriate index... + + movl $1, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + movl $1, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + leaq edwards25519_scalarmuldouble_alt_table(%rip), %rbp + + cmpq $1, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $2, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $3, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $4, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $5, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $6, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 
16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $7, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + addq $96, %rbp + + cmpq $8, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + movq 64(%rbp), %rsi + cmovzq %rsi, %r12 + movq 72(%rbp), %rsi + cmovzq %rsi, %r13 + movq 80(%rbp), %rsi + cmovzq %rsi, %r14 + movq 88(%rbp), %rsi + cmovzq %rsi, %r15 + +// ... then optionally negating before storing. The table entry +// is in precomputed form and we currently have +// +// [%rdx;%rcx;%rbx;%rax] = y - x +// [%r11;%r10;%r9;%r8] = x + y +// [%r15;%r14;%r13;%r12] = 2 * d * x * y +// +// Negation for Edwards curves is -(x,y) = (-x,y), which in this modified +// form amounts to swapping the first two fields and negating the third. +// The negation does not always fully reduce even mod 2^256-38 in the zero +// case, instead giving -0 = 2^256-38. But that is fine since the result is +// always fed to a multipliction inside the "pepadd" function below that +// handles any 256-bit input. + + movq cf, %rdi + testq %rdi, %rdi + + movq %rax, %rsi + cmovnzq %r8, %rsi + cmovnzq %rax, %r8 + movq %rsi, BTABENT(%rsp) + movq %r8, BTABENT+32(%rsp) + + movq %rbx, %rsi + cmovnzq %r9, %rsi + cmovnzq %rbx, %r9 + movq %rsi, BTABENT+8(%rsp) + movq %r9, BTABENT+40(%rsp) + + movq %rcx, %rsi + cmovnzq %r10, %rsi + cmovnzq %rcx, %r10 + movq %rsi, BTABENT+16(%rsp) + movq %r10, BTABENT+48(%rsp) + + movq %rdx, %rsi + cmovnzq %r11, %rsi + cmovnzq %rdx, %r11 + movq %rsi, BTABENT+24(%rsp) + movq %r11, BTABENT+56(%rsp) + + xorq %rdi, %r12 + xorq %rdi, %r13 + xorq %rdi, %r14 + xorq %rdi, %r15 + andq $37, %rdi + subq %rdi, %r12 + sbbq $0, %r13 + sbbq $0, %r14 + sbbq $0, %r15 + movq %r12, BTABENT+64(%rsp) + movq %r13, BTABENT+72(%rsp) + movq %r14, BTABENT+80(%rsp) + movq %r15, BTABENT+88(%rsp) + +// Get table entry, first getting the adjusted bitfield... 
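+//
+// The adjustment, here as in the earlier btable case, in C terms
+// (illustrative sketch): the raw nybble d in 0..15 denotes d - 8, which
+// is split branchlessly into a magnitude bf = |d - 8| in 0..8 used for
+// indexing and a sign mask cf used for the conditional negation
+// afterwards:
+//
+//     static void adjust(uint64_t d, uint64_t *bf, uint64_t *cf) {
+//         int64_t s = (int64_t)d - 8;        // signed digit value
+//         *cf = (uint64_t)(s >> 63);         // all-ones iff d < 8
+//         *bf = ((uint64_t)s ^ *cf) - *cf;   // two's-complement |d - 8|
+//     }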
+ + movq i, %rax + movq %rax, %rcx + shrq $6, %rax + movq (%rsp,%rax,8), %rax + shrq %cl, %rax + andq $15, %rax + + subq $8, %rax + sbbq %rcx, %rcx + xorq %rcx, %rax + subq %rcx, %rax + movq %rcx, cf + movq %rax, bf + +// ...and constant-time indexing based on that index +// Do the Y and Z fields first, to save on registers +// and store them back (they don't need any modification) + + movl $1, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + movl $1, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + + leaq TAB+32(%rsp), %rbp + + cmpq $1, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $2, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $3, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $4, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $5, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $6, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $7, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $8, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 32(%rbp), %rsi + cmovzq %rsi, %r8 + movq 40(%rbp), %rsi + cmovzq %rsi, %r9 + movq 48(%rbp), %rsi + cmovzq %rsi, %r10 + movq 56(%rbp), %rsi + cmovzq %rsi, %r11 + + movq %rax, TABENT+32(%rsp) + movq %rbx, TABENT+40(%rsp) + movq %rcx, TABENT+48(%rsp) + 
movq %rdx, TABENT+56(%rsp) + movq %r8, TABENT+64(%rsp) + movq %r9, TABENT+72(%rsp) + movq %r10, TABENT+80(%rsp) + movq %r11, TABENT+88(%rsp) + +// Now do the X and W fields... + + leaq TAB(%rsp), %rbp + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + + cmpq $1, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $2, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $3, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $4, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $5, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $6, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $7, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + addq $128, %rbp + + cmpq $8, bf + movq (%rbp), %rsi + cmovzq %rsi, %rax + movq 8(%rbp), %rsi + cmovzq %rsi, %rbx + movq 16(%rbp), %rsi + cmovzq %rsi, %rcx + movq 24(%rbp), %rsi + cmovzq %rsi, %rdx + movq 96(%rbp), %rsi + cmovzq %rsi, %r8 + movq 104(%rbp), %rsi + cmovzq %rsi, %r9 + movq 112(%rbp), %rsi + cmovzq %rsi, %r10 + movq 120(%rbp), %rsi + cmovzq %rsi, %r11 + +// ... then optionally negate before storing the X and W fields. 
This +// time the table entry is extended-projective, and is here: +// +// [%rdx;%rcx;%rbx;%rax] = X +// [tabent+32] = Y +// [tabent+64] = Z +// [%r11;%r10;%r9;%r8] = W +// +// This time we just need to negate the X and the W fields. +// The crude way negation is done can result in values of X or W +// (when initially zero before negation) being exactly equal to +// 2^256-38, but the "pepadd" function handles that correctly. + + movq cf, %rdi + + xorq %rdi, %rax + xorq %rdi, %rbx + xorq %rdi, %rcx + xorq %rdi, %rdx + + xorq %rdi, %r8 + xorq %rdi, %r9 + xorq %rdi, %r10 + xorq %rdi, %r11 + + andq $37, %rdi + + subq %rdi, %rax + sbbq $0, %rbx + sbbq $0, %rcx + sbbq $0, %rdx + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + subq %rdi, %r8 + sbbq $0, %r9 + sbbq $0, %r10 + sbbq $0, %r11 + + movq %r8, TABENT+96(%rsp) + movq %r9, TABENT+104(%rsp) + movq %r10, TABENT+112(%rsp) + movq %r11, TABENT+120(%rsp) + +// Double to acc' = 4 * acc + + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + callq edwards25519_scalarmuldouble_alt_pdouble + +// Add tabent := tabent + btabent + + leaq TABENT(%rsp), %rdi + leaq TABENT(%rsp), %rsi + leaq BTABENT(%rsp), %rbp + callq edwards25519_scalarmuldouble_alt_pepadd + +// Double to acc' = 8 * acc + + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + callq edwards25519_scalarmuldouble_alt_pdouble + +// Double to acc' = 16 * acc + + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + callq edwards25519_scalarmuldouble_alt_epdouble + +// Add table entry, acc := acc + tabent + + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + leaq TABENT(%rsp), %rbp + callq edwards25519_scalarmuldouble_alt_epadd + +// Loop down + + movq i, %rax + testq %rax, %rax + jnz edwards25519_scalarmuldouble_alt_loop + +// Prepare to call the modular inverse function to get tab = 1/z + + leaq TAB(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, tab and acc. 
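+//
+// For orientation (illustrative only): the iteration below batches many
+// steps of a Bernstein-Yang style "divstep" gcd recurrence into signed
+// 2x2 matrix updates, carried in sign/magnitude form (the sarq/xorq/subq
+// sequences at the top of the loop). One textbook divstep transition
+// looks like this in C; sign and scaling conventions differ between
+// presentations, so this sketches the idea, not the exact inlined variant:
+//
+//     static void divstep(int64_t *delta, int64_t *f, int64_t *g) {
+//         int64_t f0 = *f, g0 = *g;              // f is kept odd
+//         if (*delta > 0 && (g0 & 1)) {          // swap and subtract
+//             *delta = 1 - *delta; *f = g0; *g = (g0 - f0) >> 1;
+//         } else {                               // keep f, halve g
+//             *delta = 1 + *delta; *g = (g0 + ((g0 & 1) ? f0 : 0)) >> 1;
+//         }
+//     }
+//
+// Iterating enough divsteps drives g to zero with f = +/-gcd(f,g), and
+// the accumulated matrix, applied to suitable initial vectors, yields
+// the modular inverse.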
+ + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 + movl $0x1, %eax + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) + xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519_midloop +curve25519_x25519_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) + xorl %ebx, %ebx + movq (%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx + movq 0x8(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) + movq 0x18(%rsp), %rax + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + 
addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx + movq 0x40(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, 
%rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq 
$0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq 
(%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq 
%r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq 
(%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq 
%r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + +// Store result + + movq res, %rdi + leaq ACC(%rsp), %rsi + leaq TAB(%rsp), %rbp + mul_p25519(x_0,x_1,x_2) + + movq res, %rdi + addq $32, %rdi + leaq ACC+32(%rsp), %rsi + leaq TAB(%rsp), %rbp + mul_p25519(x_0,x_1,x_2) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// **************************************************************************** +// Localized versions of subroutines. +// These are close to the standalone functions "edwards25519_epdouble" etc., +// but are only maintaining reduction modulo 2^256 - 38, not 2^255 - 19. +// **************************************************************************** + +edwards25519_scalarmuldouble_alt_epdouble: + sub $(5*NUMSIZE), %rsp + add_twice4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_4(y_0,t2,t4) + mul_4(z_0,t3,t2) + mul_4(w_0,t1,t4) + mul_4(x_0,t1,t3) + add $(5*NUMSIZE), %rsp + ret + +edwards25519_scalarmuldouble_alt_pdouble: + sub $(5*NUMSIZE), %rsp + add_twice4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_4(y_0,t2,t4) + mul_4(z_0,t3,t2) + mul_4(x_0,t1,t3) + add $(5*NUMSIZE), %rsp + ret + +edwards25519_scalarmuldouble_alt_epadd: + sub $(6*NUMSIZE), %rsp + mul_4(t0,w_1,w_2) + sub_twice4(t1,y_1,x_1) + sub_twice4(t2,y_2,x_2) + add_twice4(t3,y_1,x_1) + add_twice4(t4,y_2,x_2) + double_twice4(t5,z_2) + mul_4(t1,t1,t2) + mul_4(t3,t3,t4) + load_k25519(t2) + mul_4(t2,t2,t0) + mul_4(t4,z_1,t5) + sub_twice4(t0,t3,t1) + add_twice4(t5,t3,t1) + sub_twice4(t1,t4,t2) + add_twice4(t3,t4,t2) + mul_4(w_0,t0,t5) + mul_4(x_0,t0,t1) + mul_4(y_0,t3,t5) + mul_4(z_0,t1,t3) + add $(6*NUMSIZE), %rsp + ret + +edwards25519_scalarmuldouble_alt_pepadd: + sub $(6*NUMSIZE), %rsp + double_twice4(t0,z_1); + sub_twice4(t1,y_1,x_1); + add_twice4(t2,y_1,x_1); + mul_4(t3,w_1,z_2); + mul_4(t1,t1,x_2); + mul_4(t2,t2,y_2); + sub_twice4(t4,t0,t3); + add_twice4(t0,t0,t3); + sub_twice4(t5,t2,t1); + add_twice4(t1,t2,t1); + mul_4(z_0,t4,t0); + mul_4(x_0,t5,t4); + mul_4(y_0,t0,t1); + mul_4(w_0,t5,t1); + add $(6*NUMSIZE), %rsp + ret + +// **************************************************************************** +// The precomputed data (all read-only). This is currently part of the same +// text section, which gives position-independent code with simple PC-relative +// addressing. 
However it could be put in a separate section via something like +// +// .section .rodata +// **************************************************************************** + +// Precomputed table of multiples of generator for edwards25519 +// all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. + +edwards25519_scalarmuldouble_alt_table: + + // 1 * G + + .quad 0x9d103905d740913e + .quad 0xfd399f05d140beb3 + .quad 0xa5c18434688f8a09 + .quad 0x44fd2f9298f81267 + .quad 0x2fbc93c6f58c3b85 + .quad 0xcf932dc6fb8c0e19 + .quad 0x270b4898643d42c2 + .quad 0x07cf9d3a33d4ba65 + .quad 0xabc91205877aaa68 + .quad 0x26d9e823ccaac49e + .quad 0x5a1b7dcbdd43598c + .quad 0x6f117b689f0c65a8 + + // 2 * G + + .quad 0x8a99a56042b4d5a8 + .quad 0x8f2b810c4e60acf6 + .quad 0xe09e236bb16e37aa + .quad 0x6bb595a669c92555 + .quad 0x9224e7fc933c71d7 + .quad 0x9f469d967a0ff5b5 + .quad 0x5aa69a65e1d60702 + .quad 0x590c063fa87d2e2e + .quad 0x43faa8b3a59b7a5f + .quad 0x36c16bdd5d9acf78 + .quad 0x500fa0840b3d6a31 + .quad 0x701af5b13ea50b73 + + // 3 * G + + .quad 0x56611fe8a4fcd265 + .quad 0x3bd353fde5c1ba7d + .quad 0x8131f31a214bd6bd + .quad 0x2ab91587555bda62 + .quad 0xaf25b0a84cee9730 + .quad 0x025a8430e8864b8a + .quad 0xc11b50029f016732 + .quad 0x7a164e1b9a80f8f4 + .quad 0x14ae933f0dd0d889 + .quad 0x589423221c35da62 + .quad 0xd170e5458cf2db4c + .quad 0x5a2826af12b9b4c6 + + // 4 * G + + .quad 0x95fe050a056818bf + .quad 0x327e89715660faa9 + .quad 0xc3e8e3cd06a05073 + .quad 0x27933f4c7445a49a + .quad 0x287351b98efc099f + .quad 0x6765c6f47dfd2538 + .quad 0xca348d3dfb0a9265 + .quad 0x680e910321e58727 + .quad 0x5a13fbe9c476ff09 + .quad 0x6e9e39457b5cc172 + .quad 0x5ddbdcf9102b4494 + .quad 0x7f9d0cbf63553e2b + + // 5 * G + + .quad 0x7f9182c3a447d6ba + .quad 0xd50014d14b2729b7 + .quad 0xe33cf11cb864a087 + .quad 0x154a7e73eb1b55f3 + .quad 0xa212bc4408a5bb33 + .quad 0x8d5048c3c75eed02 + .quad 0xdd1beb0c5abfec44 + .quad 0x2945ccf146e206eb + .quad 0xbcbbdbf1812a8285 + .quad 0x270e0807d0bdd1fc + .quad 0xb41b670b1bbda72d + .quad 0x43aabe696b3bb69a + + // 6 * G + + .quad 0x499806b67b7d8ca4 + .quad 0x575be28427d22739 + .quad 0xbb085ce7204553b9 + .quad 0x38b64c41ae417884 + .quad 0x3a0ceeeb77157131 + .quad 0x9b27158900c8af88 + .quad 0x8065b668da59a736 + .quad 0x51e57bb6a2cc38bd + .quad 0x85ac326702ea4b71 + .quad 0xbe70e00341a1bb01 + .quad 0x53e4a24b083bc144 + .quad 0x10b8e91a9f0d61e3 + + // 7 * G + + .quad 0xba6f2c9aaa3221b1 + .quad 0x6ca021533bba23a7 + .quad 0x9dea764f92192c3a + .quad 0x1d6edd5d2e5317e0 + .quad 0x6b1a5cd0944ea3bf + .quad 0x7470353ab39dc0d2 + .quad 0x71b2528228542e49 + .quad 0x461bea69283c927e + .quad 0xf1836dc801b8b3a2 + .quad 0xb3035f47053ea49a + .quad 0x529c41ba5877adf3 + .quad 0x7a9fbb1c6a0f90a7 + + // 8 * G + + .quad 0xe2a75dedf39234d9 + .quad 0x963d7680e1b558f9 + .quad 0x2c2741ac6e3c23fb + .quad 0x3a9024a1320e01c3 + .quad 0x59b7596604dd3e8f + .quad 0x6cb30377e288702c + .quad 0xb1339c665ed9c323 + .quad 0x0915e76061bce52f + .quad 0xe7c1f5d9c9a2911a + .quad 0xb8a371788bcca7d7 + .quad 0x636412190eb62a32 + .quad 0x26907c5c2ecc4e95 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif
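
A note on the long unrolled runs of cmov/xor/sar blocks in the inverse loop above: their shape (sign test on a counter, conditional swap, odd/even test, arithmetic right shift by one) is that of the Bernstein-Yang "divstep" iteration from which constant-time modular inverses of this family are built. As orientation only, one divstep transition on word-sized signed values can be sketched in C as below; the mapping of delta/f/g onto the registers used above is an assumption for illustration, not a statement about the exact allocation in the assembly.

    #include <stdint.h>

    // One Bernstein-Yang divstep: constant-time code replaces these
    // branches with cmov/xor/shift sequences like the unrolled blocks
    // above, applying many steps per pass over the big integers.
    static void divstep(int64_t *delta, int64_t *f, int64_t *g)
    {
        if (*delta > 0 && (*g & 1)) {
            // Swap case: (delta, f, g) -> (1 - delta, g, (g - f)/2)
            int64_t t = *f;
            *delta = 1 - *delta;
            *f = *g;
            *g = (*g - t) >> 1;          // numerator is even, shift is exact
        } else {
            // Keep case: (delta, f, g) -> (1 + delta, f, (g + (g&1)*f)/2)
            *delta = 1 + *delta;
            *g = (*g + ((*g & 1) ? *f : 0)) >> 1;
        }
    }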
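
The sequence just before "// Store result" (the shldq $0x1, the imulq by $0x13 = 19, and the conditional subtraction ending in btr $0x3f) is the final strict reduction of the inverse modulo p_25519 = 2^255 - 19, exploiting 2^255 == 19 (mod p_25519). The assembly works on a signed intermediate, so the following unsigned C sketch (using the common unsigned __int128 extension) is a simplification of the same idea, not a transcription:

    #include <stdint.h>

    // Reduce a 4-limb little-endian value x strictly below p = 2^255 - 19:
    // fold the top bit back in multiplied by 19 (since 2^255 == 19 mod p),
    // then apply one conditional subtraction of p. Unsigned sketch only.
    static void reduce_p25519(uint64_t x[4])
    {
        static const uint64_t p[4] = {
            0xffffffffffffffedULL, 0xffffffffffffffffULL,
            0xffffffffffffffffULL, 0x7fffffffffffffffULL
        };
        unsigned __int128 c = (unsigned __int128)x[0] + 19u * (x[3] >> 63);
        x[3] &= 0x7fffffffffffffffULL;
        x[0] = (uint64_t)c;
        for (int i = 1; i < 4; i++) {
            c = (c >> 64) + x[i];
            x[i] = (uint64_t)c;
        }
        uint64_t r[4], borrow = 0;
        for (int i = 0; i < 4; i++) {
            unsigned __int128 d = (unsigned __int128)x[i] - p[i] - borrow;
            r[i] = (uint64_t)d;
            borrow = (uint64_t)(d >> 64) & 1;   // 1 iff the subtraction wrapped
        }
        if (!borrow)                            // x >= p: keep x - p
            for (int i = 0; i < 4; i++) x[i] = r[i];
    }

A constant-time version would of course replace the final branch with a masked select, as the cmov-based assembly does.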
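
On the comment that the localized subroutines only maintain reduction modulo 2^256 - 38: since 2^256 == 38 (mod 2^255 - 19), a double-width product lo + 2^256*hi can be folded to lo + 38*hi, a 4-word value that is merely congruent to the true result rather than fully reduced; the mul_p25519 calls in "// Store result" are what bring the outputs strictly below 2^255 - 19. A minimal C sketch of the folding step, assuming the little-endian 4-limb convention used throughout these files:

    #include <stdint.h>

    // Fold a 512-bit product lo + 2^256*hi to a 256-bit representative
    // modulo 2^256 - 38. The result can exceed 2^255 - 19; it is only
    // congruent to the input, which is all the "_alt" subroutines above
    // need to maintain between field operations.
    static void fold_38(uint64_t z[4], const uint64_t lo[4],
                        const uint64_t hi[4])
    {
        unsigned __int128 c = 0;
        for (int i = 0; i < 4; i++) {
            c += (unsigned __int128)lo[i] + (unsigned __int128)38u * hi[i];
            z[i] = (uint64_t)c;
            c >>= 64;
        }
        // Fold any carry-out back in the same way (2^256*c == 38*c);
        // at most two further passes are needed since the carry shrinks.
        while (c) {
            c *= 38;
            for (int i = 0; i < 4; i++) {
                c += z[i];
                z[i] = (uint64_t)c;
                c >>= 64;
            }
        }
    }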
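
For the table layout: each entry above is 12 quadwords, i.e. three 4-limb field elements per multiple P = n*G with affine coordinates (x, y): first y - x, then x + y, then 2*d*x*y with d the edwards25519 curve constant. That is exactly the shape the pepadd routine consumes; note that epadd, which adds two general points, loads the constant via load_k25519, whereas pepadd relies on the precomputed third component instead. A hypothetical C-side view of one entry, under the same little-endian limb convention:

    #include <stdint.h>

    // Hypothetical struct view of one 96-byte entry of
    // edwards25519_scalarmuldouble_alt_table: the precomputed triple
    // (y - x, x + y, 2*d*x*y) for P = n*G, each component a 4-limb
    // element of GF(2^255 - 19).
    typedef struct {
        uint64_t ymx[4];   // y - x
        uint64_t xpy[4];   // x + y
        uint64_t kxy[4];   // 2*d*x*y
    } precomp_entry;

    // The entry for n*G (n = 1..8) would then sit at index n - 1:
    //   const precomp_entry *t = (const precomp_entry *)table_base;
    //   const precomp_entry *Pn = &t[n - 1];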