From e618f26dc893ee780dfa01b2f08f93035b12e780 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 1 Nov 2023 18:52:23 -0700 Subject: [PATCH] Switch edwards25519 operations to divstep-based modular inverse This replaces the inlined variant of "bignum_modinv" with code from "bignum_inv_p25519" in all "edwards25519_scalarmul*" functions. Again, there are consequential changes related to the slightly different amount of temporary storage needed by bignum_inv_p25519. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/7e7b18e8fc83fa25131cfac1c94bd83fbf6cd243 --- arm/curve25519/edwards25519_scalarmulbase.S | 1387 +++++++++--- .../edwards25519_scalarmulbase_alt.S | 1387 +++++++++--- arm/curve25519/edwards25519_scalarmuldouble.S | 1401 +++++++++--- .../edwards25519_scalarmuldouble_alt.S | 1401 +++++++++--- .../curve25519/edwards25519_scalarmulbase.S | 1877 +++++++++++---- .../edwards25519_scalarmulbase_alt.S | 1877 +++++++++++---- .../curve25519/edwards25519_scalarmuldouble.S | 2005 ++++++++++++----- .../edwards25519_scalarmuldouble_alt.S | 2005 ++++++++++++----- 8 files changed, 9956 insertions(+), 3384 deletions(-) diff --git a/arm/curve25519/edwards25519_scalarmulbase.S b/arm/curve25519/edwards25519_scalarmulbase.S index 6ca092489f..8c9d0f9193 100644 --- a/arm/curve25519/edwards25519_scalarmulbase.S +++ b/arm/curve25519/edwards25519_scalarmulbase.S @@ -956,346 +956,1045 @@ edwards25519_scalarmulbase_scalarloop: // (X,Y,Z,W) back to the affine form (x,y) = (X/Z,Y/Z). This means // first calling the modular inverse to get w_3 = 1/z_3. - mov x0, 4 - add x1, w_3 - add x2, z_3 - adr x3, edwards25519_scalarmulbase_p_25519 - add x4, tmpspace - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". 
- - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -edwards25519_scalarmulbase_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmulbase_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -edwards25519_scalarmulbase_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmulbase_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -edwards25519_scalarmulbase_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, 
x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, edwards25519_scalarmulbase_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmulbase_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmulbase_wmontend -edwards25519_scalarmulbase_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_wmontloop -edwards25519_scalarmulbase_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmulbase_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -edwards25519_scalarmulbase_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str 
x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmulbase_zmontend -edwards25519_scalarmulbase_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_zmontloop -edwards25519_scalarmulbase_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmulbase_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -edwards25519_scalarmulbase_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc 
edwards25519_scalarmulbase_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmulbase_negskip1 -edwards25519_scalarmulbase_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmulbase_negloop1 -edwards25519_scalarmulbase_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmulbase_negskip2 -edwards25519_scalarmulbase_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmulbase_negloop2 -edwards25519_scalarmulbase_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -edwards25519_scalarmulbase_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -edwards25519_scalarmulbase_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_zfliploop - subs x2, x2, #0x3a - b.hi edwards25519_scalarmulbase_outerloop + add x0, w_3 + add x1, z_3 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. 
For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, w_3, x_3 +// and y_3. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b edwards25519_scalarmulbase_invmidloop +edwards25519_scalarmulbase_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + 
mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + 
adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, 
x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +edwards25519_scalarmulbase_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, 
#0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne 
+ cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr 
x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg 
x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel 
x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, 
x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne edwards25519_scalarmulbase_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc 
x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // The final result is x = X * inv(Z), y = Y * inv(Z). // These are the only operations in the whole computation that @@ -1322,14 +2021,6 @@ edwards25519_scalarmulbase_zfliploop: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmulbase_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 0 * B = 0 and 2^251 * B in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/arm/curve25519/edwards25519_scalarmulbase_alt.S b/arm/curve25519/edwards25519_scalarmulbase_alt.S index e8dd9114a4..03e5598f2c 100644 --- a/arm/curve25519/edwards25519_scalarmulbase_alt.S +++ b/arm/curve25519/edwards25519_scalarmulbase_alt.S @@ -798,346 +798,1045 @@ edwards25519_scalarmulbase_alt_scalarloop: // (X,Y,Z,W) back to the affine form (x,y) = (X/Z,Y/Z). This means // first calling the modular inverse to get w_3 = 1/z_3. - mov x0, 4 - add x1, w_3 - add x2, z_3 - adr x3, edwards25519_scalarmulbase_alt_p_25519 - add x4, tmpspace - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". 
- - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -edwards25519_scalarmulbase_alt_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmulbase_alt_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -edwards25519_scalarmulbase_alt_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_alt_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmulbase_alt_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -edwards25519_scalarmulbase_alt_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add 
x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, edwards25519_scalarmulbase_alt_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_alt_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmulbase_alt_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmulbase_alt_wmontend -edwards25519_scalarmulbase_alt_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_wmontloop -edwards25519_scalarmulbase_alt_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmulbase_alt_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -edwards25519_scalarmulbase_alt_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, 
[x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmulbase_alt_zmontend -edwards25519_scalarmulbase_alt_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_zmontloop -edwards25519_scalarmulbase_alt_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmulbase_alt_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -edwards25519_scalarmulbase_alt_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_alt_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, 
x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmulbase_alt_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmulbase_alt_negskip1 -edwards25519_scalarmulbase_alt_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmulbase_alt_negloop1 -edwards25519_scalarmulbase_alt_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmulbase_alt_negskip2 -edwards25519_scalarmulbase_alt_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmulbase_alt_negloop2 -edwards25519_scalarmulbase_alt_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -edwards25519_scalarmulbase_alt_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -edwards25519_scalarmulbase_alt_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_zfliploop - subs x2, x2, #0x3a - b.hi edwards25519_scalarmulbase_alt_outerloop + add x0, w_3 + add x1, z_3 + +// Inline copy of bignum_inv_p25519, identical except for stripping out 
+// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, w_3, x_3 +// and y_3. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b edwards25519_scalarmulbase_alt_invmidloop +edwards25519_scalarmulbase_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, 
[sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] 
+ extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, 
x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +edwards25519_scalarmulbase_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, 
x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, 
x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, 
#0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + 
add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, 
x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst 
x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne edwards25519_scalarmulbase_alt_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, 
[sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // The final result is x = X * inv(Z), y = Y * inv(Z). // These are the only operations in the whole computation that @@ -1164,14 +1863,6 @@ edwards25519_scalarmulbase_alt_zfliploop: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmulbase_alt_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 0 * B = 0 and 2^251 * B in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. 
diff --git a/arm/curve25519/edwards25519_scalarmuldouble.S b/arm/curve25519/edwards25519_scalarmuldouble.S index cd760f1212..00ea37eaaf 100644 --- a/arm/curve25519/edwards25519_scalarmuldouble.S +++ b/arm/curve25519/edwards25519_scalarmuldouble.S @@ -57,14 +57,14 @@ #define scalar sp, #(0*NUMSIZE) #define bscalar sp, #(1*NUMSIZE) -#define acc sp, #(2*NUMSIZE) -#define acc_x sp, #(2*NUMSIZE) -#define acc_y sp, #(3*NUMSIZE) -#define acc_z sp, #(4*NUMSIZE) -#define acc_w sp, #(5*NUMSIZE) +#define btabent sp, #(2*NUMSIZE) +#define acc sp, #(5*NUMSIZE) +#define acc_x sp, #(5*NUMSIZE) +#define acc_y sp, #(6*NUMSIZE) +#define acc_z sp, #(7*NUMSIZE) +#define acc_w sp, #(8*NUMSIZE) -#define tabent sp, #(6*NUMSIZE) -#define btabent sp, #(10*NUMSIZE) +#define tabent sp, #(9*NUMSIZE) #define tab sp, #(13*NUMSIZE) @@ -1872,347 +1872,1044 @@ edwards25519_scalarmuldouble_loop: // Modular inverse setup - mov x0, #4 - add x1, tabent - add x2, acc+64 - adr x3, edwards25519_scalarmuldouble_p25519 - add x4, btabent - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". 
- -edwards25519_scalarmuldouble_modinv: - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -edwards25519_scalarmuldouble_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmuldouble_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -edwards25519_scalarmuldouble_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmuldouble_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmuldouble_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -edwards25519_scalarmuldouble_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, 
cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, edwards25519_scalarmuldouble_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmuldouble_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmuldouble_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmuldouble_wmontend -edwards25519_scalarmuldouble_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_wmontloop -edwards25519_scalarmuldouble_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmuldouble_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -edwards25519_scalarmuldouble_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr 
x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmuldouble_zmontend -edwards25519_scalarmuldouble_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_zmontloop -edwards25519_scalarmuldouble_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmuldouble_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -edwards25519_scalarmuldouble_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmuldouble_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - 
sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmuldouble_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmuldouble_negskip1 -edwards25519_scalarmuldouble_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmuldouble_negloop1 -edwards25519_scalarmuldouble_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmuldouble_negskip2 -edwards25519_scalarmuldouble_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmuldouble_negloop2 -edwards25519_scalarmuldouble_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -edwards25519_scalarmuldouble_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -edwards25519_scalarmuldouble_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_zfliploop - subs x2, x2, #0x3a - b.hi edwards25519_scalarmuldouble_outerloop + add x0, tabent + add x1, acc+64 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue 
saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that its own temporaries +// occupy only the 128 bytes at [sp, sp+128), so it has no effect on variables +// that are needed in the rest of our computation here: res, acc, tabent. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b edwards25519_scalarmuldouble_invmidloop +edwards25519_scalarmuldouble_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 
+ umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + 
ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, 
x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +edwards25519_scalarmuldouble_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, 
x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + 
tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + 
csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, 
#1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge 
+ add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, 
ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne edwards25519_scalarmuldouble_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + 
sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // Store result. Note that these are the only reductions mod 2^255-19 @@ -2330,14 +3027,6 @@ edwards25519_scalarmuldouble_pepadd: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmuldouble_p25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. 
diff --git a/arm/curve25519/edwards25519_scalarmuldouble_alt.S b/arm/curve25519/edwards25519_scalarmuldouble_alt.S index c8fe77c31f..ad05eae1fb 100644 --- a/arm/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/arm/curve25519/edwards25519_scalarmuldouble_alt.S @@ -57,14 +57,14 @@ #define scalar sp, #(0*NUMSIZE) #define bscalar sp, #(1*NUMSIZE) -#define acc sp, #(2*NUMSIZE) -#define acc_x sp, #(2*NUMSIZE) -#define acc_y sp, #(3*NUMSIZE) -#define acc_z sp, #(4*NUMSIZE) -#define acc_w sp, #(5*NUMSIZE) +#define btabent sp, #(2*NUMSIZE) +#define acc sp, #(5*NUMSIZE) +#define acc_x sp, #(5*NUMSIZE) +#define acc_y sp, #(6*NUMSIZE) +#define acc_z sp, #(7*NUMSIZE) +#define acc_w sp, #(8*NUMSIZE) -#define tabent sp, #(6*NUMSIZE) -#define btabent sp, #(10*NUMSIZE) +#define tabent sp, #(9*NUMSIZE) #define tab sp, #(13*NUMSIZE) @@ -1656,347 +1656,1044 @@ edwards25519_scalarmuldouble_alt_loop: // Modular inverse setup - mov x0, #4 - add x1, tabent - add x2, acc+64 - adr x3, edwards25519_scalarmuldouble_alt_p25519 - add x4, btabent - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". 
- -edwards25519_scalarmuldouble_alt_modinv: - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -edwards25519_scalarmuldouble_alt_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmuldouble_alt_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -edwards25519_scalarmuldouble_alt_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmuldouble_alt_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmuldouble_alt_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -edwards25519_scalarmuldouble_alt_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, 
x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, edwards25519_scalarmuldouble_alt_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmuldouble_alt_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmuldouble_alt_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmuldouble_alt_wmontend -edwards25519_scalarmuldouble_alt_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_wmontloop -edwards25519_scalarmuldouble_alt_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmuldouble_alt_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr 
-edwards25519_scalarmuldouble_alt_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmuldouble_alt_zmontend -edwards25519_scalarmuldouble_alt_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_zmontloop -edwards25519_scalarmuldouble_alt_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmuldouble_alt_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -edwards25519_scalarmuldouble_alt_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmuldouble_alt_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, 
x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmuldouble_alt_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmuldouble_alt_negskip1 -edwards25519_scalarmuldouble_alt_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmuldouble_alt_negloop1 -edwards25519_scalarmuldouble_alt_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmuldouble_alt_negskip2 -edwards25519_scalarmuldouble_alt_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmuldouble_alt_negloop2 -edwards25519_scalarmuldouble_alt_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -edwards25519_scalarmuldouble_alt_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -edwards25519_scalarmuldouble_alt_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_zfliploop - subs x2, x2, #0x3a - b.hi 
edwards25519_scalarmuldouble_alt_outerloop + add x0, tabent + add x1, acc+64 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, acc, tabent. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b edwards25519_scalarmuldouble_alt_invmidloop +edwards25519_scalarmuldouble_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, 
x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and 
x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds 
x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +edwards25519_scalarmuldouble_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, 
#0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + 
add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + 
cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, 
#0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + 
csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + 
csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne edwards25519_scalarmuldouble_alt_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr 
x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // Store result. Note that these are the only reductions mod 2^255-19 @@ -2114,14 +2811,6 @@ edwards25519_scalarmuldouble_alt_pepadd: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmuldouble_alt_p25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. 
diff --git a/x86_att/curve25519/edwards25519_scalarmulbase.S b/x86_att/curve25519/edwards25519_scalarmulbase.S index a024c9daa4..c44e31724c 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase.S @@ -38,23 +38,22 @@ #define xpy_2 (2*NUMSIZE)(%rsp) #define kxy_2 (3*NUMSIZE)(%rsp) -#define acc (4*NUMSIZE)(%rsp) -#define x_1 (4*NUMSIZE)(%rsp) -#define y_1 (5*NUMSIZE)(%rsp) -#define z_1 (6*NUMSIZE)(%rsp) -#define w_1 (7*NUMSIZE)(%rsp) -#define x_3 (4*NUMSIZE)(%rsp) -#define y_3 (5*NUMSIZE)(%rsp) -#define z_3 (6*NUMSIZE)(%rsp) -#define w_3 (7*NUMSIZE)(%rsp) - -#define tmpspace (8*NUMSIZE)(%rsp) -#define t0 (8*NUMSIZE)(%rsp) -#define t1 (9*NUMSIZE)(%rsp) -#define t2 (10*NUMSIZE)(%rsp) -#define t3 (11*NUMSIZE)(%rsp) -#define t4 (12*NUMSIZE)(%rsp) -#define t5 (13*NUMSIZE)(%rsp) +#define t0 (4*NUMSIZE)(%rsp) +#define t1 (5*NUMSIZE)(%rsp) +#define t2 (6*NUMSIZE)(%rsp) +#define t3 (7*NUMSIZE)(%rsp) +#define t4 (8*NUMSIZE)(%rsp) +#define t5 (9*NUMSIZE)(%rsp) + +#define acc (10*NUMSIZE)(%rsp) +#define x_1 (10*NUMSIZE)(%rsp) +#define y_1 (11*NUMSIZE)(%rsp) +#define z_1 (12*NUMSIZE)(%rsp) +#define w_1 (13*NUMSIZE)(%rsp) +#define x_3 (10*NUMSIZE)(%rsp) +#define y_3 (11*NUMSIZE)(%rsp) +#define z_3 (12*NUMSIZE)(%rsp) +#define w_3 (13*NUMSIZE)(%rsp) // Stable homes for the input result pointer, and other variables @@ -73,6 +72,15 @@ #define NSPACE (15*NUMSIZE+8) +// Syntactic variants to make x86_att version simpler to generate + +#define SCALAR 0 +#define TABENT (1*NUMSIZE) +#define ACC (10*NUMSIZE) +#define X3 (10*NUMSIZE) +#define Z3 (12*NUMSIZE) +#define W3 (13*NUMSIZE) + // Macro wrapping up the basic field multiplication, only trivially // different from a pure function call to bignum_mul_p25519. 
@@ -337,12 +345,12 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase): pushq %rsi movq %rcx, %rdi movq %rdx, %rsi - callq edwards25519_scalarmulbase_curve25519_x25519base_standard + callq edwards25519_scalarmulbase_standard popq %rsi popq %rdi ret -edwards25519_scalarmulbase_curve25519_x25519base_standard: +edwards25519_scalarmulbase_standard: #endif // Save registers, make room for temps, preserve input arguments. @@ -413,11 +421,11 @@ edwards25519_scalarmulbase_curve25519_x25519base_standard: // And before we store the scalar, test and reset bit 251 to // initialize the main loop just below. - movq %r8, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) + movq %r8, SCALAR(%rsp) + movq %r9, SCALAR+8(%rsp) + movq %r10, SCALAR+16(%rsp) btr $59, %r11 - movq %r11, 24(%rsp) + movq %r11, SCALAR+24(%rsp) // The main part of the computation is in extended-projective coordinates // (X,Y,Z,T), representing an affine point on the edwards25519 curve @@ -428,75 +436,75 @@ edwards25519_scalarmulbase_curve25519_x25519base_standard: // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. 
- leaq edwards25519_scalarmulbase_edwards25519_0g(%rip), %r10 - leaq edwards25519_scalarmulbase_edwards25519_251g(%rip), %r11 + leaq edwards25519_scalarmulbase_0g(%rip), %r10 + leaq edwards25519_scalarmulbase_251g(%rip), %r11 movq (%r10), %rax movq (%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*16(%rsp) + movq %rax, ACC(%rsp) movq 8*1(%r10), %rax movq 8*1(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*17(%rsp) + movq %rax, ACC+8(%rsp) movq 8*2(%r10), %rax movq 8*2(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*18(%rsp) + movq %rax, ACC+16(%rsp) movq 8*3(%r10), %rax movq 8*3(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*19(%rsp) + movq %rax, ACC+24(%rsp) movq 8*4(%r10), %rax movq 8*4(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*20(%rsp) + movq %rax, ACC+32(%rsp) movq 8*5(%r10), %rax movq 8*5(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*21(%rsp) + movq %rax, ACC+40(%rsp) movq 8*6(%r10), %rax movq 8*6(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*22(%rsp) + movq %rax, ACC+48(%rsp) movq 8*7(%r10), %rax movq 8*7(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*23(%rsp) + movq %rax, ACC+56(%rsp) movl $1, %eax - movq %rax, 8*24(%rsp) + movq %rax, ACC+64(%rsp) movl $0, %eax - movq %rax, 8*25(%rsp) - movq %rax, 8*26(%rsp) - movq %rax, 8*27(%rsp) + movq %rax, ACC+72(%rsp) + movq %rax, ACC+80(%rsp) + movq %rax, ACC+88(%rsp) movq 8*8(%r10), %rax movq 8*8(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*28(%rsp) + movq %rax, ACC+96(%rsp) movq 8*9(%r10), %rax movq 8*9(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*29(%rsp) + movq %rax, ACC+104(%rsp) movq 8*10(%r10), %rax movq 8*10(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*30(%rsp) + movq %rax, ACC+112(%rsp) movq 8*11(%r10), %rax movq 8*11(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*31(%rsp) + movq %rax, ACC+120(%rsp) // The counter "i" tracks the bit position for which the scalar has // already been absorbed, starting at 0 and going up in chunks of 4. 
@@ -512,7 +520,7 @@ edwards25519_scalarmulbase_curve25519_x25519base_standard: // end because we made sure bit 251 is clear in the reduced scalar. movq $0, i - leaq edwards25519_scalarmulbase_edwards25519_gtable(%rip), %rax + leaq edwards25519_scalarmulbase_gtable(%rip), %rax movq %rax, tab movq $0, bias @@ -804,26 +812,26 @@ edwards25519_scalarmulbase_scalarloop: movq %rax, %rsi cmovnzq %r8, %rsi cmovnzq %rax, %r8 - movq %rsi, 32(%rsp) - movq %r8, 64(%rsp) + movq %rsi, TABENT(%rsp) + movq %r8, TABENT+32(%rsp) movq %rbx, %rsi cmovnzq %r9, %rsi cmovnzq %rbx, %r9 - movq %rsi, 40(%rsp) - movq %r9, 72(%rsp) + movq %rsi, TABENT+8(%rsp) + movq %r9, TABENT+40(%rsp) movq %rcx, %rsi cmovnzq %r10, %rsi cmovnzq %rcx, %r10 - movq %rsi, 48(%rsp) - movq %r10, 80(%rsp) + movq %rsi, TABENT+16(%rsp) + movq %r10, TABENT+48(%rsp) movq %rdx, %rsi cmovnzq %r11, %rsi cmovnzq %rdx, %r11 - movq %rsi, 56(%rsp) - movq %r11, 88(%rsp) + movq %rsi, TABENT+24(%rsp) + movq %r11, TABENT+56(%rsp) movq $-19, %rax movq $-1, %rbx @@ -844,10 +852,10 @@ edwards25519_scalarmulbase_scalarloop: cmovzq %r13, %rbx cmovzq %r14, %rcx cmovzq %r15, %rdx - movq %rax, 96(%rsp) - movq %rbx, 104(%rsp) - movq %rcx, 112(%rsp) - movq %rdx, 120(%rsp) + movq %rax, TABENT+64(%rsp) + movq %rbx, TABENT+72(%rsp) + movq %rcx, TABENT+80(%rsp) + movq %rdx, TABENT+88(%rsp) // Extended-projective and precomputed mixed addition. // This is effectively the same as calling the standalone @@ -884,10 +892,10 @@ edwards25519_scalarmulbase_scalarloop: // point on we don't need any normalization of the coordinates // except for making sure that they fit in 4 digits. 
- movq 128(%rsp), %r8 - movq 136(%rsp), %r9 - movq 144(%rsp), %r10 - movq 152(%rsp), %r11 + movq X3(%rsp), %r8 + movq X3+8(%rsp), %r9 + movq X3+16(%rsp), %r10 + movq X3+24(%rsp), %r11 movq $0xffffffffffffffda, %r12 subq %r8, %r12 movq $0xffffffffffffffff, %r13 @@ -896,424 +904,1377 @@ edwards25519_scalarmulbase_scalarloop: sbbq %r10, %r14 movq $0xffffffffffffffff, %r15 sbbq %r11, %r15 - movq 24(%rsp), %rax + movq SCALAR+24(%rsp), %rax btq $63, %rax cmovcq %r12, %r8 cmovcq %r13, %r9 cmovcq %r14, %r10 cmovcq %r15, %r11 - movq %r8, 128(%rsp) - movq %r9, 136(%rsp) - movq %r10, 144(%rsp) - movq %r11, 152(%rsp) + movq %r8, X3(%rsp) + movq %r9, X3+8(%rsp) + movq %r10, X3+16(%rsp) + movq %r11, X3+24(%rsp) // Now we need to map out of the extended-projective representation // (X,Y,Z,W) back to the affine form (x,y) = (X/Z,Y/Z). This means // first calling the modular inverse to get w_3 = 1/z_3. - movq $4, %rdi - leaq 224(%rsp), %rsi - leaq 192(%rsp), %rdx - leaq edwards25519_scalarmulbase_p_25519(%rip), %rcx - leaq 256(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. 
- - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + leaq W3(%rsp), %rdi + leaq Z3(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, x_3, y_3, +// z_3 and w_3. 
+ + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -edwards25519_scalarmulbase_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -edwards25519_scalarmulbase_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -edwards25519_scalarmulbase_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, 
%rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp edwards25519_scalarmulbase_midloop +edwards25519_scalarmulbase_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne edwards25519_scalarmulbase_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, 
%rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - 
movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmulbase_wmontend -edwards25519_scalarmulbase_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_wmontloop -edwards25519_scalarmulbase_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne 
edwards25519_scalarmulbase_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_wcorrloop + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmulbase_zmontend -edwards25519_scalarmulbase_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_zmontloop -edwards25519_scalarmulbase_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 
- movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -edwards25519_scalarmulbase_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 
0x78(%rsp) +edwards25519_scalarmulbase_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -edwards25519_scalarmulbase_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -edwards25519_scalarmulbase_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne edwards25519_scalarmulbase_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, 
%rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_fliploop - subq $0x3a, 0x20(%rsp) - ja edwards25519_scalarmulbase_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq 
$1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq 
%rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq 
%r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq 
%rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi 
+ testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), 
%rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq 
%rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq 
%r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne edwards25519_scalarmulbase_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq 
%r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // The final result is x = X * inv(Z), y = Y * inv(Z). // These are the only operations in the whole computation that @@ -1344,18 +2305,10 @@ edwards25519_scalarmulbase_fliploop: // .section .rodata // **************************************************************************** -// The modulus, for the modular inverse - -edwards25519_scalarmulbase_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 0 * B = 0 and 2^251 * B in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. -edwards25519_scalarmulbase_edwards25519_0g: +edwards25519_scalarmulbase_0g: .quad 0x0000000000000000 .quad 0x0000000000000000 @@ -1372,7 +2325,7 @@ edwards25519_scalarmulbase_edwards25519_0g: .quad 0x0000000000000000 .quad 0x0000000000000000 -edwards25519_scalarmulbase_edwards25519_251g: +edwards25519_scalarmulbase_251g: .quad 0x525f946d7c7220e7 .quad 0x4636b0b2f1e35444 @@ -1390,7 +2343,7 @@ edwards25519_scalarmulbase_edwards25519_251g: // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. 
-edwards25519_scalarmulbase_edwards25519_gtable: +edwards25519_scalarmulbase_gtable: // 2^0 * 1 * G diff --git a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S index e66492083f..00b91fe1aa 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S @@ -38,23 +38,22 @@ #define xpy_2 (2*NUMSIZE)(%rsp) #define kxy_2 (3*NUMSIZE)(%rsp) -#define acc (4*NUMSIZE)(%rsp) -#define x_1 (4*NUMSIZE)(%rsp) -#define y_1 (5*NUMSIZE)(%rsp) -#define z_1 (6*NUMSIZE)(%rsp) -#define w_1 (7*NUMSIZE)(%rsp) -#define x_3 (4*NUMSIZE)(%rsp) -#define y_3 (5*NUMSIZE)(%rsp) -#define z_3 (6*NUMSIZE)(%rsp) -#define w_3 (7*NUMSIZE)(%rsp) - -#define tmpspace (8*NUMSIZE)(%rsp) -#define t0 (8*NUMSIZE)(%rsp) -#define t1 (9*NUMSIZE)(%rsp) -#define t2 (10*NUMSIZE)(%rsp) -#define t3 (11*NUMSIZE)(%rsp) -#define t4 (12*NUMSIZE)(%rsp) -#define t5 (13*NUMSIZE)(%rsp) +#define t0 (4*NUMSIZE)(%rsp) +#define t1 (5*NUMSIZE)(%rsp) +#define t2 (6*NUMSIZE)(%rsp) +#define t3 (7*NUMSIZE)(%rsp) +#define t4 (8*NUMSIZE)(%rsp) +#define t5 (9*NUMSIZE)(%rsp) + +#define acc (10*NUMSIZE)(%rsp) +#define x_1 (10*NUMSIZE)(%rsp) +#define y_1 (11*NUMSIZE)(%rsp) +#define z_1 (12*NUMSIZE)(%rsp) +#define w_1 (13*NUMSIZE)(%rsp) +#define x_3 (10*NUMSIZE)(%rsp) +#define y_3 (11*NUMSIZE)(%rsp) +#define z_3 (12*NUMSIZE)(%rsp) +#define w_3 (13*NUMSIZE)(%rsp) // Stable homes for the input result pointer, and other variables @@ -73,6 +72,15 @@ #define NSPACE (15*NUMSIZE+8) +// Syntactic variants to make x86_att version simpler to generate + +#define SCALAR 0 +#define TABENT (1*NUMSIZE) +#define ACC (10*NUMSIZE) +#define X3 (10*NUMSIZE) +#define Z3 (12*NUMSIZE) +#define W3 (13*NUMSIZE) + // Macro wrapping up the basic field multiplication, only trivially // different from a pure function call to bignum_mul_p25519_alt. 
@@ -413,12 +421,12 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase_alt): pushq %rsi movq %rcx, %rdi movq %rdx, %rsi - callq edwards25519_scalarmulbase_alt_curve25519_x25519base_standard + callq edwards25519_scalarmulbase_alt_standard popq %rsi popq %rdi ret -edwards25519_scalarmulbase_alt_curve25519_x25519base_standard: +edwards25519_scalarmulbase_alt_standard: #endif // Save registers, make room for temps, preserve input arguments. @@ -489,11 +497,11 @@ edwards25519_scalarmulbase_alt_curve25519_x25519base_standard: // And before we store the scalar, test and reset bit 251 to // initialize the main loop just below. - movq %r8, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) + movq %r8, SCALAR(%rsp) + movq %r9, SCALAR+8(%rsp) + movq %r10, SCALAR+16(%rsp) btr $59, %r11 - movq %r11, 24(%rsp) + movq %r11, SCALAR+24(%rsp) // The main part of the computation is in extended-projective coordinates // (X,Y,Z,T), representing an affine point on the edwards25519 curve @@ -504,75 +512,75 @@ edwards25519_scalarmulbase_alt_curve25519_x25519base_standard: // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. 
- leaq edwards25519_scalarmulbase_alt_edwards25519_0g(%rip), %r10 - leaq edwards25519_scalarmulbase_alt_edwards25519_251g(%rip), %r11 + leaq edwards25519_scalarmulbase_alt_0g(%rip), %r10 + leaq edwards25519_scalarmulbase_alt_251g(%rip), %r11 movq (%r10), %rax movq (%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*16(%rsp) + movq %rax, ACC(%rsp) movq 8*1(%r10), %rax movq 8*1(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*17(%rsp) + movq %rax, ACC+8(%rsp) movq 8*2(%r10), %rax movq 8*2(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*18(%rsp) + movq %rax, ACC+16(%rsp) movq 8*3(%r10), %rax movq 8*3(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*19(%rsp) + movq %rax, ACC+24(%rsp) movq 8*4(%r10), %rax movq 8*4(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*20(%rsp) + movq %rax, ACC+32(%rsp) movq 8*5(%r10), %rax movq 8*5(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*21(%rsp) + movq %rax, ACC+40(%rsp) movq 8*6(%r10), %rax movq 8*6(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*22(%rsp) + movq %rax, ACC+48(%rsp) movq 8*7(%r10), %rax movq 8*7(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*23(%rsp) + movq %rax, ACC+56(%rsp) movl $1, %eax - movq %rax, 8*24(%rsp) + movq %rax, ACC+64(%rsp) movl $0, %eax - movq %rax, 8*25(%rsp) - movq %rax, 8*26(%rsp) - movq %rax, 8*27(%rsp) + movq %rax, ACC+72(%rsp) + movq %rax, ACC+80(%rsp) + movq %rax, ACC+88(%rsp) movq 8*8(%r10), %rax movq 8*8(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*28(%rsp) + movq %rax, ACC+96(%rsp) movq 8*9(%r10), %rax movq 8*9(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*29(%rsp) + movq %rax, ACC+104(%rsp) movq 8*10(%r10), %rax movq 8*10(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*30(%rsp) + movq %rax, ACC+112(%rsp) movq 8*11(%r10), %rax movq 8*11(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*31(%rsp) + movq %rax, ACC+120(%rsp) // The counter "i" tracks the bit position for which the scalar has // already been absorbed, starting at 0 and going up in chunks of 4. 
@@ -588,7 +596,7 @@ edwards25519_scalarmulbase_alt_curve25519_x25519base_standard: // end because we made sure bit 251 is clear in the reduced scalar. movq $0, i - leaq edwards25519_scalarmulbase_alt_edwards25519_gtable(%rip), %rax + leaq edwards25519_scalarmulbase_alt_gtable(%rip), %rax movq %rax, tab movq $0, bias @@ -880,26 +888,26 @@ edwards25519_scalarmulbase_alt_scalarloop: movq %rax, %rsi cmovnzq %r8, %rsi cmovnzq %rax, %r8 - movq %rsi, 32(%rsp) - movq %r8, 64(%rsp) + movq %rsi, TABENT(%rsp) + movq %r8, TABENT+32(%rsp) movq %rbx, %rsi cmovnzq %r9, %rsi cmovnzq %rbx, %r9 - movq %rsi, 40(%rsp) - movq %r9, 72(%rsp) + movq %rsi, TABENT+8(%rsp) + movq %r9, TABENT+40(%rsp) movq %rcx, %rsi cmovnzq %r10, %rsi cmovnzq %rcx, %r10 - movq %rsi, 48(%rsp) - movq %r10, 80(%rsp) + movq %rsi, TABENT+16(%rsp) + movq %r10, TABENT+48(%rsp) movq %rdx, %rsi cmovnzq %r11, %rsi cmovnzq %rdx, %r11 - movq %rsi, 56(%rsp) - movq %r11, 88(%rsp) + movq %rsi, TABENT+24(%rsp) + movq %r11, TABENT+56(%rsp) movq $-19, %rax movq $-1, %rbx @@ -920,10 +928,10 @@ edwards25519_scalarmulbase_alt_scalarloop: cmovzq %r13, %rbx cmovzq %r14, %rcx cmovzq %r15, %rdx - movq %rax, 96(%rsp) - movq %rbx, 104(%rsp) - movq %rcx, 112(%rsp) - movq %rdx, 120(%rsp) + movq %rax, TABENT+64(%rsp) + movq %rbx, TABENT+72(%rsp) + movq %rcx, TABENT+80(%rsp) + movq %rdx, TABENT+88(%rsp) // Extended-projective and precomputed mixed addition. // This is effectively the same as calling the standalone @@ -960,10 +968,10 @@ edwards25519_scalarmulbase_alt_scalarloop: // point on we don't need any normalization of the coordinates // except for making sure that they fit in 4 digits. 
- movq 128(%rsp), %r8 - movq 136(%rsp), %r9 - movq 144(%rsp), %r10 - movq 152(%rsp), %r11 + movq X3(%rsp), %r8 + movq X3+8(%rsp), %r9 + movq X3+16(%rsp), %r10 + movq X3+24(%rsp), %r11 movq $0xffffffffffffffda, %r12 subq %r8, %r12 movq $0xffffffffffffffff, %r13 @@ -972,424 +980,1377 @@ edwards25519_scalarmulbase_alt_scalarloop: sbbq %r10, %r14 movq $0xffffffffffffffff, %r15 sbbq %r11, %r15 - movq 24(%rsp), %rax + movq SCALAR+24(%rsp), %rax btq $63, %rax cmovcq %r12, %r8 cmovcq %r13, %r9 cmovcq %r14, %r10 cmovcq %r15, %r11 - movq %r8, 128(%rsp) - movq %r9, 136(%rsp) - movq %r10, 144(%rsp) - movq %r11, 152(%rsp) + movq %r8, X3(%rsp) + movq %r9, X3+8(%rsp) + movq %r10, X3+16(%rsp) + movq %r11, X3+24(%rsp) // Now we need to map out of the extended-projective representation // (X,Y,Z,W) back to the affine form (x,y) = (X/Z,Y/Z). This means // first calling the modular inverse to get w_3 = 1/z_3. - movq $4, %rdi - leaq 224(%rsp), %rsi - leaq 192(%rsp), %rdx - leaq edwards25519_scalarmulbase_alt_p_25519(%rip), %rcx - leaq 256(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. 
- - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_alt_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + leaq W3(%rsp), %rdi + leaq Z3(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, x_3, y_3, +// z_3 and w_3. 
+ + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -edwards25519_scalarmulbase_alt_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_alt_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -edwards25519_scalarmulbase_alt_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq 
$0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp edwards25519_scalarmulbase_alt_midloop +edwards25519_scalarmulbase_alt_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne edwards25519_scalarmulbase_alt_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx 
+ movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_alt_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 
0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmulbase_alt_wmontend -edwards25519_scalarmulbase_alt_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_alt_wmontloop -edwards25519_scalarmulbase_alt_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_wcmploop: - movq (%r8,%r9,8), %rax - sbbq 
(%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_alt_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_alt_wcorrloop + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmulbase_alt_zmontend -edwards25519_scalarmulbase_alt_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_alt_zmontloop -edwards25519_scalarmulbase_alt_zmontend: - 
adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_alt_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_alt_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -edwards25519_scalarmulbase_alt_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + 
movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +edwards25519_scalarmulbase_alt_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_alt_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -edwards25519_scalarmulbase_alt_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_alt_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -edwards25519_scalarmulbase_alt_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne edwards25519_scalarmulbase_alt_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, 
%rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_alt_fliploop - subq $0x3a, 0x20(%rsp) - ja edwards25519_scalarmulbase_alt_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + 
leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq 
%rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs 
%rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + 
cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + 
btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, 
%rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, 
%rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + 
cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne edwards25519_scalarmulbase_alt_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), 
%rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // The final result is x = X * inv(Z), y = Y * inv(Z). // These are the only operations in the whole computation that @@ -1420,18 +2381,10 @@ edwards25519_scalarmulbase_alt_fliploop: // .section .rodata // **************************************************************************** -// The modulus, for the modular inverse - -edwards25519_scalarmulbase_alt_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 0 * B = 0 and 2^251 * B in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. 
-edwards25519_scalarmulbase_alt_edwards25519_0g: +edwards25519_scalarmulbase_alt_0g: .quad 0x0000000000000000 .quad 0x0000000000000000 @@ -1448,7 +2401,7 @@ edwards25519_scalarmulbase_alt_edwards25519_0g: .quad 0x0000000000000000 .quad 0x0000000000000000 -edwards25519_scalarmulbase_alt_edwards25519_251g: +edwards25519_scalarmulbase_alt_251g: .quad 0x525f946d7c7220e7 .quad 0x4636b0b2f1e35444 @@ -1466,7 +2419,7 @@ edwards25519_scalarmulbase_alt_edwards25519_251g: // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. -edwards25519_scalarmulbase_alt_edwards25519_gtable: +edwards25519_scalarmulbase_alt_gtable: // 2^0 * 1 * G diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble.S b/x86_att/curve25519/edwards25519_scalarmuldouble.S index 0138d1a4b2..35fd7f4ffc 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble.S @@ -42,24 +42,33 @@ #define scalar (0*NUMSIZE)(%rsp) #define bscalar (1*NUMSIZE)(%rsp) -#define acc (3*NUMSIZE)(%rsp) +#define tabent (2*NUMSIZE)(%rsp) +#define btabent (6*NUMSIZE)(%rsp) -#define tabent (7*NUMSIZE)(%rsp) -#define btabent (11*NUMSIZE)(%rsp) +#define acc (9*NUMSIZE)(%rsp) -#define tab (14*NUMSIZE)(%rsp) +#define tab (13*NUMSIZE)(%rsp) // Additional variables kept on the stack -#define bf 2*NUMSIZE(%rsp) -#define cf 2*NUMSIZE+8(%rsp) -#define i 2*NUMSIZE+16(%rsp) -#define res 2*NUMSIZE+24(%rsp) +#define bf 45*NUMSIZE(%rsp) +#define cf 45*NUMSIZE+8(%rsp) +#define i 45*NUMSIZE+16(%rsp) +#define res 45*NUMSIZE+24(%rsp) // Total size to reserve on the stack (excluding local subroutines) #define NSPACE (46*NUMSIZE) +// Syntactic variants to make x86_att forms easier to generate + +#define SCALAR (0*NUMSIZE) +#define BSCALAR (1*NUMSIZE) +#define TABENT (2*NUMSIZE) +#define BTABENT (6*NUMSIZE) +#define ACC (9*NUMSIZE) +#define TAB (13*NUMSIZE) + // Sub-references used in local subroutines with local stack 
#define x_0 0(%rdi) @@ -493,10 +502,10 @@ edwards25519_scalarmuldouble_standard: adcq %r13, %r9 adcq %r14, %r10 adcq %r15, %r11 - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq %r10, 48(%rsp) - movq %r11, 56(%rsp) + movq %r8, BSCALAR(%rsp) + movq %r9, BSCALAR+8(%rsp) + movq %r10, BSCALAR+16(%rsp) + movq %r11, BSCALAR+24(%rsp) movq (%rsi), %r8 movq 8(%rsi), %r9 @@ -517,10 +526,10 @@ edwards25519_scalarmuldouble_standard: adcq %r13, %r9 adcq %r14, %r10 adcq %r15, %r11 - movq %r8, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) + movq %r8, SCALAR(%rsp) + movq %r9, SCALAR+8(%rsp) + movq %r10, SCALAR+16(%rsp) + movq %r11, SCALAR+24(%rsp) // Create table of multiples 1..8 of the general input point at "tab". // Reduce the input coordinates x and y modulo 2^256 - 38 first, for the @@ -541,13 +550,13 @@ edwards25519_scalarmuldouble_standard: adcq %r10, %rcx adcq %r11, %rsi cmovncq %r8, %rax - movq %rax, 448(%rsp) + movq %rax, TAB(%rsp) cmovncq %r9, %rbx - movq %rbx, 456(%rsp) + movq %rbx, TAB+8(%rsp) cmovncq %r10, %rcx - movq %rcx, 464(%rsp) + movq %rcx, TAB+16(%rsp) cmovncq %r11, %rsi - movq %rsi, 472(%rsp) + movq %rsi, TAB+24(%rsp) movl $38, %eax movq 32(%rdx), %r8 @@ -562,69 +571,69 @@ edwards25519_scalarmuldouble_standard: adcq %r10, %rcx adcq %r11, %rsi cmovncq %r8, %rax - movq %rax, 480(%rsp) + movq %rax, TAB+32(%rsp) cmovncq %r9, %rbx - movq %rbx, 488(%rsp) + movq %rbx, TAB+40(%rsp) cmovncq %r10, %rcx - movq %rcx, 496(%rsp) + movq %rcx, TAB+48(%rsp) cmovncq %r11, %rsi - movq %rsi, 504(%rsp) + movq %rsi, TAB+56(%rsp) movl $1, %eax - movq %rax, 512(%rsp) + movq %rax, TAB+64(%rsp) xorl %eax, %eax - movq %rax, 520(%rsp) - movq %rax, 528(%rsp) - movq %rax, 536(%rsp) + movq %rax, TAB+72(%rsp) + movq %rax, TAB+80(%rsp) + movq %rax, TAB+88(%rsp) - leaq 544(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 480(%rsp), %rbp + leaq TAB+96(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+32(%rsp), %rbp mul_4(x_0,x_1,x_2) // Multiple 2 - leaq 576(%rsp), %rdi - leaq 
448(%rsp), %rsi + leaq TAB+1*128(%rsp), %rdi + leaq TAB(%rsp), %rsi callq edwards25519_scalarmuldouble_epdouble // Multiple 3 - leaq 704(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 576(%rsp), %rbp + leaq TAB+2*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+1*128(%rsp), %rbp callq edwards25519_scalarmuldouble_epadd // Multiple 4 - leaq 832(%rsp), %rdi - leaq 576(%rsp), %rsi + leaq TAB+3*128(%rsp), %rdi + leaq TAB+1*128(%rsp), %rsi callq edwards25519_scalarmuldouble_epdouble // Multiple 5 - leaq 960(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 832(%rsp), %rbp + leaq TAB+4*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+3*128(%rsp), %rbp callq edwards25519_scalarmuldouble_epadd // Multiple 6 - leaq 1088(%rsp), %rdi - leaq 704(%rsp), %rsi + leaq TAB+5*128(%rsp), %rdi + leaq TAB+2*128(%rsp), %rsi callq edwards25519_scalarmuldouble_epdouble // Multiple 7 - leaq 1216(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 1088(%rsp), %rbp + leaq TAB+6*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+5*128(%rsp), %rbp callq edwards25519_scalarmuldouble_epadd // Multiple 8 - leaq 1344(%rsp), %rdi - leaq 832(%rsp), %rsi + leaq TAB+7*128(%rsp), %rdi + leaq TAB+3*128(%rsp), %rsi callq edwards25519_scalarmuldouble_epdouble // Handle the initialization, starting the loop counter at i = 252 @@ -636,7 +645,7 @@ edwards25519_scalarmuldouble_standard: // Index for btable entry... 
- movq 56(%rsp), %rax + movq BSCALAR+24(%rsp), %rax shrq $60, %rax movq %rax, bf @@ -872,22 +881,22 @@ edwards25519_scalarmuldouble_standard: movq 88(%rbp), %rsi cmovzq %rsi, %r15 - movq %rax, 352(%rsp) - movq %rbx, 360(%rsp) - movq %rcx, 368(%rsp) - movq %rdx, 376(%rsp) - movq %r8, 384(%rsp) - movq %r9, 392(%rsp) - movq %r10, 400(%rsp) - movq %r11, 408(%rsp) - movq %r12, 416(%rsp) - movq %r13, 424(%rsp) - movq %r14, 432(%rsp) - movq %r15, 440(%rsp) + movq %rax, BTABENT(%rsp) + movq %rbx, BTABENT+8(%rsp) + movq %rcx, BTABENT+16(%rsp) + movq %rdx, BTABENT+24(%rsp) + movq %r8, BTABENT+32(%rsp) + movq %r9, BTABENT+40(%rsp) + movq %r10, BTABENT+48(%rsp) + movq %r11, BTABENT+56(%rsp) + movq %r12, BTABENT+64(%rsp) + movq %r13, BTABENT+72(%rsp) + movq %r14, BTABENT+80(%rsp) + movq %r15, BTABENT+88(%rsp) // Index for table entry... - movq 24(%rsp), %rax + movq SCALAR+24(%rsp), %rax shrq $60, %rax movq %rax, bf @@ -903,7 +912,7 @@ edwards25519_scalarmuldouble_standard: xorl %r10d, %r10d xorl %r11d, %r11d - leaq 480(%rsp), %rbp + leaq TAB+32(%rsp), %rbp cmpq $1, bf movq (%rbp), %rsi @@ -1056,18 +1065,18 @@ edwards25519_scalarmuldouble_standard: movq 56(%rbp), %rsi cmovzq %rsi, %r11 - movq %rax, 256(%rsp) - movq %rbx, 264(%rsp) - movq %rcx, 272(%rsp) - movq %rdx, 280(%rsp) - movq %r8, 288(%rsp) - movq %r9, 296(%rsp) - movq %r10, 304(%rsp) - movq %r11, 312(%rsp) + movq %rax, TABENT+32(%rsp) + movq %rbx, TABENT+40(%rsp) + movq %rcx, TABENT+48(%rsp) + movq %rdx, TABENT+56(%rsp) + movq %r8, TABENT+64(%rsp) + movq %r9, TABENT+72(%rsp) + movq %r10, TABENT+80(%rsp) + movq %r11, TABENT+88(%rsp) // ...followed by the X and W fields - leaq 448(%rsp), %rbp + leaq TAB(%rsp), %rbp xorl %eax, %eax xorl %ebx, %ebx @@ -1229,20 +1238,20 @@ edwards25519_scalarmuldouble_standard: movq 120(%rbp), %rsi cmovzq %rsi, %r11 - movq %rax, 224(%rsp) - movq %rbx, 232(%rsp) - movq %rcx, 240(%rsp) - movq %rdx, 248(%rsp) - movq %r8, 320(%rsp) - movq %r9, 328(%rsp) - movq %r10, 336(%rsp) - movq %r11, 
344(%rsp) + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + movq %r8, TABENT+96(%rsp) + movq %r9, TABENT+104(%rsp) + movq %r10, TABENT+112(%rsp) + movq %r11, TABENT+120(%rsp) // Add those elements to initialize the accumulator for bit position 252 - leaq 96(%rsp), %rdi - leaq 224(%rsp), %rsi - leaq 352(%rsp), %rbp + leaq ACC(%rsp), %rdi + leaq TABENT(%rsp), %rsi + leaq BTABENT(%rsp), %rbp callq edwards25519_scalarmuldouble_pepadd // Main loop with acc = [scalar/2^i] * point + [bscalar/2^i] * basepoint @@ -1256,8 +1265,8 @@ edwards25519_scalarmuldouble_loop: // Double to acc' = 2 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_pdouble // Get btable entry, first getting the adjusted bitfield... @@ -1528,26 +1537,26 @@ edwards25519_scalarmuldouble_loop: movq %rax, %rsi cmovnzq %r8, %rsi cmovnzq %rax, %r8 - movq %rsi, 352(%rsp) - movq %r8, 384(%rsp) + movq %rsi, BTABENT(%rsp) + movq %r8, BTABENT+32(%rsp) movq %rbx, %rsi cmovnzq %r9, %rsi cmovnzq %rbx, %r9 - movq %rsi, 360(%rsp) - movq %r9, 392(%rsp) + movq %rsi, BTABENT+8(%rsp) + movq %r9, BTABENT+40(%rsp) movq %rcx, %rsi cmovnzq %r10, %rsi cmovnzq %rcx, %r10 - movq %rsi, 368(%rsp) - movq %r10, 400(%rsp) + movq %rsi, BTABENT+16(%rsp) + movq %r10, BTABENT+48(%rsp) movq %rdx, %rsi cmovnzq %r11, %rsi cmovnzq %rdx, %r11 - movq %rsi, 376(%rsp) - movq %r11, 408(%rsp) + movq %rsi, BTABENT+24(%rsp) + movq %r11, BTABENT+56(%rsp) xorq %rdi, %r12 xorq %rdi, %r13 @@ -1558,10 +1567,10 @@ edwards25519_scalarmuldouble_loop: sbbq $0, %r13 sbbq $0, %r14 sbbq $0, %r15 - movq %r12, 416(%rsp) - movq %r13, 424(%rsp) - movq %r14, 432(%rsp) - movq %r15, 440(%rsp) + movq %r12, BTABENT+64(%rsp) + movq %r13, BTABENT+72(%rsp) + movq %r14, BTABENT+80(%rsp) + movq %r15, BTABENT+88(%rsp) // Get table entry, first getting the adjusted bitfield... 
@@ -1592,7 +1601,7 @@ edwards25519_scalarmuldouble_loop: xorl %r10d, %r10d xorl %r11d, %r11d - leaq 480(%rsp), %rbp + leaq TAB+32(%rsp), %rbp cmpq $1, bf movq (%rbp), %rsi @@ -1745,18 +1754,18 @@ edwards25519_scalarmuldouble_loop: movq 56(%rbp), %rsi cmovzq %rsi, %r11 - movq %rax, 256(%rsp) - movq %rbx, 264(%rsp) - movq %rcx, 272(%rsp) - movq %rdx, 280(%rsp) - movq %r8, 288(%rsp) - movq %r9, 296(%rsp) - movq %r10, 304(%rsp) - movq %r11, 312(%rsp) + movq %rax, TABENT+32(%rsp) + movq %rbx, TABENT+40(%rsp) + movq %rcx, TABENT+48(%rsp) + movq %rdx, TABENT+56(%rsp) + movq %r8, TABENT+64(%rsp) + movq %r9, TABENT+72(%rsp) + movq %r10, TABENT+80(%rsp) + movq %r11, TABENT+88(%rsp) // Now do the X and W fields... - leaq 448(%rsp), %rbp + leaq TAB(%rsp), %rbp xorl %eax, %eax xorl %ebx, %ebx @@ -1950,51 +1959,51 @@ edwards25519_scalarmuldouble_loop: sbbq $0, %rcx sbbq $0, %rdx - movq %rax, 224(%rsp) - movq %rbx, 232(%rsp) - movq %rcx, 240(%rsp) - movq %rdx, 248(%rsp) + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) subq %rdi, %r8 sbbq $0, %r9 sbbq $0, %r10 sbbq $0, %r11 - movq %r8, 320(%rsp) - movq %r9, 328(%rsp) - movq %r10, 336(%rsp) - movq %r11, 344(%rsp) + movq %r8, TABENT+96(%rsp) + movq %r9, TABENT+104(%rsp) + movq %r10, TABENT+112(%rsp) + movq %r11, TABENT+120(%rsp) // Double to acc' = 4 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_pdouble // Add tabent := tabent + btabent - leaq 224(%rsp), %rdi - leaq 224(%rsp), %rsi - leaq 352(%rsp), %rbp + leaq TABENT(%rsp), %rdi + leaq TABENT(%rsp), %rsi + leaq BTABENT(%rsp), %rbp callq edwards25519_scalarmuldouble_pepadd // Double to acc' = 8 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_pdouble // Double to acc' = 16 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq 
ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_epdouble // Add table entry, acc := acc + tabent - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 224(%rsp), %rbp + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + leaq TABENT(%rsp), %rbp callq edwards25519_scalarmuldouble_epadd // Loop down @@ -2003,423 +2012,1375 @@ edwards25519_scalarmuldouble_loop: testq %rax, %rax jnz edwards25519_scalarmuldouble_loop -// Modular inverse setup +// Prepare to call the modular inverse function to get tab = 1/z - movq $4, %rdi - leaq 224(%rsp), %rsi - leaq 160(%rsp), %rdx - leaq edwards25519_scalarmuldouble_p25519(%rip), %rcx - leaq 352(%rsp), %r8 + leaq TAB(%rsp), %rdi + leaq ACC+64(%rsp), %rsi -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, tab and acc. 
- movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmuldouble_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -edwards25519_scalarmuldouble_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -edwards25519_scalarmuldouble_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmuldouble_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, 
%rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -edwards25519_scalarmuldouble_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519_midloop +curve25519_x25519_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - 
cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne edwards25519_scalarmuldouble_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmuldouble_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + 
shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmuldouble_wmontend -edwards25519_scalarmuldouble_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq 
%r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_wmontloop -edwards25519_scalarmuldouble_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_wcorrloop + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + 
movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmuldouble_zmontend -edwards25519_scalarmuldouble_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_zmontloop -edwards25519_scalarmuldouble_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -edwards25519_scalarmuldouble_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq 
$0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmuldouble_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -edwards25519_scalarmuldouble_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - 
cmpq %r13, %r9 - jb edwards25519_scalarmuldouble_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -edwards25519_scalarmuldouble_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne edwards25519_scalarmuldouble_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -edwards25519_scalarmuldouble_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_fliploop - subq $0x3a, 0x20(%rsp) - ja edwards25519_scalarmuldouble_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq 
%rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx 
+ cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + 
movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, 
%rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq 
$1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + 
movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + 
xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + 
subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + 
xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // Store result movq res, %rdi - leaq 96(%rsp), %rsi - leaq 224(%rsp), %rbp + leaq ACC(%rsp), %rsi + leaq TAB(%rsp), %rbp mul_p25519(x_0,x_1,x_2) movq res, %rdi addq $32, %rdi - leaq 128(%rsp), %rsi - leaq 224(%rsp), %rbp + leaq ACC+32(%rsp), %rsi + leaq TAB(%rsp), %rbp mul_p25519(x_0,x_1,x_2) // Restore stack and registers @@ -2528,14 +3489,6 @@ edwards25519_scalarmuldouble_pepadd: // .section .rodata // 
**************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmuldouble_p25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S index 7f3dffa395..e17d10b47a 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S @@ -42,24 +42,33 @@ #define scalar (0*NUMSIZE)(%rsp) #define bscalar (1*NUMSIZE)(%rsp) -#define acc (3*NUMSIZE)(%rsp) +#define tabent (2*NUMSIZE)(%rsp) +#define btabent (6*NUMSIZE)(%rsp) -#define tabent (7*NUMSIZE)(%rsp) -#define btabent (11*NUMSIZE)(%rsp) +#define acc (9*NUMSIZE)(%rsp) -#define tab (14*NUMSIZE)(%rsp) +#define tab (13*NUMSIZE)(%rsp) // Additional variables kept on the stack -#define bf 2*NUMSIZE(%rsp) -#define cf 2*NUMSIZE+8(%rsp) -#define i 2*NUMSIZE+16(%rsp) -#define res 2*NUMSIZE+24(%rsp) +#define bf 45*NUMSIZE(%rsp) +#define cf 45*NUMSIZE+8(%rsp) +#define i 45*NUMSIZE+16(%rsp) +#define res 45*NUMSIZE+24(%rsp) // Total size to reserve on the stack (excluding local subroutines) #define NSPACE (46*NUMSIZE) +// Syntactic variants to make x86_att forms easier to generate + +#define SCALAR (0*NUMSIZE) +#define BSCALAR (1*NUMSIZE) +#define TABENT (2*NUMSIZE) +#define BTABENT (6*NUMSIZE) +#define ACC (9*NUMSIZE) +#define TAB (13*NUMSIZE) + // Sub-references used in local subroutines with local stack #define x_0 0(%rdi) @@ -610,10 +619,10 @@ edwards25519_scalarmuldouble_alt_standard: adcq %r13, %r9 adcq %r14, %r10 adcq %r15, %r11 - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq %r10, 48(%rsp) - movq %r11, 56(%rsp) + movq %r8, BSCALAR(%rsp) + movq %r9, 
BSCALAR+8(%rsp) + movq %r10, BSCALAR+16(%rsp) + movq %r11, BSCALAR+24(%rsp) movq (%rsi), %r8 movq 8(%rsi), %r9 @@ -634,10 +643,10 @@ edwards25519_scalarmuldouble_alt_standard: adcq %r13, %r9 adcq %r14, %r10 adcq %r15, %r11 - movq %r8, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) + movq %r8, SCALAR(%rsp) + movq %r9, SCALAR+8(%rsp) + movq %r10, SCALAR+16(%rsp) + movq %r11, SCALAR+24(%rsp) // Create table of multiples 1..8 of the general input point at "tab". // Reduce the input coordinates x and y modulo 2^256 - 38 first, for the @@ -658,13 +667,13 @@ edwards25519_scalarmuldouble_alt_standard: adcq %r10, %rcx adcq %r11, %rsi cmovncq %r8, %rax - movq %rax, 448(%rsp) + movq %rax, TAB(%rsp) cmovncq %r9, %rbx - movq %rbx, 456(%rsp) + movq %rbx, TAB+8(%rsp) cmovncq %r10, %rcx - movq %rcx, 464(%rsp) + movq %rcx, TAB+16(%rsp) cmovncq %r11, %rsi - movq %rsi, 472(%rsp) + movq %rsi, TAB+24(%rsp) movl $38, %eax movq 32(%rdx), %r8 @@ -679,69 +688,69 @@ edwards25519_scalarmuldouble_alt_standard: adcq %r10, %rcx adcq %r11, %rsi cmovncq %r8, %rax - movq %rax, 480(%rsp) + movq %rax, TAB+32(%rsp) cmovncq %r9, %rbx - movq %rbx, 488(%rsp) + movq %rbx, TAB+40(%rsp) cmovncq %r10, %rcx - movq %rcx, 496(%rsp) + movq %rcx, TAB+48(%rsp) cmovncq %r11, %rsi - movq %rsi, 504(%rsp) + movq %rsi, TAB+56(%rsp) movl $1, %eax - movq %rax, 512(%rsp) + movq %rax, TAB+64(%rsp) xorl %eax, %eax - movq %rax, 520(%rsp) - movq %rax, 528(%rsp) - movq %rax, 536(%rsp) + movq %rax, TAB+72(%rsp) + movq %rax, TAB+80(%rsp) + movq %rax, TAB+88(%rsp) - leaq 544(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 480(%rsp), %rbp + leaq TAB+96(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+32(%rsp), %rbp mul_4(x_0,x_1,x_2) // Multiple 2 - leaq 576(%rsp), %rdi - leaq 448(%rsp), %rsi + leaq TAB+1*128(%rsp), %rdi + leaq TAB(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_epdouble // Multiple 3 - leaq 704(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 576(%rsp), %rbp + leaq TAB+2*128(%rsp), %rdi + leaq 
TAB(%rsp), %rsi + leaq TAB+1*128(%rsp), %rbp callq edwards25519_scalarmuldouble_alt_epadd // Multiple 4 - leaq 832(%rsp), %rdi - leaq 576(%rsp), %rsi + leaq TAB+3*128(%rsp), %rdi + leaq TAB+1*128(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_epdouble // Multiple 5 - leaq 960(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 832(%rsp), %rbp + leaq TAB+4*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+3*128(%rsp), %rbp callq edwards25519_scalarmuldouble_alt_epadd // Multiple 6 - leaq 1088(%rsp), %rdi - leaq 704(%rsp), %rsi + leaq TAB+5*128(%rsp), %rdi + leaq TAB+2*128(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_epdouble // Multiple 7 - leaq 1216(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 1088(%rsp), %rbp + leaq TAB+6*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+5*128(%rsp), %rbp callq edwards25519_scalarmuldouble_alt_epadd // Multiple 8 - leaq 1344(%rsp), %rdi - leaq 832(%rsp), %rsi + leaq TAB+7*128(%rsp), %rdi + leaq TAB+3*128(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_epdouble // Handle the initialization, starting the loop counter at i = 252 @@ -753,7 +762,7 @@ edwards25519_scalarmuldouble_alt_standard: // Index for btable entry... 
- movq 56(%rsp), %rax + movq BSCALAR+24(%rsp), %rax shrq $60, %rax movq %rax, bf @@ -989,22 +998,22 @@ edwards25519_scalarmuldouble_alt_standard: movq 88(%rbp), %rsi cmovzq %rsi, %r15 - movq %rax, 352(%rsp) - movq %rbx, 360(%rsp) - movq %rcx, 368(%rsp) - movq %rdx, 376(%rsp) - movq %r8, 384(%rsp) - movq %r9, 392(%rsp) - movq %r10, 400(%rsp) - movq %r11, 408(%rsp) - movq %r12, 416(%rsp) - movq %r13, 424(%rsp) - movq %r14, 432(%rsp) - movq %r15, 440(%rsp) + movq %rax, BTABENT(%rsp) + movq %rbx, BTABENT+8(%rsp) + movq %rcx, BTABENT+16(%rsp) + movq %rdx, BTABENT+24(%rsp) + movq %r8, BTABENT+32(%rsp) + movq %r9, BTABENT+40(%rsp) + movq %r10, BTABENT+48(%rsp) + movq %r11, BTABENT+56(%rsp) + movq %r12, BTABENT+64(%rsp) + movq %r13, BTABENT+72(%rsp) + movq %r14, BTABENT+80(%rsp) + movq %r15, BTABENT+88(%rsp) // Index for table entry... - movq 24(%rsp), %rax + movq SCALAR+24(%rsp), %rax shrq $60, %rax movq %rax, bf @@ -1020,7 +1029,7 @@ edwards25519_scalarmuldouble_alt_standard: xorl %r10d, %r10d xorl %r11d, %r11d - leaq 480(%rsp), %rbp + leaq TAB+32(%rsp), %rbp cmpq $1, bf movq (%rbp), %rsi @@ -1173,18 +1182,18 @@ edwards25519_scalarmuldouble_alt_standard: movq 56(%rbp), %rsi cmovzq %rsi, %r11 - movq %rax, 256(%rsp) - movq %rbx, 264(%rsp) - movq %rcx, 272(%rsp) - movq %rdx, 280(%rsp) - movq %r8, 288(%rsp) - movq %r9, 296(%rsp) - movq %r10, 304(%rsp) - movq %r11, 312(%rsp) + movq %rax, TABENT+32(%rsp) + movq %rbx, TABENT+40(%rsp) + movq %rcx, TABENT+48(%rsp) + movq %rdx, TABENT+56(%rsp) + movq %r8, TABENT+64(%rsp) + movq %r9, TABENT+72(%rsp) + movq %r10, TABENT+80(%rsp) + movq %r11, TABENT+88(%rsp) // ...followed by the X and W fields - leaq 448(%rsp), %rbp + leaq TAB(%rsp), %rbp xorl %eax, %eax xorl %ebx, %ebx @@ -1346,20 +1355,20 @@ edwards25519_scalarmuldouble_alt_standard: movq 120(%rbp), %rsi cmovzq %rsi, %r11 - movq %rax, 224(%rsp) - movq %rbx, 232(%rsp) - movq %rcx, 240(%rsp) - movq %rdx, 248(%rsp) - movq %r8, 320(%rsp) - movq %r9, 328(%rsp) - movq %r10, 336(%rsp) - 
movq %r11, 344(%rsp) + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + movq %r8, TABENT+96(%rsp) + movq %r9, TABENT+104(%rsp) + movq %r10, TABENT+112(%rsp) + movq %r11, TABENT+120(%rsp) // Add those elements to initialize the accumulator for bit position 252 - leaq 96(%rsp), %rdi - leaq 224(%rsp), %rsi - leaq 352(%rsp), %rbp + leaq ACC(%rsp), %rdi + leaq TABENT(%rsp), %rsi + leaq BTABENT(%rsp), %rbp callq edwards25519_scalarmuldouble_alt_pepadd // Main loop with acc = [scalar/2^i] * point + [bscalar/2^i] * basepoint @@ -1373,8 +1382,8 @@ edwards25519_scalarmuldouble_alt_loop: // Double to acc' = 2 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_pdouble // Get btable entry, first getting the adjusted bitfield... @@ -1645,26 +1654,26 @@ edwards25519_scalarmuldouble_alt_loop: movq %rax, %rsi cmovnzq %r8, %rsi cmovnzq %rax, %r8 - movq %rsi, 352(%rsp) - movq %r8, 384(%rsp) + movq %rsi, BTABENT(%rsp) + movq %r8, BTABENT+32(%rsp) movq %rbx, %rsi cmovnzq %r9, %rsi cmovnzq %rbx, %r9 - movq %rsi, 360(%rsp) - movq %r9, 392(%rsp) + movq %rsi, BTABENT+8(%rsp) + movq %r9, BTABENT+40(%rsp) movq %rcx, %rsi cmovnzq %r10, %rsi cmovnzq %rcx, %r10 - movq %rsi, 368(%rsp) - movq %r10, 400(%rsp) + movq %rsi, BTABENT+16(%rsp) + movq %r10, BTABENT+48(%rsp) movq %rdx, %rsi cmovnzq %r11, %rsi cmovnzq %rdx, %r11 - movq %rsi, 376(%rsp) - movq %r11, 408(%rsp) + movq %rsi, BTABENT+24(%rsp) + movq %r11, BTABENT+56(%rsp) xorq %rdi, %r12 xorq %rdi, %r13 @@ -1675,10 +1684,10 @@ edwards25519_scalarmuldouble_alt_loop: sbbq $0, %r13 sbbq $0, %r14 sbbq $0, %r15 - movq %r12, 416(%rsp) - movq %r13, 424(%rsp) - movq %r14, 432(%rsp) - movq %r15, 440(%rsp) + movq %r12, BTABENT+64(%rsp) + movq %r13, BTABENT+72(%rsp) + movq %r14, BTABENT+80(%rsp) + movq %r15, BTABENT+88(%rsp) // Get table entry, first getting the adjusted bitfield... 
@@ -1709,7 +1718,7 @@ edwards25519_scalarmuldouble_alt_loop: xorl %r10d, %r10d xorl %r11d, %r11d - leaq 480(%rsp), %rbp + leaq TAB+32(%rsp), %rbp cmpq $1, bf movq (%rbp), %rsi @@ -1862,18 +1871,18 @@ edwards25519_scalarmuldouble_alt_loop: movq 56(%rbp), %rsi cmovzq %rsi, %r11 - movq %rax, 256(%rsp) - movq %rbx, 264(%rsp) - movq %rcx, 272(%rsp) - movq %rdx, 280(%rsp) - movq %r8, 288(%rsp) - movq %r9, 296(%rsp) - movq %r10, 304(%rsp) - movq %r11, 312(%rsp) + movq %rax, TABENT+32(%rsp) + movq %rbx, TABENT+40(%rsp) + movq %rcx, TABENT+48(%rsp) + movq %rdx, TABENT+56(%rsp) + movq %r8, TABENT+64(%rsp) + movq %r9, TABENT+72(%rsp) + movq %r10, TABENT+80(%rsp) + movq %r11, TABENT+88(%rsp) // Now do the X and W fields... - leaq 448(%rsp), %rbp + leaq TAB(%rsp), %rbp xorl %eax, %eax xorl %ebx, %ebx @@ -2067,51 +2076,51 @@ edwards25519_scalarmuldouble_alt_loop: sbbq $0, %rcx sbbq $0, %rdx - movq %rax, 224(%rsp) - movq %rbx, 232(%rsp) - movq %rcx, 240(%rsp) - movq %rdx, 248(%rsp) + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) subq %rdi, %r8 sbbq $0, %r9 sbbq $0, %r10 sbbq $0, %r11 - movq %r8, 320(%rsp) - movq %r9, 328(%rsp) - movq %r10, 336(%rsp) - movq %r11, 344(%rsp) + movq %r8, TABENT+96(%rsp) + movq %r9, TABENT+104(%rsp) + movq %r10, TABENT+112(%rsp) + movq %r11, TABENT+120(%rsp) // Double to acc' = 4 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_pdouble // Add tabent := tabent + btabent - leaq 224(%rsp), %rdi - leaq 224(%rsp), %rsi - leaq 352(%rsp), %rbp + leaq TABENT(%rsp), %rdi + leaq TABENT(%rsp), %rsi + leaq BTABENT(%rsp), %rbp callq edwards25519_scalarmuldouble_alt_pepadd // Double to acc' = 8 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_pdouble // Double to acc' = 16 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq 
ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_epdouble // Add table entry, acc := acc + tabent - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 224(%rsp), %rbp + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + leaq TABENT(%rsp), %rbp callq edwards25519_scalarmuldouble_alt_epadd // Loop down @@ -2120,423 +2129,1375 @@ edwards25519_scalarmuldouble_alt_loop: testq %rax, %rax jnz edwards25519_scalarmuldouble_alt_loop -// Modular inverse setup +// Prepare to call the modular inverse function to get tab = 1/z - movq $4, %rdi - leaq 224(%rsp), %rsi - leaq 160(%rsp), %rdx - leaq edwards25519_scalarmuldouble_alt_p25519(%rip), %rcx - leaq 352(%rsp), %r8 + leaq TAB(%rsp), %rdi + leaq ACC+64(%rsp), %rsi -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, tab and acc. 
- movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_alt_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -edwards25519_scalarmuldouble_alt_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmuldouble_alt_toploop - movq %r12, %rax - orq 
%rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -edwards25519_scalarmuldouble_alt_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519_midloop +curve25519_x25519_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, 
%rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne edwards25519_scalarmuldouble_alt_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 
+ addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_alt_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmuldouble_alt_wmontend -edwards25519_scalarmuldouble_alt_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi 
+ movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_alt_wmontloop -edwards25519_scalarmuldouble_alt_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_alt_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_alt_wcorrloop + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq 
%r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmuldouble_alt_zmontend -edwards25519_scalarmuldouble_alt_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_alt_zmontloop -edwards25519_scalarmuldouble_alt_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_alt_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_alt_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi 
-edwards25519_scalarmuldouble_alt_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmuldouble_alt_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -edwards25519_scalarmuldouble_alt_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, 
(%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmuldouble_alt_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -edwards25519_scalarmuldouble_alt_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne edwards25519_scalarmuldouble_alt_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_alt_fliploop - subq $0x3a, 0x20(%rsp) - ja edwards25519_scalarmuldouble_alt_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + 
cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 
+ cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx 
+ sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx 
+ xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx 
+ movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq 
$0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq 
%rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi 
+ testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 
+ subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // Store result movq res, %rdi - leaq 96(%rsp), %rsi - leaq 224(%rsp), %rbp + leaq ACC(%rsp), %rsi + leaq TAB(%rsp), %rbp mul_p25519(x_0,x_1,x_2) movq res, %rdi addq $32, %rdi - leaq 128(%rsp), %rsi - leaq 224(%rsp), %rbp + leaq ACC+32(%rsp), %rsi + leaq TAB(%rsp), %rbp 
mul_p25519(x_0,x_1,x_2) // Restore stack and registers @@ -2645,14 +3606,6 @@ edwards25519_scalarmuldouble_alt_pepadd: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmuldouble_alt_p25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples.