Merge pull request #87 from jargh/main

Ed25519 support and related updates s2n-bignum original commit: awslabs/s2n-bignum@db8409d
aws · Nov 3, 2023 · 462f117 · 462f117
2 parents 519f95f + e618f26
commit 462f117
Show file tree

Hide file tree

Showing 28 changed files with 35,392 additions and 6,048 deletions.
diff --git a/arm/curve25519/bignum_mod_n25519.S b/arm/curve25519/bignum_mod_n25519.S
@@ -0,0 +1,186 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Reduce modulo basepoint order, z := x mod n_25519
+// Input x[k]; output z[4]
+//
+//    extern void bignum_mod_n25519
+//     (uint64_t z[static 4], uint64_t k, uint64_t *x);
+//
+// Reduction is modulo the order of the curve25519/edwards25519 basepoint,
+// which is n_25519 = 2^252 + 27742317777372353535851937790883648493
+//
+// Standard ARM ABI: X0 = z, X1 = k, X2 = x
+// ----------------------------------------------------------------------------
+#include "_internal_s2n_bignum.h"
+
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n25519)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n25519)
+        .text
+        .balign 4
+
+#define z x0
+#define k x1
+#define x x2
+
+#define m0 x3
+#define m1 x4
+#define m2 x5
+#define m3 x6
+
+#define t0 x7
+#define t1 x8
+#define t2 x9
+#define t3 x10
+
+#define n0 x11
+#define n1 x12
+
+// These two are aliased: we only load d when finished with q
+
+#define q x13
+#define d x13
+
+// Loading large constants
+
+#define movbig(nn,n3,n2,n1,n0)                                      \
+        movz    nn, n0;                                             \
+        movk    nn, n1, lsl #16;                                    \
+        movk    nn, n2, lsl #32;                                    \
+        movk    nn, n3, lsl #48
+
+S2N_BN_SYMBOL(bignum_mod_n25519):
+
+// If the input is already <= 3 words long, go to a trivial "copy" path
+
+        cmp     k, #4
+        bcc     short
+
+// Otherwise load the top 4 digits (top-down) and reduce k by 4
+// This [m3;m2;m1;m0] is the initial x where we begin reduction.
+
+        sub     k, k, #4
+        lsl     t0, k, #3
+        add     t0, t0, x
+        ldp     m2, m3, [t0, #16]
+        ldp     m0, m1, [t0]
+
+// Load the complicated two words of n_25519 = 2^252 + [n1; n0]
+
+        movbig( n0, #0x5812, #0x631a, #0x5cf5, #0xd3ed)
+        movbig( n1, #0x14de, #0xf9de, #0xa2f7, #0x9cd6)
+
+// Get the quotient estimate q = floor(x/2^252).
+// Also delete it from m3, in effect doing x' = x - q * 2^252
+
+        lsr     q, m3, #60
+        and     m3, m3, #0x0FFFFFFFFFFFFFFF
+
+// Multiply [t2;t1;t0] = q * [n1;n0]
+
+        mul     t0, n0, q
+        mul     t1, n1, q
+        umulh   t2, n0, q
+        adds    t1, t1, t2
+        umulh   t2, n1, q
+        adc     t2, t2, xzr
+
+// Subtract [m3;m2;m1;m0] = x' - q * [n1;n0] = x - q * n_25519
+
+        subs    m0, m0, t0
+        sbcs    m1, m1, t1
+        sbcs    m2, m2, t2
+        sbcs    m3, m3, xzr
+
+// If this borrows (CF = 0 because of inversion), add back n_25519.
+// The masked n3 digit exploits the fact that bit 60 of n0 is set.
+
+        csel    t0, n0, xzr, cc
+        csel    t1, n1, xzr, cc
+        adds    m0, m0, t0
+        adcs    m1, m1, t1
+        and     t0, t0, #0x1000000000000000
+        adcs    m2, m2, xzr
+        adc     m3, m3, t0
+
+// Now do (k-4) iterations of 5->4 word modular reduction. Each one
+// is similar to the sequence above except for the more refined quotient
+// estimation process.
+
+        cbz     k, writeback
+
+loop:
+
+// Assume that the new 5-digit x is 2^64 * previous_x + next_digit.
+// Get the quotient estimate q = max (floor(x/2^252)) (2^64 - 1)
+// and first compute x' = x - 2^252 * q.
+
+        extr    q, m3, m2, #60
+        and     m2, m2, #0x0FFFFFFFFFFFFFFF
+        sub     q, q, m3, lsr #60
+        and     m3, m3, #0xF000000000000000
+        add     m2, m2, m3
+
+// Multiply [t2;t1;t0] = q * [n1;n0]
+
+        mul     t0, n0, q
+        mul     t1, n1, q
+        umulh   t2, n0, q
+        adds    t1, t1, t2
+        umulh   t2, n1, q
+        adc     t2, t2, xzr
+
+// Decrement k and load the next digit (note that d aliases to q)
+
+        sub     k, k, #1
+        ldr     d, [x, k, lsl #3]
+
+// Subtract [t3;t2;t1;t0] = x' - q * [n1;n0] = x - q * n_25519
+
+        subs    t0, d, t0
+        sbcs    t1, m0, t1
+        sbcs    t2, m1, t2
+        sbcs    t3, m2, xzr
+
+// If this borrows (CF = 0 because of inversion), add back n_25519.
+// The masked n3 digit exploits the fact that bit 60 of n1 is set.
+
+        csel    m0, n0, xzr, cc
+        csel    m1, n1, xzr, cc
+        adds    m0, t0, m0
+        and     m3, m1, #0x1000000000000000
+        adcs    m1, t1, m1
+        adcs    m2, t2, xzr
+        adc     m3, t3, m3
+
+        cbnz    k, loop
+
+// Finally write back [m3;m2;m1;m0] and return
+
+writeback:
+        stp     m0, m1, [z]
+        stp     m2, m3, [z, #16]
+        ret
+
+// Short case: just copy the input with zero-padding
+
+short:
+        mov     m0, xzr
+        mov     m1, xzr
+        mov     m2, xzr
+        mov     m3, xzr
+
+        cbz     k, writeback
+        ldr     m0, [x]
+        subs    k, k, #1
+        beq     writeback
+        ldr     m1, [x, #8]
+        subs    k, k, #1
+        beq     writeback
+        ldr     m2, [x, #16]
+        b       writeback
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif